	llama : multi-threaded quantization (#1075)
* Multi-threading quantization. Not much gain for simple quantizations, but it will be important for quantizations that require more CPU cycles.

* Multi-threading for quantize-stats

  It now does the job in ~14 seconds on my Mac for Q4_0, Q4_1 and Q4_2. Single-threaded it was taking more than 2 minutes after adding the more elaborate version of Q4_2.

* Reviewer comments

* Avoiding compiler confusion

  After changing chunk_size to a const int as suggested by @ggerganov, clang and GCC started warning me that I don't need to capture it in the lambda, so I removed it from the capture list. But that makes the MSVC build fail, so I made it a constexpr to keep every compiler happy.

* Still fighting with lambda captures in MSVC

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
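Both llama.cpp and quantize-stats below use the same work-distribution pattern: a mutex-guarded counter hands out fixed-size chunks of 32*512 elements to worker threads, each thread accumulates results locally, and the per-thread results are merged once at the end. The following stand-alone sketch illustrates that pattern under stated assumptions; the element count and the per-chunk work are placeholders, not code from the patch.

// Sketch of the shared-counter chunking used in this PR; all values are illustrative.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

int main() {
    const int64_t nelements  = 1000000;          // stand-in for a tensor's element count
    const int64_t chunk_size = 32 * 512;         // same chunk size as in the patch
    const int     nthread    = std::max(1u, std::thread::hardware_concurrency());

    std::mutex mutex;
    int64_t counter = 0;                         // next unclaimed element, guarded by mutex
    int64_t total   = 0;                         // merged result, guarded by mutex

    auto compute = [&]() {
        int64_t local = 0;                       // thread-local partial result
        while (true) {
            int64_t first;
            {
                std::lock_guard<std::mutex> lock(mutex);
                first = counter;
                counter += chunk_size;
            }
            if (first >= nelements) break;       // no chunks left
            // the real code calls ggml_quantize_chunk / test_roundtrip_on_chunk here
            local += std::min(chunk_size, nelements - first);
        }
        std::lock_guard<std::mutex> lock(mutex);
        total += local;                          // merge per-thread results once
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(compute);
    compute();                                   // the calling thread also does work
    for (auto & w : workers) w.join();

    printf("processed %lld of %lld elements on %d threads\n",
           (long long) total, (long long) nelements, nthread);
    return 0;
}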
examples/quantize-stats/quantize-stats.cpp

@@ -15,6 +15,8 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <thread>
+#include <mutex>

 struct quantize_stats_params {
     std::string model = "models/7B/ggml-model-f16.bin";
@@ -27,7 +29,6 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };

-const int64_t SCRATCH_ELEMENTS = 32*32;
 const size_t HISTOGRAM_BUCKETS = 150;
 const double HISTOGRAM_RANGE = 0.03;

@@ -90,6 +91,13 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
     stats.num_samples += nelements;
 }

+void combine_error_stats(error_stats & into, const error_stats & from) {
+    into.num_samples += from.num_samples;
+    into.total_error += from.total_error;
+    if (from.max_error > into.max_error) into.max_error = from.max_error;
+    for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
+}
+
 double find_quantile(const error_stats & stats, double quantile) {
     double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);

@@ -130,6 +138,36 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }

+void test_roundtrip_on_chunk(
+        const ggml_tensor * layer,
+        int64_t offset,
+        int64_t chunk_size,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        float * input_scratch,
+        char * quantized_scratch,
+        float * output_scratch,
+        error_stats & stats) {
+
+    if (layer->type == GGML_TYPE_F16) {
+        for (int i = 0; i < chunk_size; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+        }
+    } else {
+        input_scratch = ggml_get_data_f32(layer) + offset;
+    }
+
+    if (use_reference) {
+        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+    } else {
+        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+    }
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+
+    update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+
+
 // Run quantization function for a single layer and update error stats
 void test_roundtrip_on_layer(
         std::string & name,
@@ -137,40 +175,61 @@ void test_roundtrip_on_layer(
         const quantize_fns_t & qfns,
         bool use_reference,
         const ggml_tensor * layer,
-        float * input_scratch,
-        char *quantized_scratch,
-        float * output_scratch,
-        error_stats & total_error) {
+        std::vector<float> & input_scratch,
+        std::vector<char> & quantized_scratch,
+        std::vector<float> & output_scratch,
+        error_stats & total_error,
+        int max_thread = 0) {

     assert(tensor_is_contiguous(layer));
     error_stats layer_error {};
-    int64_t nelements = ggml_nelements(layer);
+    uint64_t nelements = ggml_nelements(layer);

-    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
-        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
-
-        if (layer->type == GGML_TYPE_F16) {
-            for (int i = 0; i < chunk_size; i++) {
-                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
-            }
-        } else {
-            input_scratch = ggml_get_data_f32(layer) + offset;
-        }
-
-        if (use_reference) {
-            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
-        } else {
-            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
-        }
-        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
-
-        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
-        if (print_layer_stats) {
-            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
-        }
+    float* input_scratch_ptr = nullptr;
+    if (layer->type == GGML_TYPE_F16) {
+        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+        input_scratch_ptr = input_scratch.data();
     }
+    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+    if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+
+    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+    int chunk_size = 32*512;
+    int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+
+    if (num_chunks < 2 || max_thread < 2) {
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+                output_scratch.data(), print_layer_stats ? layer_error : total_error);
+    } else {
+        auto & stats = print_layer_stats ? layer_error : total_error;
+        std::mutex mutex;
+        uint64_t counter = 0;
+        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+             &quantized_scratch, &output_scratch, chunk_size] () {
+            error_stats local_stats {};
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                uint64_t offset = counter; counter += chunk_size;
+                if (offset >= nelements) {
+                    combine_error_stats(stats, local_stats);
+                    break;
+                }
+                lock.unlock();
+                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
+            }
+        };
+        int nthread = std::min(num_chunks, max_thread);
+        std::vector<std::thread> workers(nthread-1);
+        for (auto& w : workers) w = std::thread(compute);
+        compute();
+        for (auto& w : workers) w.join();
+    }

     if (print_layer_stats) {
         print_error_stats(name, layer_error, false);
+        combine_error_stats(total_error, layer_error);
     }
 }

@@ -181,6 +240,7 @@ int main(int argc, char ** argv) {

     // read command line

+    int max_thread = 0;
     bool invalid_param = false;
     std::string arg;
     for (int i = 1; i < argc; i++) {
@@ -230,6 +290,12 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                 invalid_param = true;
             }
+        } else if (arg == "-n" || arg == "--num-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            max_thread = atoi(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             quantize_stats_print_usage(argc, argv);
@@ -295,9 +361,9 @@ int main(int argc, char ** argv) {
     }
     printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
     // allocate scratch space
-    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
-    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
-    std::vector<float> output_scratch(SCRATCH_ELEMENTS);
+    std::vector<float> input_scratch;
+    std::vector<char> quantized_scratch;
+    std::vector<float> output_scratch;

     // loop throught quantization types
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@@ -328,10 +394,11 @@ int main(int argc, char ** argv) {
                         qfns,
                         params.reference,
                         kv_tensor.second,
-                        input_scratch.data(),
-                        quantized_scratch.data(),
-                        output_scratch.data(),
-                        global_stats
+                        input_scratch,
+                        quantized_scratch,
+                        output_scratch,
+                        global_stats,
+                        max_thread
                 );
             }

examples/quantize/quantize.cpp

@@ -10,8 +10,8 @@
 int main(int argc, char ** argv) {
     ggml_time_init();

-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+    if (argc < 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
         fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
         fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
         fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
@@ -30,6 +30,7 @@ int main(int argc, char ** argv) {
     const std::string fname_out = argv[2];

     const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    int nthread = argc > 4 ? atoi(argv[4]) : 0;

     const int64_t t_main_start_us = ggml_time_us();

@@ -39,7 +40,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();

-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
ggml.c | 27
@@ -12189,6 +12189,33 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_3*sizeof(block_q4_3));
 }

+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+    size_t result = 0;
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            {
+                GGML_ASSERT(start % QK4_0 == 0);
+                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
+                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q4_1:
+            {
+                GGML_ASSERT(start % QK4_1 == 0);
+                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
+                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q4_2:
+            {
+                GGML_ASSERT(start % QK4_2 == 0);
+                block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
+                result = ggml_quantize_q4_2(src + start, block, n, n, hist);
+            } break;
+        default:
+            assert(false);
+    }
+    return result;
+}
+
 ////////////////////////////////////////////////////////////////////////////////

 int ggml_cpu_has_avx(void) {
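For reference, a minimal sketch of calling the new helper directly, outside of any threading. It assumes Q4_0 (32-element blocks), a destination buffer already large enough for the quantized output, and an element count that is a multiple of 32; the wrapper name quantize_in_two_chunks is illustrative, not part of ggml.

#include "ggml.h"

#include <cstdint>
#include <vector>

// Quantize an f32 buffer to Q4_0 in two block-aligned chunks (sketch only).
size_t quantize_in_two_chunks(const float * src, void * dst, int nelements) {
    std::vector<int64_t> hist(1 << 4, 0);        // 16-bucket histogram, as in llama.cpp
    const int half = (nelements / 2 / 32) * 32;  // keep the split aligned to the Q4_0 block size
    size_t size = 0;
    size += ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst, 0,    half,             hist.data());
    size += ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst, half, nelements - half, hist.data());
    return size;                                 // total number of quantized bytes written
}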
							
								
								
									
ggml.h | 2
@@ -813,6 +813,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);

+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+
 //
 // system info
 //
llama.cpp | 67
@@ -24,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>

 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -1572,7 +1575,7 @@
 // quantization
 //

-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
@@ -1582,6 +1585,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         default: throw format("invalid output file type %d\n", ftype);
     };

+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1590,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);

+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1643,25 +1653,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);

-            switch (new_type) {
-                case GGML_TYPE_Q4_0:
-                    {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_1:
-                    {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_2:
-                    {
-                        new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_3:
-                    {
-                        new_size = ggml_quantize_q4_3(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                default:
-                    LLAMA_ASSERT(false);
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }

             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1783,9 +1805,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-  enum llama_ftype   ftype) {
+  enum llama_ftype   ftype,
+        int          nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
llama.h | 4
@@ -93,10 +93,12 @@

     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-      enum llama_ftype   ftype);
+      enum llama_ftype   ftype,
+            int          nthread);

     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
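A hedged usage sketch for the updated API, called from C++. The file paths are illustrative (the input path mirrors the default used by quantize-stats above) and error handling is minimal; passing 0 for nthread defers to std::thread::hardware_concurrency(), as the new comment describes.

#include "llama.h"

#include <cstdio>

int main() {
    // nthread = 0 lets llama.cpp pick std::thread::hardware_concurrency()
    const int ret = llama_model_quantize(
            "models/7B/ggml-model-f16.bin",     // input  (f16)
            "models/7B/ggml-model-q4_0.bin",    // output (illustrative path)
            LLAMA_FTYPE_MOSTLY_Q4_0,
            /*nthread =*/ 0);

    if (ret != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}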