	quantize : be able to override metadata by key (#6321)
* quantize: be able to override metadata by key

* minor : spacing

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
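This commit touches three files: the quantize example (first diff below), llama.cpp, and llama.h. The new --override-kv flag takes KEY=TYPE:VALUE arguments, where TYPE is one of int, float, or bool; string values are not handled by this parser, and the flag may be given multiple times. A hypothetical invocation (the model paths and the key chosen here are illustrative, not taken from the commit):

    ./quantize --override-kv tokenizer.ggml.add_bos_token=bool:false \
        model-f32.gguf model-q4_k_m.gguf Q4_K_M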
@@ -87,13 +87,17 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
     printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
     printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
+    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --override-kv KEY=TYPE:VALUE\n");
+    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
@@ -107,14 +111,14 @@ static void usage(const char * executable) {
     exit(1);
 }

-static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
     std::ifstream in(imatrix_file.c_str(), std::ios::binary);
     if (!in) {
-        printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
+        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
         return;
     }
     int n_entries;
-    in.read((char*)&n_entries, sizeof(n_entries));
+    in.read((char *)&n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
         printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
         return;
@@ -124,25 +128,25 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std
         std::vector<char> name_as_vec(len+1);
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
+            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
             return;
         }
         name_as_vec[len] = 0;
         std::string name{name_as_vec.data()};
-        auto& e = imatrix_data[std::move(name)];
+        auto & e = imatrix_data[std::move(name)];
         int ncall;
-        in.read((char*)&ncall, sizeof(ncall));
+        in.read((char *)&ncall, sizeof(ncall));
         int nval;
         in.read((char *)&nval, sizeof(nval));
         if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n",__func__,i);
+            printf("%s: failed reading number of values for entry %d\n", __func__, i);
             imatrix_data = {};
             return;
         }
         e.resize(nval);
-        in.read((char*)e.data(), nval*sizeof(float));
+        in.read((char *)e.data(), nval*sizeof(float));
         if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n",__func__,i);
+            printf("%s: failed reading data for entry %d\n", __func__, i);
             imatrix_data = {};
             return;
         }
@@ -150,13 +154,13 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std
             for (auto& v : e) v /= ncall;
         }
     }
-    printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
+    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
 }

-static void prepare_imatrix(const std::string& imatrix_file,
-        const std::vector<std::string>& included_weights,
-        const std::vector<std::string>& excluded_weights,
-        std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+static void prepare_imatrix(const std::string & imatrix_file,
+        const std::vector<std::string> & included_weights,
+        const std::vector<std::string> & excluded_weights,
+        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
     if (!imatrix_file.empty()) {
         load_imatrix(imatrix_file, imatrix_data);
     }
@@ -201,6 +205,43 @@ static ggml_type parse_ggml_type(const char * arg) {
     return result;
 }

+static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char* sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.int_value = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.float_value = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.bool_value = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.bool_value = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -211,6 +252,7 @@ int main(int argc, char ** argv) {
     int arg_idx = 1;
     std::string imatrix_file;
     std::vector<std::string> included_weights, excluded_weights;
+    std::vector<llama_model_kv_override> kv_overrides;

     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -227,6 +269,10 @@ int main(int argc, char ** argv) {
             } else {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
+            if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -267,6 +313,11 @@ int main(int argc, char ** argv) {
     if (!imatrix_data.empty()) {
         params.imatrix = &imatrix_data;
     }
+    if (!kv_overrides.empty()) {
+        kv_overrides.emplace_back();
+        kv_overrides.back().key[0] = 0;
+        params.kv_overrides = &kv_overrides;
+    }

     llama_backend_init();

@@ -288,8 +339,7 @@ int main(int argc, char ** argv) {
         if (ftype_str == "COPY") {
             params.only_copy = true;
         }
-    }
-    else {
+    } else {
         fname_out = argv[arg_idx];
         arg_idx++;

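One subtlety in the changes above: params.kv_overrides is a bare void *, so no element count crosses the library boundary. main() therefore appends a terminating element whose key[0] is 0, and the consumer in llama.cpp (second diff below) stops at the first empty key. A minimal sketch of that contract, using a simplified stand-in for llama_model_kv_override (the real struct is declared in llama.h):

    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct kv_override_like {   // simplified stand-in for llama_model_kv_override
        char key[128];
        // type tag and value union omitted for brevity
    };

    // consumer side: scan until the empty-key sentinel, as the quantizer does
    static void apply_overrides(const kv_override_like * overrides) {
        for (const kv_override_like * o = overrides; o->key[0] != 0; ++o) {
            std::printf("would override key '%s'\n", o->key);
        }
    }

    int main() {
        std::vector<kv_override_like> v(1);   // elements are zero-initialized
        std::strncpy(v[0].key, "general.file_type", sizeof(v[0].key) - 1);
        v.emplace_back();        // sentinel entry ...
        v.back().key[0] = 0;     // ... whose empty key terminates the list
        apply_overrides(v.data());
        return 0;
    }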
llama.cpp (38 changed lines)
@@ -12776,7 +12776,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif

-    llama_model_loader ml(fname_inp, use_mmap, NULL);
+    llama_model_kv_override * kv_overrides = nullptr;
+    if (params->kv_overrides) {
+        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        kv_overrides = v->data();
+    }
+    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
     ml.init_mappings(false); // no prefetching?

     llama_model model;
@@ -12805,6 +12810,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);

+    if (params->kv_overrides) {
+        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+        for (auto & o : overrides) {
+            if (o.key[0] == 0) break;
+            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+                gguf_set_val_f32(ctx_out, o.key, o.float_value);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+                gguf_set_val_i32(ctx_out, o.key, o.int_value);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+                gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+            } else {
+                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+            }
+        }
+    }
+
     for (int i = 0; i < ml.n_tensors; ++i) {
         const struct ggml_tensor * meta = ml.get_tensor_meta(i);

@@ -12813,21 +12834,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
-        }
-        else if (name.find("ffn_down") != std::string::npos) {
+        } else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_ffn_down;
-        }
-        else if (name.find("ffn_gate") != std::string::npos) {
+        } else if (name.find("ffn_gate") != std::string::npos) {
             ++qs.n_ffn_gate;
-        }
-        else if (name.find("ffn_up") != std::string::npos) {
+        } else if (name.find("ffn_up") != std::string::npos) {
             ++qs.n_ffn_up;
-        }
-        else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
     }
-    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t) qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
                 __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
     }
@@ -13363,6 +13380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy                   =*/ false,
         /*.pure                        =*/ false,
         /*.imatrix                     =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
     };

     return result;
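Note that the overrides act in two places: the same vector is handed to llama_model_loader above, so an overridden value is already in effect while the input model's metadata is read, and it is then written into the output header here. Since the gguf_set_val_* setters replace the value of a key that already exists rather than appending a duplicate, overriding a pre-existing key takes effect cleanly in the quantized file.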
llama.h (1 changed line)
@@ -284,6 +284,7 @@ extern "C" {
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                           // quantize all tensors to the default type
         void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;

     // grammar types
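Finally, a minimal sketch of exercising the new field directly through the C API instead of the quantize tool. It assumes the llama_model_kv_override layout of this era, with a 128-byte key, a tag, and the int_value/float_value/bool_value members referenced by the diffs above; the file names and the chosen key are illustrative:

    #include <cstring>
    #include <vector>

    #include "llama.h"

    int main() {
        // build the override list; elements are zero-initialized, so keys start empty
        std::vector<llama_model_kv_override> overrides(1);
        std::strncpy(overrides[0].key, "tokenizer.ggml.add_bos_token", sizeof(overrides[0].key) - 1);
        overrides[0].tag        = LLAMA_KV_OVERRIDE_TYPE_BOOL;
        overrides[0].bool_value = false;

        // empty-key sentinel, exactly as main() in the quantize example appends one
        overrides.emplace_back();
        overrides.back().key[0] = 0;

        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.kv_overrides = &overrides; // passed as void *, cast back to the vector inside llama.cpp

        llama_backend_init();
        const int rc = (int) llama_model_quantize("model-f32.gguf", "model-q4_k_m.gguf", &params);
        llama_backend_free();
        return rc;
    }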