mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	quantize : be able to override metadata by key (#6321)
* quantize: be able to override metadata by key

* minor : spacing

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
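For context, the sketch below is not code from this commit; it shows how a caller could use the new kv_overrides field, assuming the llama_model_kv_override layout visible in the diff further down (char key[], tag, int_value/float_value/bool_value): fill a std::vector<llama_model_kv_override>, terminate it with an empty-key sentinel (the quantizer stops at the first empty key), and pass a pointer to the vector through llama_model_quantize_params. The chosen key, value and file names are only illustrative.

#include <cstdio>
#include <vector>

#include "llama.h"

int main() {
    std::vector<llama_model_kv_override> kv_overrides;

    // illustrative override: force an integer metadata value in the output GGUF
    llama_model_kv_override o{};
    std::snprintf(o.key, sizeof(o.key), "%s", "llama.context_length"); // key choice is hypothetical
    o.tag       = LLAMA_KV_OVERRIDE_TYPE_INT;
    o.int_value = 4096;
    kv_overrides.push_back(o);

    // the quantizer breaks out of its loop at the first empty key, so add a sentinel entry
    kv_overrides.emplace_back();
    kv_overrides.back().key[0] = 0;

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.kv_overrides = &kv_overrides; // llama.cpp casts this back to std::vector<llama_model_kv_override>*

    // returns 0 on success; input/output file names are placeholders
    return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) == 0 ? 0 : 1;
}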
llama.cpp: 38 changed lines
@@ -12776,7 +12776,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif
 
-    llama_model_loader ml(fname_inp, use_mmap, NULL);
+    llama_model_kv_override * kv_overrides = nullptr;
+    if (params->kv_overrides) {
+        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        kv_overrides = v->data();
+    }
+    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
     ml.init_mappings(false); // no prefetching?
 
     llama_model model;
@@ -12805,6 +12810,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
+    if (params->kv_overrides) {
+        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+        for (auto & o : overrides) {
+            if (o.key[0] == 0) break;
+            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+                gguf_set_val_f32(ctx_out, o.key, o.float_value);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+                gguf_set_val_i32(ctx_out, o.key, o.int_value);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+                gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+            } else {
+                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+            }
+        }
+    }
+
     for (int i = 0; i < ml.n_tensors; ++i) {
         const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
@@ -12813,21 +12834,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
-        }
-        else if (name.find("ffn_down") != std::string::npos) {
+        } else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_ffn_down;
-        }
-        else if (name.find("ffn_gate") != std::string::npos) {
+        } else if (name.find("ffn_gate") != std::string::npos) {
             ++qs.n_ffn_gate;
-        }
-        else if (name.find("ffn_up") != std::string::npos) {
+        } else if (name.find("ffn_up") != std::string::npos) {
             ++qs.n_ffn_up;
-        }
-        else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
     }
-    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t) qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
                 __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
     }
@@ -13363,6 +13380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy                   =*/ false,
         /*.pure                        =*/ false,
         /*.imatrix                     =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
     };
 
     return result;
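A second, optional sketch (also not part of the commit) checks that an override actually landed in the quantized file, using the gguf_* accessors; since integer overrides are written above with gguf_set_val_i32, they read back as int32 values. File and key names match the earlier sketch and are illustrative.

#include <cstdio>

#include "ggml.h" // the gguf_* API lived in ggml.h at the time of this commit; newer trees declare it in gguf.h

int main() {
    struct gguf_init_params ip = {
        /*.no_alloc =*/ true,    // only metadata is needed, skip tensor data allocation
        /*.ctx      =*/ nullptr,
    };

    struct gguf_context * ctx = gguf_init_from_file("model-q4_k_m.gguf", ip);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to open gguf file\n");
        return 1;
    }

    const int key_id = gguf_find_key(ctx, "llama.context_length");
    if (key_id >= 0) {
        printf("llama.context_length = %d\n", (int) gguf_get_val_i32(ctx, key_id));
    } else {
        printf("key not found\n");
    }

    gguf_free(ctx);
    return 0;
}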