mirror of https://github.com/ggml-org/llama.cpp.git
	llama : fix quantization of shared token_embd (#5944)
@@ -10973,6 +10973,9 @@ struct quantize_state_internal {
 
     bool has_imatrix      = false;
 
+    // used to figure out if a model shares tok_embd with the output weight
+    bool has_output       = false;
+
     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
@@ -11070,8 +11073,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
 
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
-        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
@@ -11460,6 +11462,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         else if (name.find("ffn_up") != std::string::npos) {
             ++qs.n_ffn_up;
         }
+        else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+            qs.has_output = true;
+        }
     }
     if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
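To make the intent of the change concrete, here is a minimal standalone C++ sketch of the two-pass approach the commit introduces: a first pass over the model's tensor names records whether a separate output tensor actually exists in the file (the new qs.has_output flag), and the type-selection pass then quantizes token_embd.weight like the output tensor only when no separate output tensor was found. This is an illustration, not llama.cpp code: quant_state, pick_type, and the fake_type enum are hypothetical stand-ins for quantize_state_internal, get_k_quant_type, and the real ggml types.

    // Standalone sketch of the two-pass "shared token_embd" detection.
    #include <cstdio>
    #include <string>
    #include <vector>

    enum fake_type { TYPE_Q4_K, TYPE_Q6_K }; // stand-ins for ggml quant types

    struct quant_state {
        bool has_output = false; // set in the first pass, mirrors qs.has_output
    };

    // Second pass: mirrors the fixed condition
    //   name == tn(LLM_TENSOR_OUTPUT, "weight") ||
    //   (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))
    static fake_type pick_type(const quant_state & qs, const std::string & name) {
        const bool is_output   = name == "output.weight";
        const bool is_tok_embd = name == "token_embd.weight";
        if (is_output || (!qs.has_output && is_tok_embd)) {
            return TYPE_Q6_K; // the "output tensor" quantization
        }
        return TYPE_Q4_K; // default for everything else in this sketch
    }

    int main() {
        // a model with tied embeddings: token_embd doubles as the output matrix
        std::vector<std::string> tied   = { "token_embd.weight", "blk.0.ffn_up.weight" };
        // a model with a separate output tensor
        std::vector<std::string> untied = { "token_embd.weight", "output.weight" };

        for (const auto * model : { &tied, &untied }) {
            quant_state qs;
            // first pass: detect a real output tensor in the file, instead of
            // consulting a static per-arch name table as the old code did
            for (const auto & name : *model) {
                if (name == "output.weight") {
                    qs.has_output = true;
                }
            }
            for (const auto & name : *model) {
                printf("%-24s -> %s\n", name.c_str(),
                       pick_type(qs, name) == TYPE_Q6_K ? "output-style quant" : "default quant");
            }
            printf("---\n");
        }
        return 0;
    }

The design point of the fix, as visible in the diff: the old condition asked whether the architecture's name table (LLM_TENSOR_NAMES) defines an output tensor at all, whereas the new condition asks whether an output tensor is actually present among the model's tensors. Checking the file contents also covers models whose architecture defines an output tensor name but whose weights tie token_embd to the output.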
Author: compilade