	gemma : use more bits for the token_embd.weight tensor (#5650)
* gemma : use Q8_0 for the token_embd.weight tensor
* llama : quantize token_embd.weight using output type
@@ -10498,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         return std::make_pair(i_layer, n_layer);
     };
 
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
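The key change is the second branch of the condition: for architectures whose tensor-name table defines no separate LLM_TENSOR_OUTPUT (Gemma reuses token_embd.weight as the output projection), the token embeddings are now routed through the same type selection as the output tensor. Below is a minimal, self-contained sketch of that decision; the enums, TENSOR_NAMES map, and use_output_quant function are simplified stand-ins for illustration, not the real llama.cpp definitions.

#include <iostream>
#include <map>
#include <set>
#include <string>

enum llm_arch   { LLM_ARCH_LLAMA, LLM_ARCH_GEMMA };
enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT };

// Which named tensors each architecture defines. Gemma has no separate
// output tensor: token_embd.weight doubles as the output projection.
static const std::map<llm_arch, std::set<llm_tensor>> TENSOR_NAMES = {
    { LLM_ARCH_LLAMA, { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT } },
    { LLM_ARCH_GEMMA, { LLM_TENSOR_TOKEN_EMBD } },
};

// Mirrors the new condition in the diff: a tensor gets the output-tensor
// quantization either because it is the output tensor, or because the
// arch defines no output tensor and token_embd.weight stands in for it.
static bool use_output_quant(llm_arch arch, const std::string & name) {
    const std::set<llm_tensor> & tensors = TENSOR_NAMES.at(arch);
    const bool has_output = tensors.count(LLM_TENSOR_OUTPUT) > 0;
    return name == "output.weight" ||
           (!has_output && name == "token_embd.weight");
}

int main() {
    // LLaMA: token_embd.weight is a plain embedding -> embedding quant (0)
    std::cout << use_output_quant(LLM_ARCH_LLAMA, "token_embd.weight") << "\n";
    // Gemma: token_embd.weight doubles as the output tensor -> output quant (1)
    std::cout << use_output_quant(LLM_ARCH_GEMMA, "token_embd.weight") << "\n";
    // Any arch: the output tensor itself always takes this path (1)
    std::cout << use_output_quant(LLM_ARCH_LLAMA, "output.weight") << "\n";
}

Once a tensor takes this branch, the surrounding code in get_k_quant_type applies the usual output-tensor rules, including the GGML_TYPE_Q8_0 fallback shown in the hunk, so Gemma's token embeddings end up with more bits than the default embedding quantization would give them.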