mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-29 08:41:22 +00:00 
			
		
		
		
	llama : only use Q6_K for output weights if tensor size is multiple of 256 (#1932)
* Only use Q6_K for output weights if tensor size is multiple of 256
* Fixed copy/paste mistake

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
		| @@ -2495,7 +2495,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | |||||||
|             if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K || |             if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K || | ||||||
|                 quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) { |                 quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) { | ||||||
|                 int nx = tensor.ne.at(0); |                 int nx = tensor.ne.at(0); | ||||||
|                 int ny = tensor.ne.at(0); |                 int ny = tensor.ne.at(1); | ||||||
|                 if (nx % QK_K != 0 || ny % QK_K != 0) { |                 if (nx % QK_K != 0 || ny % QK_K != 0) { | ||||||
|                     fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K); |                     fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K); | ||||||
|                     fprintf(stderr, "This is required to be able to use k-quants for now!\n"); |                     fprintf(stderr, "This is required to be able to use k-quants for now!\n"); | ||||||
| @@ -2504,7 +2504,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | |||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|             if (tensor.name == "output.weight") { |             if (tensor.name == "output.weight") { | ||||||
|  |                 int nx = tensor.ne.at(0); | ||||||
|  |                 int ny = tensor.ne.at(1); | ||||||
|  |                 if (nx % QK_K == 0 && ny % QK_K == 0) { | ||||||
|                     new_type = GGML_TYPE_Q6_K; |                     new_type = GGML_TYPE_Q6_K; | ||||||
|  |                 } | ||||||
|             } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { |             } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { | ||||||
|                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; |                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; | ||||||
|                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; |                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Kawrakow
					Kawrakow