	llama : restore intended k-quants mixes for MoE models (#4872)
* Restore intended k-quants quantization mixes for MoE models

* Update Q2_K_S values in the quantize tool

Still using LLaMA-v1 PPL values in the quant description today does not make much sense. But let's leave this update for another PR.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
 llama.cpp | 24 +++++++++++++++---------
 1 changed file with 15 additions and 9 deletions
diff --git a/llama.cpp b/llama.cpp
--- a/llama.cpp
+++ b/llama.cpp
@@ -2586,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -8955,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-    } else if (name.find("ffn_down.weight") != std::string::npos) {
+    } else if (name.find("ffn_down") != std::string::npos) {
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
@@ -8967,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                            use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
                 if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
@@ -8992,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
-    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    }
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // This can be used to reduce the size of the Q5_K_S model.
     // The associated PPL increase is fully in line with the size reduction
     //else {
@@ -9043,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9101,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         }
-        else if (name.find("ffn_down.weight") != std::string::npos) {
+        else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_feed_forward_w2;
         }
     }
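For readers not familiar with the quantization-mix code, the patch does two things: (a) it matches the bare "ffn_down" substring so that the per-expert feed-forward tensors of MoE models are counted and up-quantized the same way as a dense model's ffn_down.weight, and (b) it replaces hard-coded tensor-index thresholds ("< 2", "< 4") with fractions of n_feed_forward_w2, so the intended mix scales with the total number of ffn_down tensors. The following is a minimal standalone C++ sketch of those two points, not part of the commit; the per-expert tensor name and the layer/expert counts used here are illustrative assumptions, and the real logic lives in get_k_quant_type() in llama.cpp.

    // Sketch only: shows the effect of substring matching and proportional thresholds.
    #include <cstdio>
    #include <string>

    int main() {
        // 1) Substring matching: "ffn_down" also catches per-expert MoE tensors,
        //    which the old "ffn_down.weight" pattern missed.
        const std::string names[] = {
            "blk.0.ffn_down.weight",    // dense model tensor
            "blk.0.ffn_down.3.weight",  // hypothetical per-expert tensor in a MoE model
        };
        for (const auto & name : names) {
            const bool old_match = name.find("ffn_down.weight") != std::string::npos;
            const bool new_match = name.find("ffn_down")        != std::string::npos;
            printf("%-26s old pattern: %d  new pattern: %d\n",
                   name.c_str(), (int) old_match, (int) new_match);
        }

        // 2) Proportional thresholds: "first 1/16 of the ffn_down tensors" instead of
        //    a fixed "first 2", so the mix scales with the tensor count.
        const int n_dense = 32;      // e.g. 32-layer dense model: 32/16 = 2 (same as before)
        const int n_moe   = 8 * 32;  // e.g. 8 experts x 32 layers: 256/16 = 16
        printf("higher-bit ffn_down tensors: dense %d, MoE %d\n", n_dense/16, n_moe/16);
        return 0;
    }

With one ffn_down tensor per layer in a dense 32-layer model, the new n/16 threshold reproduces the old "first 2 tensors" behaviour, while a hypothetical 8-expert, 32-layer MoE model now promotes 16 of its 256 ffn_down tensors instead of only 2.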