mirror of https://github.com/ggml-org/llama.cpp.git
	llama : allow quantizing k-quants to fall back when tensor size incompatible (#3747)
* Allow quantizing k-quants to fall back when tensor size incompatible
* quantizing: Add warning when tensors were incompatible with k-quants
  Clean up k-quants state passing a bit
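The change in a nutshell: the k-quant formats (Q2_K..Q6_K) pack each tensor row into super-blocks of QK_K values, so a tensor whose first dimension is not a multiple of QK_K cannot be k-quantized. Previously that either forced F16/Q4_0 for two special tensors or aborted quantization; now every k-quant type falls back to a legacy format of similar or higher precision and the fallbacks are counted. Below is a minimal sketch of that selection logic, assuming ggml.h is on the include path and QK_K = 256 (64 in the GGML_QKK_64 build); the helper name k_quant_or_fallback is illustrative only, in the commit this logic lives inside get_k_quant_type().

#include <cstdint>
#include <stdexcept>
#include "ggml.h"            // enum ggml_type (GGML_TYPE_Q2_K ... GGML_TYPE_Q8_0)

#ifndef QK_K
#define QK_K 256             // k-quant super-block size (assumption: default build)
#endif

// Hypothetical helper mirroring the fallback behaviour added by this commit.
static ggml_type k_quant_or_fallback(ggml_type wanted, int64_t ne0,
                                     int & n_k_quantized, int & n_fallback) {
    if (ne0 % QK_K == 0) {           // row splits cleanly into super-blocks: keep the k-quant
        ++n_k_quantized;
        return wanted;
    }
    ggml_type fallback;
    switch (wanted) {                // map each k-quant to a legacy quant type
        case GGML_TYPE_Q2_K: fallback = GGML_TYPE_Q4_0; break;
        case GGML_TYPE_Q3_K: fallback = GGML_TYPE_Q4_1; break;
        case GGML_TYPE_Q4_K: fallback = GGML_TYPE_Q5_0; break;
        case GGML_TYPE_Q5_K: fallback = GGML_TYPE_Q5_1; break;
        case GGML_TYPE_Q6_K: fallback = GGML_TYPE_Q8_0; break;
        default: throw std::runtime_error("unsupported tensor size encountered");
    }
    ++n_fallback;                    // counted so a summary warning can be printed at the end
    return fallback;
}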
Changed file: llama.cpp (108 lines changed)
@@ -8049,6 +8049,24 @@ struct no_init {
     no_init() { /* do nothing */ }
 };
 
+struct quantize_state_internal {
+    const llama_model                 & model;
+    const llama_model_quantize_params * params;
+#ifdef GGML_USE_K_QUANTS
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    int i_attention_wv    = 0;
+    int i_feed_forward_w2 = 0;
+
+    int n_k_quantized     = 0;
+    int n_fallback        = 0;
+#endif
+    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+        {}
+};
+
 static void llama_convert_tensor_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -8109,12 +8127,13 @@ static void llama_convert_tensor_internal(
 
 #ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+    quantize_state_internal & qs,
+    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const auto tn = LLM_TN(model.arch);
+    const llm_arch arch = qs.model.arch;
+    const auto       tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
@@ -8122,7 +8141,7 @@ static ggml_type get_k_quant_type(
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
-        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
@@ -8131,46 +8150,46 @@ static ggml_type get_k_quant_type(
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (model.type == MODEL_70B) {
+                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_attention_wv;
+        ++qs.i_attention_wv;
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
            new_type = GGML_TYPE_Q5_K;
        }
-        ++*i_feed_forward_w2;
+        ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
+        if (arch != LLM_ARCH_FALCON) {
            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8197,20 +8216,23 @@ static ggml_type get_k_quant_type(
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
         }
     }
     if (convert_incompatible_tensor) {
-        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-        } else {
-            throw std::runtime_error("Unsupported tensor size encountered\n");
+        switch (new_type) {
+            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
+            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
     }
 
     return new_type;
@@ -8268,6 +8290,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
 
+    struct quantize_state_internal qs(model, params);
+
     if (params->only_copy) {
         ftype = model.ftype;
     }
@@ -8281,9 +8305,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
@@ -8291,19 +8312,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++n_attention_wv;
+            ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
+            ++qs.n_feed_forward_w2;
         }
     }
-    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
     }
-
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
 #endif
 
     size_t total_size_org = 0;
@@ -8370,9 +8388,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(
-                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-            );
+            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -8498,6 +8514,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("\n");
         }
     }
+#ifdef GGML_USE_K_QUANTS
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
+#endif
 }
 
 static int llama_apply_lora_from_file_internal(
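When fallbacks occur, the per-tensor warning and the end-of-run summary added above combine into output along these lines (purely illustrative: the tensor shape and counts are invented here, only the format strings come from the diff):

get_k_quant_type : tensor cols 4544 x 18176 are not divisible by 256, required for q4_K - using fallback quantization q5_0
llama_model_quantize_internal: WARNING: 1 of 291 tensor(s) incompatible with k-quants and required fallback quantization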
Author: Kerfuffle