Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	llama-quant: add support for mmproj (#16592)
* llama-quant: add support for mmproj
* Update src/llama.cpp
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* check prefix instead
* small fix
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
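In practice this means an mmproj (CLIP) GGUF can now be passed to the quantizer. Below is a minimal sketch of doing this through the public C API via llama_model_quantize, the same entry point the llama-quantize tool calls; the file names are hypothetical placeholders:

#include "llama.h"
#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q8_0; // target quantization type
    params.nthread = 4;                        // worker threads

    // With this commit, a CLIP/mmproj GGUF is accepted here as well;
    // previously only text-model GGUFs could be quantized.
    // llama_model_quantize returns 0 on success.
    if (llama_model_quantize("mmproj-model-f16.gguf", "mmproj-model-q8_0.gguf", &params) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}

The diff below registers a dummy LLM_ARCH_CLIP architecture so the loader recognizes such files, skips hparam loading for them, and relaxes the quantization sanity checks for models that have no text-attention tensors.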
@@ -5,6 +5,7 @@
 #include <map>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,             "clip"             }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA,            "llama"            },
     { LLM_ARCH_LLAMA4,           "llama4"           },
     { LLM_ARCH_DECI,             "deci"             },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {

@@ -9,6 +9,7 @@
 //
 
 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,

@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }
 
@@ -20013,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:

@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;

@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
         } catch(const std::exception & e) {
             throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
         }
+        if (model.arch == LLM_ARCH_CLIP) {
+            throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+        }
         try {
             model.load_vocab(ml);
         } catch(const std::exception & e) {
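The "check prefix instead" fix relies on the std::string::rfind(prefix, 0) idiom: rfind with start position 0 can only match at the beginning of the string, so the comparison against 0 acts as a starts-with test on the "mm." tensor-name prefix used by multimodal projector tensors. A standalone sketch of that detection logic, with made-up tensor names for illustration:

#include <cassert>
#include <string>
#include <vector>

// Returns true if any tensor name begins with "mm." (the mmproj tensor prefix).
static bool has_mmproj_tensors(const std::vector<std::string> & names) {
    bool is_clip_model = false;
    for (const auto & name : names) {
        // rfind(prefix, 0) can only match at index 0, i.e. a starts-with check
        is_clip_model |= name.rfind("mm.", 0) == 0;
    }
    return is_clip_model;
}

int main() {
    // hypothetical tensor names, not taken from a real GGUF
    assert( has_mmproj_tensors({"mm.model.fc.weight", "v.blk.0.attn_q.weight"}));
    assert(!has_mmproj_tensors({"blk.0.attn_q.weight", "output.weight"}));
    return 0;
}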