Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)

	model : add PLaMo-2 support (#14560)
* Add PLaMo-2 model using hybrid memory module
* Fix z shape
* Add cmath to the includes of llama-vocab.h
* Explicitly dequantize normalization weights before applying RoPE
* Revert unnecessary cast, because the problem can be solved by excluding attn_k and attn_q from quantization
* Use ATTN_K/Q_NORM for the k,q norm weights to prevent them from being quantized
* Remove SSM_BCDT, which is not used anywhere
* Do not duplicate embedding weights for output.weight
* Fix tokenizer encoding problem for multibyte strings
* Apply suggestion from @CISC
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Update src/llama-model.cpp
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Use LLM_FFN_SWIGLU instead of splitting ffn_gate and ffn_up
* Remove unnecessary code for Grouped Query Attention
* Fix how the special token IDs are loaded into the GGUF file
* Remove unused tensor mapping
* Update src/llama-model.cpp
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Remove the llama_vocab_plamo2 class and replace it with llm_tokenizer_plamo2_session to follow the other tokenizer implementations
* Update src/llama-vocab.cpp
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update convert_hf_to_gguf.py
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Update src/llama-model.cpp
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Update src/llama-model.cpp
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Update convert_hf_to_gguf.py
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Update convert_hf_to_gguf.py
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Fix the plamo2 tokenizer session to prevent multiple calls of build()

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
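
One of the changes above replaces a split ffn_gate/ffn_up pair with the fused LLM_FFN_SWIGLU path. As a rough, self-contained sketch of what a SwiGLU feed-forward computes (illustrative only, not llama.cpp's implementation; the function name and the gate-rows-then-up-rows layout of the fused weight are assumptions):

#include <cmath>
#include <cstddef>
#include <vector>

// SiLU activation: x * sigmoid(x)
static float silu(float x) { return x / (1.0f + std::exp(-x)); }

// SwiGLU hidden state: silu(x . W_gate) * (x . W_up), with W_gate and W_up
// stored as one fused [2*n_ff, n_embd] matrix (gate rows first, then up rows).
static std::vector<float> swiglu_hidden(const std::vector<float> & x,
                                        const std::vector<float> & w_gate_up,
                                        int n_embd, int n_ff) {
    std::vector<float> h(n_ff);
    for (int i = 0; i < n_ff; ++i) {
        float gate = 0.0f;
        float up   = 0.0f;
        for (int j = 0; j < n_embd; ++j) {
            gate += w_gate_up[(std::size_t)  i          * n_embd + j] * x[j];
            up   += w_gate_up[(std::size_t) (i + n_ff)  * n_embd + j] * x[j];
        }
        h[i] = silu(gate) * up; // the down projection would follow
    }
    return h;
}

Keeping gate and up in one fused tensor means a single projection and one fewer tensor mapping per layer, which is presumably why the fused path is preferred here.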
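The last bullet ("prevent multiple calls of build()") is the build-exactly-once pattern. A minimal sketch of one way to express it in C++ (the type and method names are invented for illustration and are not the ones used in llama.cpp; the actual fix may simply check a flag):

#include <mutex>

struct tokenizer_session_sketch {
    std::once_flag build_flag;

    void build() {
        // expensive one-time construction of tokenizer tables would go here
    }

    void ensure_built() {
        // std::call_once guarantees build() runs at most once, no matter
        // how many code paths call ensure_built()
        std::call_once(build_flag, [this] { build(); });
    }
};
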
@@ -34,6 +34,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_PHI3,             "phi3"             },
     { LLM_ARCH_PHIMOE,           "phimoe"           },
     { LLM_ARCH_PLAMO,            "plamo"            },
+    { LLM_ARCH_PLAMO2,           "plamo2"           },
     { LLM_ARCH_CODESHELL,        "codeshell"        },
     { LLM_ARCH_ORION,            "orion"            },
     { LLM_ARCH_INTERNLM2,        "internlm2"        },
@@ -784,6 +785,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PLAMO2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_SSM_IN,          "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D,      "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_X,           "blk.%d.ssm_x" },
+            { LLM_TENSOR_SSM_DT,          "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A,           "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D,           "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
+            { LLM_TENSOR_SSM_DT_NORM,     "blk.%d.ssm_dt_norm" },
+            { LLM_TENSOR_SSM_B_NORM,      "blk.%d.ssm_b_norm" },
+            { LLM_TENSOR_SSM_C_NORM,      "blk.%d.ssm_c_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_CODESHELL,
         {
@@ -2094,6 +2125,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_JAMBA:
         case LLM_ARCH_FALCON_H1:
+        case LLM_ARCH_PLAMO2:
         case LLM_ARCH_GRANITE_HYBRID:
         case LLM_ARCH_LFM2:
             return true;
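
The "blk.%d.*" entries in the table above are printf-style templates: the loader substitutes the layer index to get the per-layer tensor name (a suffix such as ".weight" is then appended). A tiny self-contained sketch of that expansion, with a helper name that is ours rather than llama.cpp's:

#include <cstdio>
#include <string>

// Expand a "blk.%d.*" template for a given layer index.
static std::string tensor_base_name(const char * templ, int layer) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), templ, layer);
    return std::string(buf);
}

// e.g. tensor_base_name("blk.%d.ssm_conv1d", 3) -> "blk.3.ssm_conv1d"
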
Author: Shunta Saito