	model: support GLM 4.5 family of models (#14939)
* model: Add GLM 4.5 (#14921)

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Merge in PR suggestions

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* model: Add GLM 4.5 family of models (#14921)

  1. Updated tensor_mapping.py with NextN tensor mappings
     - Added proper tensor mappings for all NextN/MTP tensors in /Users/samm/git/llama.cpp/gguf-py/gguf/tensor_mapping.py
     - Added mappings for: eh_proj, embed_tokens, enorm, hnorm, shared_head.head, shared_head.norm

  2. Added num_nextn_predict_layers configuration
     - Added LLM_KV_NUM_NEXTN_PREDICT_LAYERS constant to llama-arch.h and llama-arch.cpp
     - Added num_nextn_predict_layers field to llama_hparams struct
     - Updated GLM4_MOE parameter loading in llama-model.cpp to read this parameter
     - Modified tensor loading logic to conditionally load NextN tensors based on num_nextn_predict_layers
     - Added GGUF writer support in gguf_writer.py with add_num_nextn_predict_layers() method
     - Updated conversion script to extract and write this parameter from HuggingFace config

  3. Added FIM tokens for GLM4_MOE
     - Added GLM-4.5's FIM tokens to llama-vocab.cpp:
       - <|code_prefix|> for FIM_PRE
       - <|code_suffix|> for FIM_SUF
       - <|code_middle|> for FIM_MID

  4. Removed manual NextN tensor handling
     - Removed the special-case handling in convert_hf_to_gguf.py that manually mapped NextN tensors
     - NextN tensors are now handled automatically through the proper tensor mapping system

* glm 4.5 update tensors names

* model: glm 4.5 apply suggestions from code review

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* model: glm 4.5 apply suggestions from code review

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* model: glm 4.5 apply suggestions from code review

* Apply suggestions from code review

* patch broken chat template

* typings fix

* add TENSOR_SKIP flag

  Co-authored-by: Diego Devesa <slarengh@gmail.com>

* Update src/llama-model-loader.h

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Diego Devesa <slarengh@gmail.com>
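Item 2 above is the core of the loader change: a GGUF metadata key records how many trailing blocks are NextN/MTP blocks, and the tensors in those blocks are kept in the file but flagged so they are not used for inference (the TENSOR_SKIP flag mentioned in the later commits). A minimal sketch of that idea, using illustrative names (ModelMeta, TensorFlags, is_nextn_layer) rather than the actual llama.cpp types:

```cpp
// Sketch only: not the actual llama.cpp implementation.
// Illustrates how a nextn_predict_layers count can classify the trailing
// blocks as NextN/MTP blocks whose tensors are loaded but skipped.
#include <cstdint>
#include <cstdio>

struct ModelMeta {
    uint32_t n_layer                = 0; // total block count, including MTP blocks
    uint32_t n_nextn_predict_layers = 0; // value of "%s.nextn_predict_layers", 0 if absent
};

enum TensorFlags : uint32_t {
    TENSOR_DEFAULT = 0,
    TENSOR_SKIP    = 1 << 0, // keep the tensor in the file, but build no graph for it
};

// A block is a NextN/MTP block if it falls in the last n_nextn_predict_layers blocks.
static bool is_nextn_layer(const ModelMeta & meta, uint32_t il) {
    return meta.n_nextn_predict_layers > 0 &&
           il >= meta.n_layer - meta.n_nextn_predict_layers;
}

int main() {
    ModelMeta meta;
    meta.n_layer                = 12; // illustrative values only
    meta.n_nextn_predict_layers = 1;

    for (uint32_t il = 0; il < meta.n_layer; ++il) {
        const uint32_t flags = is_nextn_layer(meta, il) ? TENSOR_SKIP : TENSOR_DEFAULT;
        std::printf("blk.%u: %s\n", il,
                    (flags & TENSOR_SKIP) ? "NextN/MTP tensors skipped" : "regular layer");
    }
    return 0;
}
```

Keeping the tensors in the converted file while skipping them at load time leaves room for a future multi-token-prediction implementation without requiring another conversion, which matches the "reserved for future MTP support" comment in the diff below.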
src/llama-arch.cpp

@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
     { LLM_ARCH_CHATGLM,          "chatglm"          },
     { LLM_ARCH_GLM4,             "glm4"             },
+    { LLM_ARCH_GLM4_MOE,         "glm4moe"          },
     { LLM_ARCH_BITNET,           "bitnet"           },
     { LLM_ARCH_T5,               "t5"               },
     { LLM_ARCH_T5ENCODER,        "t5encoder"        },
@@ -127,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_NORM,               "%s.expert_weights_norm"               },
     { LLM_KV_EXPERT_GATING_FUNC,                "%s.expert_gating_func"                },
     { LLM_KV_MOE_EVERY_N_LAYERS,                "%s.moe_every_n_layers"                },
+    { LLM_KV_NEXTN_PREDICT_LAYERS,              "%s.nextn_predict_layers"              },
     { LLM_KV_POOLING_TYPE,                      "%s.pooling_type"                      },
     { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
     { LLM_KV_DECODER_START_TOKEN_ID,            "%s.decoder_start_token_id"            },
@@ -1391,6 +1393,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GLM4_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM,     "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,    "blk.%d.exp_probs_b" },
+            // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
+            { LLM_TENSOR_NEXTN_EH_PROJ,      "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM,        "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM,        "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -2181,6 +2217,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
+    // These tensors only exist in the last layer(s) and are treated as output tensors
+    {LLM_TENSOR_NEXTN_EH_PROJ,              {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS,         {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
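The two name tables added above are printf-style format strings: in LLM_KV_NAMES the "%s" is filled with the architecture name when resolving a metadata key, and in LLM_TENSOR_NAMES the "%d" is filled with the block index when resolving a per-layer tensor name. A rough sketch of that expansion, with kv_name and tensor_name as illustrative helpers rather than the llama.cpp API:

```cpp
// Sketch of how the format strings above expand; not the llama.cpp API.
#include <cstdio>
#include <string>

// "%s.nextn_predict_layers" + "glm4moe" -> "glm4moe.nextn_predict_layers"
static std::string kv_name(const char * fmt, const char * arch) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, arch);
    return buf;
}

// "blk.%d.nextn.eh_proj" + block index 46 -> "blk.46.nextn.eh_proj"
static std::string tensor_name(const char * fmt, int il) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, il);
    return buf;
}

int main() {
    // The block index 46 is illustrative only; the commit message notes the
    // NextN block sits at a dynamic layer number at the end of the model.
    std::printf("%s\n", kv_name("%s.nextn_predict_layers", "glm4moe").c_str());
    std::printf("%s\n", tensor_name("blk.%d.nextn.eh_proj", 46).c_str());
    std::printf("%s\n", tensor_name("blk.%d.nextn.shared_head_head", 46).c_str());
    return 0;
}
```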