Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
model : add grok-2 support (#15539)
* add grok-2 support
* type fix
* type fix
* type fix
* "fix" vocab for invalid sequences
* fix expert tensor mapping and spaces in vocab
* add chat template
* fix norm tensor mapping
* rename layer_out_norm to ffn_post_norm
* ensure ffn_post_norm is mapped
* fix experts merging
* remove erroneous FFN_GATE entry
* concatenate split tensors and add more metadata
* process all expert layers and try cat instead of hstack
* add support for community BPE vocab
* fix expert feed forward length and ffn_down concat
* commit this too
* add ffn_up/gate/down, unsure if sequence is right
* add ffn_gate/down/up to tensor names
* correct residual moe (still not working)
* mess--
* fix embedding scale being applied twice
* add built in chat template
* change beta fast for grok if default value
* remove spm vocab in favor of community bpe vocab
* change attention temp length metadata type to integer
* update attention temp length metadata
* remove comment
* replace M_SQRT2 with std::sqrt(2)
* add yarn metadata, move defaults to hparams
@@ -136,6 +136,7 @@ class TensorNameMap:
             "model.layers.{bid}.norm", # mamba-qbert
             "backbone.layers.{bid}.norm", # mamba
             "transformer.decoder_layer.{bid}.rms_norm", # Grok
+            "model.layers.{bid}.pre_attn_norm", # grok-2
             "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
             "encoder.layers.{bid}.input_layernorm", # chatglm
             "transformer.layers.{bid}.attn_norm", # openelm
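Each entry in these tuples is a source-model tensor-name template, and `{bid}` stands for the block (layer) index. Below is a minimal sketch of how such templates expand into a flat source-name to GGUF-name lookup, simplified from what `TensorNameMap` builds internally; the GGUF-side template `blk.{bid}.attn_norm` and the `build_mapping` helper here are illustrative only, not the repo's API.

```python
# Simplified illustration of expanding "{bid}" templates into a flat
# lookup table; the helper below is hypothetical, not gguf-py's API.
attn_norm_sources = (
    "transformer.decoder_layer.{bid}.rms_norm",  # Grok
    "model.layers.{bid}.pre_attn_norm",          # grok-2 (added in this commit)
)

def build_mapping(n_blocks: int) -> dict[str, str]:
    mapping: dict[str, str] = {}
    for bid in range(n_blocks):
        gguf_name = "blk.{bid}.attn_norm".format(bid=bid)
        for tmpl in attn_norm_sources:
            # every source-model spelling of the tensor maps to one GGUF name
            mapping[tmpl.format(bid=bid)] = gguf_name
    return mapping

print(build_mapping(2)["model.layers.1.pre_attn_norm"])  # -> blk.1.attn_norm
```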
@@ -278,6 +279,7 @@ class TensorNameMap:
             "transformer.layer.{bid}.sa_layer_norm", # distillbert
             "encoder.layers.{bid}.norm1", # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
+            "model.layers.{bid}.post_attn_norm", # grok-2
             "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
         ),

@@ -313,6 +315,7 @@ class TensorNameMap:
             "h.{bid}.ln_2", # gpt2
             "model.layers.{bid}.ffn_norm", # internlm2
             "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
+            "model.layers.{bid}.pre_moe_norm", # grok-2
             "encoder.layers.{bid}.post_attention_layernorm", # chatglm
             "transformer.layers.{bid}.ffn_norm", # openelm
             "model.layers.{bid}.pre_ff_layernorm", # jamba granite-hybrid
@@ -333,11 +336,12 @@

         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
             "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
             "layers.{bid}.post_feedforward_layernorm", # embeddinggemma
             "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
             "model.layers.layers.{bid}.post_mlp_norm.weight", # plamo2
             "model.layers.{bid}.feed_forward.up_proj",
+            "model.layers.{bid}.post_moe_norm", # grok-2
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
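Taken together, the four hunks register the grok-2 spellings of the pre/post attention and pre/post MoE norms. A hedged sketch of checking the result with gguf-py from this repo follows; it assumes the `gguf` package under `gguf-py` is importable, that grok-2 checkpoints convert under `MODEL_ARCH.GROK` (the arch this PR extends), and `n_blocks=64` is just an example value.

```python
# Resolve the new grok-2 tensor names through gguf-py's TensorNameMap.
# Assumes the gguf-py package from this repo is on sys.path and that
# grok-2 reuses MODEL_ARCH.GROK; n_blocks=64 is an arbitrary example.
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

name_map = get_tensor_name_map(MODEL_ARCH.GROK, n_blocks=64)

for hf_name in (
    "model.layers.0.pre_attn_norm.weight",   # attention norm
    "model.layers.0.post_attn_norm.weight",  # post-attention norm
    "model.layers.0.pre_moe_norm.weight",    # FFN norm
    "model.layers.0.post_moe_norm.weight",   # post-FFN (post-MoE) norm
):
    # get_name() retries without the trailing suffix and re-appends it on a
    # match, e.g. "model.layers.0.pre_attn_norm.weight" -> "blk.0.attn_norm.weight"
    print(hf_name, "->", name_map.get_name(hf_name, try_suffixes=(".weight", ".bias")))
```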