model : add BailingMoeV2 support (#16063)

* add BailingMoeV2 support

* update llm types

* undo

* undo

* update llm types

* add model collection link

* update

* almost working

* correct group selection and rename n_group_exp

* avoid large top_k and use argmax instead for now

if we had something like argmax2 that would be equivalent, but this works fine until then

* poke

* skip group selection when there are no tokens

* fix 1T conversion

* hopefully fixed expert group selection

third time's the charm?

* make expert group selection generally available

The new LLaDA2Moe model uses this method too, make it generally available regardless of architecture.

* allow n_expert_groups to be 1 (Kimi K2)

* address review suggestions
This commit is contained in:
Sigbjørn Skjæret
2025-10-20 21:38:20 +02:00
committed by GitHub
parent c9c1972e2c
commit 84bf3c6778
15 changed files with 521 additions and 10 deletions

View File

@@ -174,6 +174,7 @@ class TensorNameMap:
"h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
"model.layers.{bid}.self_attn.query_key_value", # persimmon
"model.layers.{bid}.attention.query_key_value", # bailingmoe2
"h.{bid}.attn.c_attn", # gpt2
"transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
@@ -260,6 +261,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.out_proj", # gpt-j
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
"model.layers.{bid}.self_attn.dense", # persimmon
"model.layers.{bid}.attention.dense", # bailingmoe2
"h.{bid}.attn.c_proj", # gpt2
"transformer.h.{bid}.mixer.out_proj", # phi2
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
@@ -373,6 +375,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_EXP_PROBS_B: (
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
"model.layers.{bid}.mlp.moe_statics.e_score_correction", # ernie4.5-moe
"model.layers.{bid}.mlp.gate.expert_bias", # bailingmoe2
"model.layers.{bid}.feed_forward.expert_bias", # lfm2moe
),
@@ -549,6 +552,7 @@ class TensorNameMap:
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.query_layernorm", # hunyuan
"model.layers.{bid}.attention.query_layernorm", # bailingmoe2
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
"layers.{bid}.self_attn.q_norm", # embeddinggemma
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
@@ -563,6 +567,7 @@ class TensorNameMap:
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.key_layernorm", # hunyuan
"model.layers.{bid}.attention.key_layernorm", # bailingmoe2
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
"layers.{bid}.self_attn.k_norm", # embeddinggemma
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
@@ -584,6 +589,7 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
"encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
"encoder.layer.{bid}.layer_norm_2", # jina-v2-code
"model.layers.{bid}.final_layernorm", # bailingmoe2
),
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (