Merge branch 'master' into compilade/refactor-kv-cache

This commit is contained in:
Francis Couture-Harpin
2025-07-09 09:38:48 -04:00
9 changed files with 608 additions and 17 deletions

View File

@@ -288,12 +288,14 @@ class TensorNameMap:
# Pre feed-forward norm
MODEL_TENSOR.FFN_PRE_NORM: (
"model.layers.{bid}.pre_feedforward_layernorm", # gemma2
"model.layers.{bid}.pre_ff_layernorm.weight",
),
# Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
"model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
"model.layers.{bid}.feed_forward.up_proj",
),
MODEL_TENSOR.FFN_GATE_INP: (
@@ -367,6 +369,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
"model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
"model.layers.{bid}.feed_forward.down_proj",
"model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
),
@@ -559,13 +562,13 @@ class TensorNameMap:
MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj", # mamba-hf
"backbone.layers.{bid}.mixer.in_proj", # mamba
"model.layers.{bid}.mamba.in_proj", # jamba
"model.layers.{bid}.mamba.in_proj", # jamba falcon-h1
),
MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d", # mamba-hf
"backbone.layers.{bid}.mixer.conv1d", # mamba
"model.layers.{bid}.mamba.conv1d", # jamba
"model.layers.{bid}.mamba.conv1d", # jamba falcon-h1
),
MODEL_TENSOR.SSM_X: (
@@ -577,7 +580,7 @@ class TensorNameMap:
MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj", # mamba-hf
"backbone.layers.{bid}.mixer.dt_proj", # mamba
"model.layers.{bid}.mamba.dt_proj", # jamba
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1
),
MODEL_TENSOR.SSM_DT_NORM: (
@@ -587,7 +590,7 @@ class TensorNameMap:
MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log", # mamba-hf
"backbone.layers.{bid}.mixer.A_log", # mamba
"model.layers.{bid}.mamba.A_log", # jamba
"model.layers.{bid}.mamba.A_log", # jamba falcon-h1
),
MODEL_TENSOR.SSM_B_NORM: (
@@ -603,17 +606,18 @@ class TensorNameMap:
MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D", # mamba-hf
"backbone.layers.{bid}.mixer.D", # mamba
"model.layers.{bid}.mamba.D", # jamba
"model.layers.{bid}.mamba.D", # jamba falcon-h1
),
MODEL_TENSOR.SSM_NORM: (
"model.layers.{bid}.mamba.norm", # falcon-h1
"backbone.layers.{bid}.mixer.norm", # mamba2
),
MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj", # mamba-hf
"backbone.layers.{bid}.mixer.out_proj", # mamba
"model.layers.{bid}.mamba.out_proj", # jamba
"model.layers.{bid}.mamba.out_proj", # jamba falcon-h1
),
MODEL_TENSOR.TIME_MIX_W0: (