Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	Merge branch 'master' into compilade/mamba2
Francis Couture-Harpin
@@ -87,6 +87,9 @@ class TensorNameMap:
             "rope.freqs",  # llama-pth
             "rotary_pos_emb.inv_freq",  # chatglm
         ),
+
+        MODEL_TENSOR.ROPE_FACTORS_LONG: (),
+        MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
     }
 
     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -251,11 +254,12 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate",             # mixtral
-            "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
-            "model.layers.{bid}.mlp.gate",                # qwen2moe olmoe
-            "transformer.decoder_layer.{bid}.router",     # Grok
-            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
+            "layers.{bid}.feed_forward.gate",                   # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate",         # mixtral
+            "model.layers.{bid}.mlp.gate",                      # qwen2moe olmoe
+            "transformer.decoder_layer.{bid}.router",           # Grok
+            "transformer.blocks.{bid}.ffn.router.layer",        # dbrx
+            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
         ),
 
         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -364,10 +368,11 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
-            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe olmoe (merged)
+            "layers.{bid}.feed_forward.experts.w2",              # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_1",      # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",       # dbrx
+            "model.layers.{bid}.mlp.experts.down_proj",          # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
         ),
 
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
@@ -378,7 +383,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe
+            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon
             "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm",                           # openelm
@@ -387,7 +392,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe
+            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon
             "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm",                           # openelm
@@ -681,6 +686,15 @@ class TensorNameMap:
         MODEL_TENSOR.ENC_OUTPUT_NORM: (
             "encoder.final_layer_norm", # t5
         ),
+
+        MODEL_TENSOR.CLS: (
+            "classifier",       # jina
+            "classifier.dense", # roberta
+        ),
+
+        MODEL_TENSOR.CLS_OUT: (
+            "classifier.out_proj", # roberta
+        ),
     }
 
     # architecture-specific block mappings
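The patterns above are keyed by a MODEL_TENSOR value and use {bid} as a block-index placeholder. Below is a minimal sketch, not the commit's code, of how block mappings of this shape can be expanded into a lookup table from checkpoint tensor name to target tensor and block index; it assumes a simplified stand-in enum and omits the per-architecture filtering and name-suffix handling that the real TensorNameMap performs.

from enum import Enum, auto

class MODEL_TENSOR(Enum):  # simplified stand-in for gguf's MODEL_TENSOR
    FFN_GATE_INP = auto()
    FFN_DOWN_EXP = auto()

# a small subset of block mappings, in the same shape as the diff above
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
    MODEL_TENSOR.FFN_GATE_INP: (
        "model.layers.{bid}.block_sparse_moe.gate",          # mixtral
        "model.layers.{bid}.block_sparse_moe.router.layer",  # granitemoe
    ),
    MODEL_TENSOR.FFN_DOWN_EXP: (
        "model.layers.{bid}.mlp.experts.down_proj",          # qwen2moe olmoe (merged)
        "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
    ),
}

def build_mapping(n_blocks: int) -> dict[str, tuple[MODEL_TENSOR, int]]:
    # expand {bid} for every block index so lookups become a plain dict access
    mapping: dict[str, tuple[MODEL_TENSOR, int]] = {}
    for tensor, patterns in block_mappings_cfg.items():
        for pattern in patterns:
            for bid in range(n_blocks):
                mapping[pattern.format(bid=bid)] = (tensor, bid)
    return mapping

mapping = build_mapping(n_blocks=32)
print(mapping["model.layers.3.block_sparse_moe.router.layer"])
# (<MODEL_TENSOR.FFN_GATE_INP: 1>, 3)

Under this scheme, adding one pattern line per new architecture (granitemoe here) is enough for conversion to recognize that architecture's tensor names.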