mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	convert : converting mmproj for Qwen2/2.5VL from convert_hf_to_gguf (#13209)
* wip
* qwen2.5vl ok
* vision: fix models missing "text_config"
* add test
* fix test repo name
* fix 32B model
* Revert "fix 32B model"
This reverts commit 651752f1ae.
* clarify about 32B
* rm qwen surgery script
* update llava/readme
* move V_ENC_EMBD_PATCH handling to Qwen2VLVisionModel
			
			
This commit is contained in:
		| @@ -896,6 +896,7 @@ class TensorNameMap: | ||||
|  | ||||
|         MODEL_TENSOR.V_MMPROJ: ( | ||||
|             "multi_modal_projector.linear_{bid}", | ||||
|             "visual.merger.mlp.{bid}", # qwen2vl | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_MMPROJ_FC: ( | ||||
| @@ -919,6 +920,7 @@ class TensorNameMap: | ||||
|             "vpm.embeddings.patch_embedding", | ||||
|             "model.vision_model.embeddings.patch_embedding", # SmolVLM | ||||
|             "vision_tower.patch_conv", # pixtral | ||||
|             "visual.patch_embed.proj", # qwen2vl | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_ENC_EMBD_POS: ( | ||||
| @@ -932,6 +934,7 @@ class TensorNameMap: | ||||
|             "vpm.encoder.layers.{bid}.self_attn.q_proj", | ||||
|             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM | ||||
|             "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral | ||||
|             "visual.blocks.{bid}.attn.q", # qwen2vl, generated | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_ENC_ATTN_K: ( | ||||
| @@ -939,6 +942,7 @@ class TensorNameMap: | ||||
|             "vpm.encoder.layers.{bid}.self_attn.k_proj", | ||||
|             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM | ||||
|             "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral | ||||
|             "visual.blocks.{bid}.attn.k", # qwen2vl, generated | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_ENC_ATTN_V: ( | ||||
| @@ -946,6 +950,7 @@ class TensorNameMap: | ||||
|             "vpm.encoder.layers.{bid}.self_attn.v_proj", | ||||
|             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM | ||||
|             "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral | ||||
|             "visual.blocks.{bid}.attn.v", # qwen2vl, generated | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_ENC_INPUT_NORM: ( | ||||
| @@ -953,6 +958,7 @@ class TensorNameMap: | ||||
|             "vpm.encoder.layers.{bid}.layer_norm1", | ||||
|             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM | ||||
|             "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral | ||||
|             "visual.blocks.{bid}.norm1", # qwen2vl | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_ENC_OUTPUT: ( | ||||
| @@ -960,6 +966,7 @@ class TensorNameMap: | ||||
|             "vpm.encoder.layers.{bid}.self_attn.out_proj", | ||||
|             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM | ||||
|             "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral | ||||
|             "visual.blocks.{bid}.attn.proj", # qwen2vl | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( | ||||
| @@ -967,17 +974,24 @@ class TensorNameMap: | ||||
|             "vpm.encoder.layers.{bid}.layer_norm2", | ||||
|             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM | ||||
|             "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral | ||||
|             "visual.blocks.{bid}.norm2", # qwen2vl | ||||
|         ), | ||||
|  | ||||
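|         # NOTE: some names below look swapped because the original llava code swapped fc1 and fc2, | ||||
|         # so entries marked "name is swapped" map fc2 to FFN_UP and fc1 to FFN_DOWN. | ||||
|         # We have no better way to fix this, so be careful when adding new entries. | ||||
|         # Newer models like pixtral use the correct naming (up_proj / down_proj). | ||||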
|         MODEL_TENSOR.V_ENC_FFN_UP: ( | ||||
|             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", | ||||
|             "vpm.encoder.layers.{bid}.mlp.fc1", | ||||
|             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped) | ||||
|             "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral | ||||
|             "visual.blocks.{bid}.mlp.fc2", # qwen2vl | ||||
|             "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_ENC_FFN_GATE: ( | ||||
|             "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral | ||||
|             "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_ENC_FFN_DOWN: ( | ||||
| @@ -985,6 +999,8 @@ class TensorNameMap: | ||||
|             "vpm.encoder.layers.{bid}.mlp.fc2", | ||||
|             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped) | ||||
|             "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral | ||||
|             "visual.blocks.{bid}.mlp.fc1", # qwen2vl | ||||
|             "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_PRE_NORM: ( | ||||
| @@ -995,6 +1011,7 @@ class TensorNameMap: | ||||
|         MODEL_TENSOR.V_POST_NORM: ( | ||||
|             "vision_tower.vision_model.post_layernorm", | ||||
|             "model.vision_model.post_layernorm", # SmolVLM | ||||
|             "visual.merger.ln_q", # qwen2vl | ||||
|         ), | ||||
|  | ||||
|         MODEL_TENSOR.V_MM_INP_PROJ: ( | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Xuan-Son Nguyen
					Xuan-Son Nguyen