	llava : add MobileVLM_V2 backup (#6175)
* Add MobileVLM_V2 backup
* Update MobileVLM-README.md
* Update examples/llava/MobileVLM-README.md

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update examples/llava/convert-image-encoder-to-gguf.py

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* clip : fix whitespace
* fix definition mistake in clip.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -1,11 +1,13 @@
 # MobileVLM
 
-Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
+Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.
 
 for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
 
 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
 
+Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the model conversion process is a little different. Therefore, using **MobileVLM-1.7B** as an example, the differing conversion step is shown below.
+
 ## Usage
 Build with cmake or run `make llava-cli` to build it.
 
@@ -34,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```
 
-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
 
 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf \
@@ -44,6 +46,14 @@ python ./examples/llava/convert-image-encoder-to-gguf \
     --projector-type ldp
 ```
 
+```sh
+python ./examples/llava/convert-image-encoder-to-gguf \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B_V2 \
+    --projector-type ldpv2
+```
+
 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
 
 ```sh
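Aside: `llava-surgery.py` is the same for both variants, so the only user-facing difference in the README above is the `--projector-type` flag. Below is a hedged helper sketch (not part of this commit) for picking the right value by inspecting the `llava.projector` file produced by the surgery step; it assumes that file is a torch-saved dict keyed by the original checkpoint tensor names, with `peg` keys appearing only in MobileVLM_V2 projectors and `mb_block` keys only in v1.

```python
# Hypothetical helper, not part of this commit: guess the --projector-type
# argument by looking at the tensor names stored in llava.projector.
# Assumption: llava-surgery.py saved a dict of {checkpoint_name: tensor}, and
# MobileVLM_V2 projector names contain "peg" while v1 names contain "mb_block".
import sys

import torch


def guess_projector_type(projector_path: str) -> str:
    tensors = torch.load(projector_path, map_location="cpu")
    names = list(tensors.keys())
    if any("peg" in n for n in names):
        return "ldpv2"   # MobileVLM_V2 -> LDPv2 projector
    if any("mb_block" in n for n in names):
        return "ldp"     # MobileVLM v1 -> LDP projector
    return "mlp"         # plain LLaVA-style MLP projector


if __name__ == "__main__":
    # e.g. python guess_projector.py path/to/MobileVLM-1.7B_V2/llava.projector
    print(guess_projector_type(sys.argv[1]))
```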
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -119,6 +119,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_LLAVA_PROJ      "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
 
 
@@ -126,12 +127,14 @@ enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
 static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
+    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
 };
 
 
@@ -475,6 +478,14 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_block_2_block_2_0_w;
     struct ggml_tensor * mm_model_block_2_block_2_1_w;
     struct ggml_tensor * mm_model_block_2_block_2_1_b;
+
+    // MobileVLM_V2 projection
+    struct ggml_tensor * mm_model_mlp_0_w;
+    struct ggml_tensor * mm_model_mlp_0_b;
+    struct ggml_tensor * mm_model_mlp_2_w;
+    struct ggml_tensor * mm_model_mlp_2_b;
+    struct ggml_tensor * mm_model_peg_0_w;
+    struct ggml_tensor * mm_model_peg_0_b;
 };
 
 struct clip_ctx {
@@ -807,6 +818,29 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             }
             embeddings = block_1;
         }
+        else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+        {
+            int n_patch = 24;
+            struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+            mlp_0 = ggml_gelu(ctx0, mlp_0);
+            struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+            // mlp_2 ne = [2048, 576, 1, 1]
+            // average pool layer, kernel 2x2, stride 2
+            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+            // mlp_2 ne = [576, 2048, 1, 1]
+            mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+            // mlp_2 ne = [24, 24, 2048, 1]
+            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+            // peg weight ne = [3, 3, 2048, 1]
+            struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+            embeddings = peg_0;
+        }
         else {
             GGML_ASSERT(false);
         }
@@ -1177,7 +1211,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.mm_model_block_2_block_2_0_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
             vision_model.mm_model_block_2_block_2_1_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
             vision_model.mm_model_block_2_block_2_1_b   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-        } else {
+        }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
+        {
+            // MobileVLM_V2 projection
+            vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
+            vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
+            vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
+            vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
+            vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
+            vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
+        }
+        else {
             std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
             throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
         }
@@ -1966,6 +2011,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+        return ctx->vision_model.mm_model_peg_0_b->ne[0];
+    }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
     }
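The new `PROJECTOR_TYPE_LDPV2` branch above builds the MobileVLM_V2 projector: a two-layer MLP with GELU, a 2x2 average pool that shrinks the 24x24 patch grid to 12x12 (576 -> 144 tokens), and a 3x3 depthwise-conv positional encoding generator (PEG) added back as a residual. For readability, here is a hedged PyTorch sketch of the same computation; the 1024-dim CLIP input, 2048-dim output, and 24x24 grid are assumptions matching CLIP ViT-L/14-336 and MobileVLM-1.7B, not values read from the model file.

```python
# Hedged reference sketch of the LDPv2 projector graph built in clip.cpp above.
# Dimensions are assumptions (CLIP ViT-L/14-336: 576 patches of dim 1024;
# MobileVLM-1.7B LLM embedding: 2048), not a definitive implementation.
import torch
import torch.nn as nn


class LDPv2Projector(nn.Module):
    def __init__(self, clip_dim: int = 1024, llm_dim: int = 2048, n_patch: int = 24):
        super().__init__()
        self.n_patch = n_patch
        # mm.model.mlp.0 / mm.model.mlp.2 in the GGUF naming above
        self.mlp = nn.Sequential(
            nn.Linear(clip_dim, llm_dim),
            nn.GELU(),
            nn.Linear(llm_dim, llm_dim),
        )
        # 2x2 average pool, stride 2: 24x24 patches -> 12x12
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)
        # mm.model.peg.0: 3x3 depthwise conv acting as a positional encoding generator
        self.peg = nn.Conv2d(llm_dim, llm_dim, kernel_size=3, padding=1, groups=llm_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [B, 576, clip_dim] patch embeddings from the vision tower
        x = self.mlp(x)                                                   # [B, 576, llm_dim]
        b, _, c = x.shape
        x = x.transpose(1, 2).reshape(b, c, self.n_patch, self.n_patch)   # [B, C, 24, 24]
        x = self.pool(x)                                                  # [B, C, 12, 12]
        x = x + self.peg(x)                                               # residual PEG
        return x.flatten(2).transpose(1, 2)                               # [B, 144, llm_dim]


# Quick shape check:
# LDPv2Projector()(torch.randn(1, 576, 1024)).shape -> torch.Size([1, 144, 2048])
```

This also explains the new `clip_n_mmproj_embd` case: the per-channel PEG bias has one element per output embedding dimension, so its `ne[0]` is the projector's output width.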
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -1,6 +1,7 @@
 import argparse
 import os
 import json
+import re
 
 import torch
 import numpy as np
@@ -38,9 +39,11 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
 def get_tensor_name(name: str) -> str:
     if "projection" in name:
         return name
-
     if "mm_projector" in name:
-        return name.replace("model.mm_projector", "mm")
+        name = name.replace("model.mm_projector", "mm")
+        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+        return name
 
     return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
 
@@ -83,7 +86,7 @@ ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
 ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                 help="The clip model is from openclip (for ViT-SO400M type))")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
-ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
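The two `re.sub` calls above rewrite MobileVLM_V2 projector names so they match the `TN_MVLM_PROJ_MLP` and `TN_MVLM_PROJ_PEG` patterns clip.cpp expects. Here is a small standalone sketch of that mapping; the example checkpoint keys are assumptions inferred from the regexes, not names verified against an actual MobileVLM_V2 checkpoint.

```python
# Hedged sketch of the mm_projector name mapping added to get_tensor_name().
# The input key names below are assumed, derived from the two regexes.
import re


def map_projector_name(name: str) -> str:
    # Same rewrite as the patched get_tensor_name() for mm_projector tensors.
    name = name.replace("model.mm_projector", "mm")
    name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
    name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
    return name


assert map_projector_name("model.mm_projector.mlp.mlp.0.weight") == "mm.model.mlp.0.weight"  # TN_MVLM_PROJ_MLP
assert map_projector_name("model.mm_projector.peg.peg.0.bias") == "mm.model.peg.0.bias"      # TN_MVLM_PROJ_PEG
assert map_projector_name("model.mm_projector.0.weight") == "mm.0.weight"                    # plain LLaVA MLP (TN_LLAVA_PROJ)
```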
Author: Ziang Wu