	mtmd : add qwen2vl and qwen2.5vl (#13141)
* llava : add clip_n_output_tokens, deprecate clip_n_patches
* mtmd : add qwen2vl and qwen2.5vl
* decode_embd_batch::set_position_...
* working version
* deprecate llama-qwen2vl-cli
* correct order W, H of clip_embd_nbytes_by_img
* edit existing line in hot topics
Author: Xuan-Son Nguyen
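Note on the position handling introduced here: with M-RoPE (Qwen2-VL and Qwen2.5-VL), each image embedding carries four position values (temporal, height, width, plus one unused dimension) instead of a single linear position, and a whole image advances the temporal position by only one step. The standalone sketch below is not part of the commit and uses made-up values; it only mirrors the layout that `decode_embd_batch::set_position_mrope` writes in the diff further down.

    #include <cstdio>
    #include <vector>

    // Illustrative sketch (assumed values) of the M-RoPE position layout for an
    // image with an nx * ny grid of output tokens and n_pos_per_embd == 4:
    // stream 0 = temporal (constant for the whole image), stream 1 = row,
    // stream 2 = column, stream 3 = unused.
    int main() {
        const int nx = 3, ny = 2;              // hypothetical token grid of one image
        const int n_tokens = nx * ny;
        const int pos_0 = 10;                  // position where the image starts
        std::vector<int> pos(n_tokens * 4);

        for (int y = 0; y < ny; y++) {
            for (int x = 0; x < nx; x++) {
                const int i = y * nx + x;
                pos[i               ] = pos_0;      // temporal position
                pos[i + n_tokens    ] = pos_0 + y;  // height position
                pos[i + n_tokens * 2] = pos_0 + x;  // width position
                pos[i + n_tokens * 3] = 0;          // last dimension is unused
            }
        }

        // after decoding the image, n_past advances by only 1 temporal step,
        // which is what mtmd_image_tokens_get_n_pos() returns for M-RoPE
        printf("image uses %d embeddings but only 1 position\n", n_tokens);
        return 0;
    }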
@@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ## Hot topics
 
 - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
-- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
@@ -64,13 +64,7 @@ endif()
 add_executable(llama-llava-cli    deprecation-warning.cpp)
 add_executable(llama-gemma3-cli   deprecation-warning.cpp)
 add_executable(llama-minicpmv-cli deprecation-warning.cpp)
-
-set(TARGET llama-qwen2vl-cli)
-add_executable(${TARGET} qwen2vl-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_executable(llama-qwen2vl-cli  deprecation-warning.cpp)
 
 set(TARGET llama-mtmd-cli)
 add_executable(${TARGET} mtmd-cli.cpp)
@@ -2825,15 +2825,18 @@ void clip_free(clip_ctx * ctx) {
     delete ctx;
 }
 
+// deprecated
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    const int32_t nx = ctx->vision_model.hparams.image_size;
+    const int32_t ny = ctx->vision_model.hparams.image_size;
+    return clip_embd_nbytes_by_img(ctx, nx, ny);
 }
 
-size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
     clip_image_f32 img;
     img.nx = img_w;
     img.ny = img_h;
-    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 int32_t clip_get_image_size(const struct clip_ctx * ctx) {
@@ -2863,14 +2866,37 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }
 
+// deprecated
 int clip_n_patches(const struct clip_ctx * ctx) {
     clip_image_f32 img;
     img.nx = ctx->vision_model.hparams.image_size;
     img.ny = ctx->vision_model.hparams.image_size;
-    return clip_n_patches_by_img(ctx, &img);
+    return clip_n_output_tokens(ctx, &img);
 }
 
+// deprecated
 int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    return clip_n_output_tokens(ctx, img);
+}
+
+int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    const auto & params = ctx->vision_model.hparams;
+    const int n_total = clip_n_output_tokens(ctx, img);
+    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
+    }
+    return n_total;
+}
+
+int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    const auto & params = ctx->vision_model.hparams;
+    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
+    }
+    return 1;
+}
+
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
 
 CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
@@ -59,8 +59,19 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
+    "use clip_n_output_tokens instead");
+GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
+    "use clip_n_output_tokens instead");
+
+CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// for M-RoPE, this will be the number of token positions in X and Y directions
+// for other models, X will be the total number of tokens and Y will be 1
+CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// this should be equal to the embedding dimension of the text model
 CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
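For a rough sense of what `clip_n_output_tokens_x` and `clip_n_output_tokens_y` return for the Qwen2-VL family, here is a small standalone sketch; it is not code from the commit, and the patch size and image dimensions are assumed values chosen only to make the arithmetic visible.

    #include <cstdio>

    // Per-axis token count for the Qwen2-VL / Qwen2.5-VL projectors, following
    // the expression used in clip_n_output_tokens_x / clip_n_output_tokens_y:
    // the 2x2 patch merge halves each axis of the patch grid.
    static int tokens_per_axis(int n_pixels, int patch_size) {
        return n_pixels / (patch_size * 2) + (int)(n_pixels % patch_size > 0);
    }

    int main() {
        const int patch_size = 14;            // assumed ViT patch size
        const int nx = 448, ny = 280;         // assumed preprocessed image size
        const int tx = tokens_per_axis(nx, patch_size); // 16
        const int ty = tokens_per_axis(ny, patch_size); // 10
        printf("token grid: %d x %d = %d image embeddings\n", tx, ty, tx * ty);
        return 0;
    }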
@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 }
 
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
     struct {
         struct ggml_context * ctx;
     } model;
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     model.ctx = ggml_init(params);
 
-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
 
     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+            n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
         clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
             LOG_ERR("Unable to encode image\n");
@@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
 
         int n_img_pos_out;
-        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+        clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
+        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
         *n_img_pos = n_img_pos_out;
 
         for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -136,39 +136,6 @@ struct mtmd_cli_context {
     }
 };
 
-struct decode_embd_batch {
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id>   seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t>         logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens       =*/ n_tokens,
-            /*tokens         =*/ nullptr,
-            /*embd           =*/ embd,
-            /*pos            =*/ pos.data(),
-            /*n_seq_id       =*/ n_seq_id.data(),
-            /*seq_id         =*/ seq_ids.data(),
-            /*logits         =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-};
-
 static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
     llama_tokens generated_tokens;
     for (int i = 0; i < n_predict; i++) {
@@ -243,7 +210,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
         return 1;
     }
 
-    ctx.n_past += mtmd_helper_get_n_tokens(chunks);
+    ctx.n_past += mtmd_helper_get_n_pos(chunks);
 
     return 0;
 }
@@ -371,6 +338,7 @@ int main(int argc, char ** argv) {
         }
     }
     if (g_is_interrupted) LOG("\nInterrupted by user\n");
+    LOG("\n\n");
     llama_perf_context_print(ctx.lctx);
     return g_is_interrupted ? 130 : 0;
 }
@@ -40,11 +40,14 @@ struct mtmd_context {
     llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice
     llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
 
+    bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
+
     // TODO @ngxson : add timings
 
     mtmd_context(const char * mmproj_fname,
                    const llama_model * text_model,
                    const mtmd_context_params & ctx_params) :
+        text_model   (text_model),
         print_timings(ctx_params.print_timings),
         n_threads    (ctx_params.n_threads),
         image_marker (ctx_params.image_marker)
@@ -56,9 +59,8 @@ struct mtmd_context {
         if (!ctx_clip) {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
-        this->text_model = text_model;
 
-        GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead");
+        use_mrope = clip_is_qwen2vl(ctx_clip);
 
         int minicpmv_version = clip_is_minicpmv(ctx_clip);
         if (minicpmv_version == 2) {
@@ -126,6 +128,7 @@ struct mtmd_image_tokens_data {
 struct mtmd_image_tokens {
     uint32_t nx; // number of tokens in x direction
     uint32_t ny; // number of tokens in y direction
+    bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
     uint32_t n_tokens() const { return nx * ny; }
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
@@ -202,6 +205,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
     }
 
+    else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        // <|vision_start|> ... (image embeddings) ... <|vision_end|>
+        marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+
+    }
+
     // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
 
     std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
@@ -226,7 +236,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
 
         for (auto & entry : batch_f32.entries) {
             mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-            image_tokens->nx = clip_n_patches_by_img(ctx->ctx_clip, entry.get());
+            image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
             image_tokens->ny = 1;
             image_tokens->batch_f32.entries.push_back(std::move(entry));
             image_tokens->id = id;
@@ -322,12 +332,20 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             } else {
                 size_t n_tokens = 0;
                 for (const auto & entry : batch_f32.entries) {
-                    n_tokens += clip_n_patches_by_img(ctx->ctx_clip, entry.get());
+                    n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
                 }
 
                 mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-                image_tokens->nx = n_tokens;
-                image_tokens->ny = 1; // TODO
+                if (ctx->use_mrope) {
+                    // for Qwen2VL, we need this information for M-RoPE decoding positions
+                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
+                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
+                    image_tokens->use_mrope_pos = true;
+                } else {
+                    // other models, we only need the total number of tokens
+                    image_tokens->nx = n_tokens;
+                    image_tokens->ny = 1;
+                }
                 image_tokens->batch_f32 = std::move(batch_f32);
                 image_tokens->id = bitmaps[i_img].id; // optional
 
@@ -372,6 +390,13 @@ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
     return image_tokens->id;
 }
 
+llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
+    if (image_tokens->use_mrope_pos) {
+        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
+    }
+    return image_tokens->n_tokens();
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -389,7 +414,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
         for (size_t i = 0; i < entries.size(); i++) {
-            int n_tokens_per_image = clip_n_patches_by_img(ctx->ctx_clip, entries[i].get());
+            int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
             ok = clip_image_encode(
                 ctx->ctx_clip,
                 ctx->n_threads,
@@ -417,7 +442,7 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            n_tokens += chunk.tokens_text.size();
         } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            n_tokens += chunk.tokens_image->n_tokens();
+            n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
         } else {
            GGML_ASSERT(false && "chunk type not supported");
        }
@@ -425,22 +450,38 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
     return n_tokens;
 }
 
+llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) {
+    llama_pos n_pos = 0;
+    for (auto & chunk : chunks) {
+        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            n_pos += chunk.tokens_text.size();
+        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
+        } else {
+            GGML_ASSERT(false && "chunk type not supported");
+        }
+    }
+    return n_pos;
+}
+
 // helper struct to make working with embd batch easier
 // note: this will be removed after llama_batch_ext refactoring
 struct decode_embd_batch {
+    int n_pos_per_embd;
+    int n_mmproj_embd;
     std::vector<llama_pos>      pos;
+    std::vector<llama_pos>      pos_view; // used by mrope
     std::vector<int32_t>        n_seq_id;
     std::vector<llama_seq_id>   seq_id_0;
    std::vector<llama_seq_id *> seq_ids;
     std::vector<int8_t>         logits;
     llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
+    decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+        pos     .resize(n_tokens * n_pos_per_embd);
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
         logits  .resize(n_tokens);
         seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
         seq_ids [n_tokens] = nullptr;
         batch = {
             /*n_tokens       =*/ n_tokens,
@@ -451,13 +492,64 @@ struct decode_embd_batch {
             /*seq_id         =*/ seq_ids.data(),
             /*logits         =*/ logits.data(),
         };
-        for (int i = 0; i < n_tokens; i++) {
+    }
+
+    void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
+        seq_id_0[0] = seq_id;
+        for (int i = 0; i < batch.n_tokens; i++) {
             batch.pos     [i] = pos_0 + i;
             batch.n_seq_id[i] = 1;
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = false;
         }
     }
+
+    void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
+        GGML_ASSERT(n_pos_per_embd == 4);
+        seq_id_0[0] = seq_id;
+        for (int y = 0; y < ny; y++) {
+            for (int x = 0; x < nx; x++) {
+                int i = y * nx + x;
+                pos[i                     ] = pos_0;
+                pos[i + batch.n_tokens    ] = pos_0 + y;
+                pos[i + batch.n_tokens * 2] = pos_0 + x;
+                pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+            }
+        }
+        for (int i = 0; i < batch.n_tokens; i++) {
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+
+    llama_batch get_view(int offset, int n_tokens) {
+        llama_pos * pos_ptr;
+        pos_view.clear();
+        pos_view.reserve(n_tokens * n_pos_per_embd);
+        if (n_pos_per_embd > 1) {
+            // mrope
+            // for example, with layout of src: 1234...1234...1234...1234...
+            //       offset 2 will give us dst: 34...34...34...34...
+            for (int i = 0; i < n_pos_per_embd; i++) {
+                auto src = pos.begin() + i * batch.n_tokens + offset;
+                pos_view.insert(pos_view.end(), src, src + n_tokens);
+            }
+            pos_ptr = pos_view.data();
+        } else {
+            // normal
+            pos_ptr = pos.data() + offset;
+        }
+        return {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ batch.embd     + offset * n_mmproj_embd,
+            /*pos            =*/ pos_ptr,
+            /*n_seq_id       =*/ batch.n_seq_id + offset,
+            /*seq_id         =*/ batch.seq_id   + offset,
+            /*logits         =*/ batch.logits   + offset,
+        };
+    }
 };
 
 int32_t mtmd_helper_eval(mtmd_context * ctx,
@@ -470,6 +562,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
     llama_pos n_past = pos0;
     llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+    int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
 
     for (auto & chunk : chunks) {
         bool is_last = &chunk == &chunks.back();
@@ -517,6 +610,16 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
             int32_t i_batch = 0;
             int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
             float * embd = mtmd_get_output_embd(ctx);
+            decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+
+            const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
+            const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
+
+            if (mtmd_decode_use_mrope(ctx)) {
+                batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
+            } else {
+                batch_embd.set_position_normal(n_past, seq_id);
+            }
 
             if (mtmd_decode_use_non_causal(ctx)) {
                 llama_set_causal_attn(lctx, false);
@@ -524,15 +627,14 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
             }
 
             while (i_batch < n_img_batches) { // split into batches
-                int32_t pos_offset = i_batch*n_batch;
-                int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
-                float * embd_batch = embd + pos_offset*n_mmproj_embd;
-                decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0);
+                int pos_offset = i_batch*n_batch;
+                int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
+                llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
 
-                printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
+                LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
 
                 int64_t t1 = ggml_time_ms();
-                ret = llama_decode(lctx, batch_img.batch);
+                ret = llama_decode(lctx, batch_embd_view);
                 if (ret != 0) {
                     LOG_ERR("failed to decode image\n");
                     llama_set_causal_attn(lctx, true); // restore causal attn
@@ -545,9 +647,11 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
                 }
 
                 i_batch++;
-                n_past += n_tokens_batch;
             }
 
+            // for mrope, one image is one single **temporal** position
+            n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens;
+
             if (mtmd_decode_use_non_causal(ctx)) {
                 llama_set_causal_attn(lctx, true);
             }
@@ -595,6 +699,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
     return false;
 }
 
+bool mtmd_decode_use_mrope(mtmd_context * ctx) {
+    return ctx->use_mrope;
+}
+
 void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
     mtmd_image_tokens_free(val);
 }
@@ -102,6 +102,7 @@ MTMD_API size_t      mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * im
 MTMD_API size_t      mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
 MTMD_API size_t      mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
 MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
+MTMD_API llama_pos   mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
 MTMD_API void        mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
 
 // returns 0 on success
@@ -114,15 +115,21 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 // whether we need to set non-causal mask before llama_decode
 MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
 
+// whether the current model use M-RoPE for llama_decode
+MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+
 
 
 //
 // helper functions (can be implemented based on other functions)
 //
 
-// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
 MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
 
+// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);
+
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
 // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
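To make the difference between the two counters declared above concrete, here is a standalone sketch; it is not from the commit and the chunk sizes are made up. `mtmd_helper_get_n_tokens` still reflects KV cache usage, while the new `mtmd_helper_get_n_pos` is what `n_past` should advance by, since an M-RoPE image fills many KV cells but occupies a single temporal position.

    #include <cstdio>

    // Assumed prompt: one 12-token text chunk plus one Qwen2-VL image whose
    // output tokens form a 16 x 10 grid. The two numbers below correspond to
    // what mtmd_helper_get_n_tokens and mtmd_helper_get_n_pos would report.
    int main() {
        const int  n_text    = 12;
        const int  img_nx    = 16, img_ny = 10;
        const bool use_mrope = true;            // Qwen2-VL / Qwen2.5-VL

        const int n_tokens = n_text + img_nx * img_ny;                   // KV cells used
        const int n_pos    = n_text + (use_mrope ? 1 : img_nx * img_ny); // n_past advance

        printf("KV cells: %d, n_past advance: %d\n", n_tokens, n_pos);
        return 0;
    }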
@@ -27,6 +27,8 @@
 #include <cassert>
 #include <cmath>
 
+// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
+// IT IS NOT A PRODUCTION CODE
 
 static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
                                      int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
@@ -54,8 +54,8 @@ add_test "llama-mtmd-cli"  "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli"  "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K"  # model from openbmb is corrupted
 add_test "llama-mtmd-cli"  "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
 add_test "llama-mtmd-cli"  "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
-add_test "llama-qwen2vl-cli"  "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
-add_test "llama-qwen2vl-cli"  "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
+add_test "llama-mtmd-cli"  "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
+add_test "llama-mtmd-cli"  "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
 
 # to test the big models, run: ./tests.sh big
 add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"