From 92bb84f775215cf36e3224708a9a93e2531a0a96 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Wed, 5 Nov 2025 14:26:49 +0100
Subject: [PATCH] mtmd: allow QwenVL to process larger image by default
 (#17020)

---
 tools/mtmd/clip.cpp | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 67a8a470c8..16781fb195 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2791,14 +2791,8 @@ struct clip_model_loader {
                 get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                 get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                 // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
-                // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
-                // but we set a lower value to avoid OOM
-                // TODO: make it configurable by user
-                // TODO (2): bbox coordinates become inaccurate with small number of tokens,
-                //           therefore we need to increase the min_tokens
-                //           see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
-                hparams.set_limit_image_tokens(8, 2048);
-                hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+                hparams.set_limit_image_tokens(8, 4096);
+                hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
                 const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
                 if (hparams.image_min_pixels < warn_min_pixels) {
                     LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
@@ -4814,7 +4808,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN3VL:
             {
-                const int merge_ratio = 2;
+                const int merge_ratio = hparams.n_merge;
                 const int pw = image_size_width / patch_size;
                 const int ph = image_size_height / patch_size;
                 std::vector<int> positions(n_pos * 4);