mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-28 08:31:25 +00:00 
			
		
		
		
	model : Granite docling + Idefics3 preprocessing (SmolVLM) (#16206)
* feat: Add granite-docling conversion using trillion pretokenizer Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add granite-docling vocab pre enum Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Use granite-docling pre Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add clip_is_idefics3 Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Allow multi-token boundary sequences for image templating Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add tiling support for idefics3 in clip.cpp This should likely be moved into llava_uhd::get_slice_instructions, but for now this avoids disrupting the logic there. Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Partial support for full templating for idefics3 in mtmd There are still errors encoding some of the image chunks, but the token sequence now matches transformers _almost_ perfectly, except for the double newline before the global image which shows up as two consecutive newline tokens instead of a single double-newline token. I think this is happening because the blocks are tokenized separately then concatenated. Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Fully working image preprocessing for idefics3 w/ resize and slicing Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Parse the preprocessor config's longest side and add it to the mmproj hparams Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Use the longest side instead of size * scale_factor For Granite Docling, these come out to the same value, but that was just a coincidence. 
Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Allow batch encoding and remove clip_is_idefics3 Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * refactor: Remove unnecessary conditionals for empty token vectors Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * refactor: Use image_manipulation util Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * add test model --------- Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
		| @@ -76,7 +76,7 @@ enum mtmd_slice_tmpl { | ||||
|     MTMD_SLICE_TMPL_MINICPMV_2_5, | ||||
|     MTMD_SLICE_TMPL_MINICPMV_2_6, | ||||
|     MTMD_SLICE_TMPL_LLAMA4, | ||||
|     // TODO @ngxson : add support for idefics (SmolVLM) | ||||
|     MTMD_SLICE_TMPL_IDEFICS3, | ||||
| }; | ||||
|  | ||||
| const char * mtmd_default_marker() { | ||||
| @@ -114,19 +114,22 @@ struct mtmd_context { | ||||
|     // for llava-uhd style models, we need special tokens in-between slices | ||||
|     // minicpmv calls them "slices", llama 4 calls them "tiles" | ||||
|     mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE; | ||||
|     llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image | ||||
|     llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image | ||||
|     llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices | ||||
|     llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices | ||||
|     llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start | ||||
|     llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice end | ||||
|     llama_token tok_sli_img_mid   = LLAMA_TOKEN_NULL; // between 2 slices | ||||
|     llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row | ||||
|     std::vector<llama_token> tok_ov_img_start;  // overview image | ||||
|     std::vector<llama_token> tok_ov_img_end;    // overview image | ||||
|     std::vector<llama_token> tok_slices_start;  // start of all slices | ||||
|     std::vector<llama_token> tok_slices_end;    // end of all slices | ||||
|     std::vector<llama_token> tok_sli_img_start; // single slice start | ||||
|     std::vector<llama_token> tok_sli_img_end;   // single slice end | ||||
|     std::vector<llama_token> tok_sli_img_mid;   // between 2 slices | ||||
|     std::vector<llama_token> tok_row_end;       // end of row | ||||
|     bool        tok_row_end_trail = false; | ||||
|     bool        ov_img_first      = false; | ||||
|  | ||||
|     bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE | ||||
|  | ||||
|     // string template for slice image delimiters with row/col (idefics3) | ||||
|     std::string sli_img_start_tmpl; | ||||
|  | ||||
|     // for whisper, we pre-calculate the mel filter bank | ||||
|     whisper_preprocessor::whisper_filters w_filters; | ||||
|  | ||||
| @@ -197,13 +200,13 @@ struct mtmd_context { | ||||
|             // minicpmv 2.5 format: | ||||
|             // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice> | ||||
|             slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5; | ||||
|             tok_ov_img_start  = lookup_token("<image>"); | ||||
|             tok_ov_img_end    = lookup_token("</image>"); | ||||
|             tok_slices_start  = lookup_token("<slice>"); | ||||
|             tok_slices_end    = lookup_token("</slice>"); | ||||
|             tok_ov_img_start  = {lookup_token("<image>")}; | ||||
|             tok_ov_img_end    = {lookup_token("</image>")}; | ||||
|             tok_slices_start  = {lookup_token("<slice>")}; | ||||
|             tok_slices_end    = {lookup_token("</slice>")}; | ||||
|             tok_sli_img_start = tok_ov_img_start; | ||||
|             tok_sli_img_end   = tok_ov_img_end; | ||||
|             tok_row_end       = lookup_token("\n"); | ||||
|             tok_row_end       = {lookup_token("\n")}; | ||||
|             tok_row_end_trail = false; // no trailing end-of-row token | ||||
|             ov_img_first      = true; | ||||
|  | ||||
| @@ -211,11 +214,11 @@ struct mtmd_context { | ||||
|             // minicpmv 2.6 format: | ||||
|             // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ... | ||||
|             slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6; | ||||
|             tok_ov_img_start  = lookup_token("<image>"); | ||||
|             tok_ov_img_end    = lookup_token("</image>"); | ||||
|             tok_sli_img_start = lookup_token("<slice>"); | ||||
|             tok_sli_img_end   = lookup_token("</slice>"); | ||||
|             tok_row_end       = lookup_token("\n"); | ||||
|             tok_ov_img_start  = {lookup_token("<image>")}; | ||||
|             tok_ov_img_end    = {lookup_token("</image>")}; | ||||
|             tok_sli_img_start = {lookup_token("<slice>")}; | ||||
|             tok_sli_img_end   = {lookup_token("</slice>")}; | ||||
|             tok_row_end       = {lookup_token("\n")}; | ||||
|             tok_row_end_trail = false; // no trailing end-of-row token | ||||
|             ov_img_first      = true; | ||||
|  | ||||
| @@ -230,9 +233,9 @@ struct mtmd_context { | ||||
|             // <|image|> (overview)           <-- overview image is last | ||||
|             // <|image_end|> | ||||
|             slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4; | ||||
|             tok_ov_img_start  = lookup_token("<|image|>"); | ||||
|             tok_sli_img_mid   = lookup_token("<|tile_x_separator|>"); | ||||
|             tok_row_end       = lookup_token("<|tile_y_separator|>"); | ||||
|             tok_ov_img_start  = {lookup_token("<|image|>")}; | ||||
|             tok_sli_img_mid   = {lookup_token("<|tile_x_separator|>")}; | ||||
|             tok_row_end       = {lookup_token("<|tile_y_separator|>")}; | ||||
|             tok_row_end_trail = true; // add trailing end-of-row token | ||||
|             ov_img_first      = false; // overview image is last | ||||
|         } | ||||
| @@ -245,8 +248,12 @@ struct mtmd_context { | ||||
|  | ||||
|         } else if (proj == PROJECTOR_TYPE_IDEFICS3) { | ||||
|             // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215 | ||||
|             img_beg = "<fake_token_around_image><global-img>"; | ||||
|             img_end = "<fake_token_around_image>"; | ||||
|             slice_tmpl         = MTMD_SLICE_TMPL_IDEFICS3; | ||||
|             tok_ov_img_start   = {lookup_token("\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")}; | ||||
|             tok_ov_img_end     = {lookup_token("<fake_token_around_image>")}; | ||||
|             tok_row_end        = {lookup_token("\n")}; | ||||
|             img_beg            = "<fake_token_around_image>"; | ||||
|             sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>"; | ||||
|  | ||||
|         } else if (proj == PROJECTOR_TYPE_PIXTRAL) { | ||||
|             // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md | ||||
| @@ -504,6 +511,7 @@ struct mtmd_tokenizer { | ||||
|                 ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 | ||||
|                 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6 | ||||
|                 || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4 | ||||
|                 || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3 | ||||
|             ) { | ||||
|                 const int n_col = batch_f32.grid_x; | ||||
|                 const int n_row = batch_f32.grid_y; | ||||
| @@ -517,53 +525,45 @@ struct mtmd_tokenizer { | ||||
|  | ||||
|                 // add overview image (first) | ||||
|                 if (ctx->ov_img_first) { | ||||
|                     if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) { | ||||
|                         add_text({ctx->tok_ov_img_start}); | ||||
|                     } | ||||
|                     add_text(ctx->tok_ov_img_start); | ||||
|                     cur.entries.emplace_back(std::move(ov_chunk)); | ||||
|                     if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) { | ||||
|                         add_text({ctx->tok_ov_img_end}); | ||||
|                     } | ||||
|                     add_text(ctx->tok_ov_img_end); | ||||
|                 } | ||||
|  | ||||
|                 // add slices (or tiles) | ||||
|                 if (!chunks.empty()) { | ||||
|                     GGML_ASSERT((int)chunks.size() == n_row * n_col); | ||||
|                     if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) { | ||||
|                         add_text({ctx->tok_slices_start}); | ||||
|                     } | ||||
|                     add_text(ctx->tok_slices_start); | ||||
|                     for (int y = 0; y < n_row; y++) { | ||||
|                         for (int x = 0; x < n_col; x++) { | ||||
|                             const bool is_last_in_row = (x == n_col - 1); | ||||
|                             if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) { | ||||
|                                 add_text({ctx->tok_sli_img_start}); | ||||
|                             if (!ctx->tok_sli_img_start.empty()) { | ||||
|                                 add_text(ctx->tok_sli_img_start); | ||||
|                             } else if (!ctx->sli_img_start_tmpl.empty()) { | ||||
|                                 // If using a template to precede a slice image | ||||
|                                 const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1; | ||||
|                                 std::unique_ptr<char[]> buf(new char[sz]); | ||||
|                                 std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1); | ||||
|                                 add_text(std::string(buf.get(), buf.get() + sz - 1), true); | ||||
|                             } | ||||
|                             cur.entries.emplace_back(std::move(chunks[y * n_col + x])); | ||||
|                             if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) { | ||||
|                                 add_text({ctx->tok_sli_img_end}); | ||||
|                             } | ||||
|                             if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) { | ||||
|                                 add_text({ctx->tok_sli_img_mid}); | ||||
|                             add_text(ctx->tok_sli_img_end); | ||||
|                             if (!is_last_in_row) { | ||||
|                                 add_text(ctx->tok_sli_img_mid); | ||||
|                             } | ||||
|                         } | ||||
|                         if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) { | ||||
|                             add_text({ctx->tok_row_end}); | ||||
|                         if ((y != n_row - 1 || ctx->tok_row_end_trail)) { | ||||
|                             add_text(ctx->tok_row_end); | ||||
|                         } | ||||
|                     } | ||||
|                     if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) { | ||||
|                         add_text({ctx->tok_slices_end}); | ||||
|                     } | ||||
|                     add_text(ctx->tok_slices_end); | ||||
|                 } | ||||
|  | ||||
|                 // add overview image (last) | ||||
|                 if (!ctx->ov_img_first) { | ||||
|                     if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) { | ||||
|                         add_text({ctx->tok_ov_img_start}); | ||||
|                     } | ||||
|                     add_text(ctx->tok_ov_img_start); | ||||
|                     cur.entries.emplace_back(std::move(ov_chunk)); | ||||
|                     if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) { | ||||
|                         add_text({ctx->tok_ov_img_end}); | ||||
|                     } | ||||
|                     add_text(ctx->tok_ov_img_end); | ||||
|                 } | ||||
|  | ||||
|             } else { | ||||
| @@ -780,7 +780,9 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) | ||||
|     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); | ||||
|     bool ok = false; | ||||
|  | ||||
|     if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) { | ||||
|     if (clip_is_llava(ctx_clip) | ||||
|         || clip_is_minicpmv(ctx_clip) | ||||
|         || clip_is_glm(ctx_clip)) { | ||||
|         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() | ||||
|         const auto & entries = image_tokens->batch_f32.entries; | ||||
|         for (size_t i = 0; i < entries.size(); i++) { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Gabe Goodhart
					Gabe Goodhart