model : Granite docling + Idefics3 preprocessing (SmolVLM) (#16206)

* feat: Add granite-docling conversion using trillion pretokenizer

Branch: gabe-l-hart/GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add granite-docling vocab pre enum

Branch: gabe-l-hart/GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Use granite-docling pre

Branch: gabe-l-hart/GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add clip_is_idefics3

Branch: gabe-l-hart/GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Allow multi-token boundary sequences for image templating

Branch: gabe-l-hart/GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add tiling support for idefics3 in clip.cpp

This should likely be moved into llava_uhd::get_slice_instructions, but for
now this avoids disrupting the logic there.

Branch: gabe-l-hart/GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Partial support for full templating for idefics3 in mtmd

There are still errors encoding some of the image chunks, but the token
sequence now matches transformers _almost_ perfectly, except for the double
newline before the global image which shows up as two consecutive newline
tokens instead of a single double-newline token. I think this is happening
because the blocks are tokenized separately then concatenated.

Branch: gabe-l-hart/GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Fully working image preprocessing for idefics3 w/ resize and slicing

Branch: gabe-l-hart/GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Parse the preprocessor config's longest side and add it to the mmproj hparams

Branch: GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Use the longest side instead of size * scale_factor

For Granite Docling, these come out to the same value, but that was just a
coincidence.

Branch: GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Allow batch encoding and remove clip_is_idefics3

Branch: GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* refactor: Remove unnecessary conditionals for empty token vectors

Branch: GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* refactor: Use image_manipulation util

Branch: GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* add test model

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
Gabe Goodhart
2025-10-05 06:57:47 -06:00
committed by GitHub
parent 35266573b9
commit ca71fb9b36
10 changed files with 165 additions and 97 deletions

View File

@@ -76,7 +76,7 @@ enum mtmd_slice_tmpl {
MTMD_SLICE_TMPL_MINICPMV_2_5,
MTMD_SLICE_TMPL_MINICPMV_2_6,
MTMD_SLICE_TMPL_LLAMA4,
// TODO @ngxson : add support for idefics (SmolVLM)
MTMD_SLICE_TMPL_IDEFICS3,
};
const char * mtmd_default_marker() {
@@ -114,19 +114,22 @@ struct mtmd_context {
// for llava-uhd style models, we need special tokens in-between slices
// minicpmv calls them "slices", llama 4 calls them "tiles"
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice end
llama_token tok_sli_img_mid = LLAMA_TOKEN_NULL; // between 2 slices
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
std::vector<llama_token> tok_ov_img_start; // overview image
std::vector<llama_token> tok_ov_img_end; // overview image
std::vector<llama_token> tok_slices_start; // start of all slices
std::vector<llama_token> tok_slices_end; // end of all slices
std::vector<llama_token> tok_sli_img_start; // single slice start
std::vector<llama_token> tok_sli_img_end; // single slice end
std::vector<llama_token> tok_sli_img_mid; // between 2 slices
std::vector<llama_token> tok_row_end; // end of row
bool tok_row_end_trail = false;
bool ov_img_first = false;
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
// string template for slice image delimiters with row/col (idefics3)
std::string sli_img_start_tmpl;
// for whisper, we pre-calculate the mel filter bank
whisper_preprocessor::whisper_filters w_filters;
@@ -197,13 +200,13 @@ struct mtmd_context {
// minicpmv 2.5 format:
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
tok_ov_img_start = lookup_token("<image>");
tok_ov_img_end = lookup_token("</image>");
tok_slices_start = lookup_token("<slice>");
tok_slices_end = lookup_token("</slice>");
tok_ov_img_start = {lookup_token("<image>")};
tok_ov_img_end = {lookup_token("</image>")};
tok_slices_start = {lookup_token("<slice>")};
tok_slices_end = {lookup_token("</slice>")};
tok_sli_img_start = tok_ov_img_start;
tok_sli_img_end = tok_ov_img_end;
tok_row_end = lookup_token("\n");
tok_row_end = {lookup_token("\n")};
tok_row_end_trail = false; // no trailing end-of-row token
ov_img_first = true;
@@ -211,11 +214,11 @@ struct mtmd_context {
// minicpmv 2.6 format:
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
tok_ov_img_start = lookup_token("<image>");
tok_ov_img_end = lookup_token("</image>");
tok_sli_img_start = lookup_token("<slice>");
tok_sli_img_end = lookup_token("</slice>");
tok_row_end = lookup_token("\n");
tok_ov_img_start = {lookup_token("<image>")};
tok_ov_img_end = {lookup_token("</image>")};
tok_sli_img_start = {lookup_token("<slice>")};
tok_sli_img_end = {lookup_token("</slice>")};
tok_row_end = {lookup_token("\n")};
tok_row_end_trail = false; // no trailing end-of-row token
ov_img_first = true;
@@ -230,9 +233,9 @@ struct mtmd_context {
// <|image|> (overview) <-- overview image is last
// <|image_end|>
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
tok_ov_img_start = lookup_token("<|image|>");
tok_sli_img_mid = lookup_token("<|tile_x_separator|>");
tok_row_end = lookup_token("<|tile_y_separator|>");
tok_ov_img_start = {lookup_token("<|image|>")};
tok_sli_img_mid = {lookup_token("<|tile_x_separator|>")};
tok_row_end = {lookup_token("<|tile_y_separator|>")};
tok_row_end_trail = true; // add trailing end-of-row token
ov_img_first = false; // overview image is last
}
@@ -245,8 +248,12 @@ struct mtmd_context {
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
img_beg = "<fake_token_around_image><global-img>";
img_end = "<fake_token_around_image>";
slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
tok_ov_img_start = {lookup_token("\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
tok_row_end = {lookup_token("\n")};
img_beg = "<fake_token_around_image>";
sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
@@ -504,6 +511,7 @@ struct mtmd_tokenizer {
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
) {
const int n_col = batch_f32.grid_x;
const int n_row = batch_f32.grid_y;
@@ -517,53 +525,45 @@ struct mtmd_tokenizer {
// add overview image (first)
if (ctx->ov_img_first) {
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_start});
}
add_text(ctx->tok_ov_img_start);
cur.entries.emplace_back(std::move(ov_chunk));
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_end});
}
add_text(ctx->tok_ov_img_end);
}
// add slices (or tiles)
if (!chunks.empty()) {
GGML_ASSERT((int)chunks.size() == n_row * n_col);
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_slices_start});
}
add_text(ctx->tok_slices_start);
for (int y = 0; y < n_row; y++) {
for (int x = 0; x < n_col; x++) {
const bool is_last_in_row = (x == n_col - 1);
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_sli_img_start});
if (!ctx->tok_sli_img_start.empty()) {
add_text(ctx->tok_sli_img_start);
} else if (!ctx->sli_img_start_tmpl.empty()) {
// If using a template to precede a slice image
const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
std::unique_ptr<char[]> buf(new char[sz]);
std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
add_text(std::string(buf.get(), buf.get() + sz - 1), true);
}
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_sli_img_end});
}
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_sli_img_mid});
add_text(ctx->tok_sli_img_end);
if (!is_last_in_row) {
add_text(ctx->tok_sli_img_mid);
}
}
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_row_end});
if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
add_text(ctx->tok_row_end);
}
}
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_slices_end});
}
add_text(ctx->tok_slices_end);
}
// add overview image (last)
if (!ctx->ov_img_first) {
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_start});
}
add_text(ctx->tok_ov_img_start);
cur.entries.emplace_back(std::move(ov_chunk));
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_end});
}
add_text(ctx->tok_ov_img_end);
}
} else {
@@ -780,7 +780,9 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
bool ok = false;
if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
if (clip_is_llava(ctx_clip)
|| clip_is_minicpmv(ctx_clip)
|| clip_is_glm(ctx_clip)) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {