mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-28 08:31:25 +00:00
model : Granite docling + Idefics3 preprocessing (SmolVLM) (#16206)
* feat: Add granite-docling conversion using trillion pretokenizer Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add granite-docling vocab pre enum Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Use granite-docling pre Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add clip_is_idefics3 Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Allow multi-token boundary sequences for image templating Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add tiling support for idefics3 in clip.cpp This should likely be moved into llava_uhd::get_slice_instructions, but for now this avoids disrupting the logic there. Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Partial support for full templating for idefics3 in mtmd There are still errors encoding some of the image chunks, but the token sequence now matches transformers _almost_ perfectly, except for the double newline before the global image which shows up as two consecutive newline tokens instead of a single double-newline token. I think this is happening because the blocks are tokenized separately then concatenated. Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Fully working image preprocessing for idefics3 w/ resize and slicing Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Parse the preprocessor config's longest side and add it to the mmproj hparams Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Use the longest side instead of size * scale_factor For Granite Docling, these come out to the same value, but that was just a coincidence. 
Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Allow batch encoding and remove clip_is_idefics3 Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * refactor: Remove unnecessary conditionals for empty token vectors Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * refactor: Use image_manipulation util Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * add test model --------- Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
@@ -170,7 +170,9 @@ struct clip_hparams {
|
||||
int32_t projection_dim;
|
||||
int32_t n_head;
|
||||
int32_t n_layer;
|
||||
int32_t proj_scale_factor = 0; // idefics3
|
||||
// idefics3
|
||||
int32_t preproc_image_size = 0;
|
||||
int32_t proj_scale_factor = 0;
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
@@ -2250,6 +2252,7 @@ struct clip_model_loader {
|
||||
|
||||
if (is_vision) {
|
||||
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
|
||||
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
|
||||
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
|
||||
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
|
||||
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
|
||||
@@ -3551,10 +3554,51 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
// res_imgs->data[0] = *res;
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
return true;
|
||||
}
|
||||
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
|
||||
// The refined size has two steps:
|
||||
// 1. Resize w/ aspect-ratio preserving such that the longer side is
|
||||
// the preprocessor longest size
|
||||
// 2. Resize w/out preserving aspect ratio such that both sides are
|
||||
// multiples of image_size (always rounding up)
|
||||
//
|
||||
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
|
||||
const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
|
||||
original_size, params.image_size, params.preproc_image_size);
|
||||
|
||||
llava_uhd::slice_instructions instructions;
|
||||
instructions.overview_size = clip_image_size{params.image_size, params.image_size};
|
||||
instructions.refined_size = refined_size;
|
||||
instructions.grid_size = clip_image_size{
|
||||
static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
|
||||
static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
|
||||
};
|
||||
for (int y = 0; y < refined_size.height; y += params.image_size) {
|
||||
for (int x = 0; x < refined_size.width; x += params.image_size) {
|
||||
instructions.slices.push_back(llava_uhd::slice_coordinates{
|
||||
/* x */x,
|
||||
/* y */y,
|
||||
/* size */clip_image_size{
|
||||
std::min(params.image_size, refined_size.width - x),
|
||||
std::min(params.image_size, refined_size.height - y)
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
auto imgs = llava_uhd::slice_image(img, instructions);
|
||||
|
||||
// cast and normalize to f32
|
||||
for (size_t i = 0; i < imgs.size(); ++i) {
|
||||
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
|
||||
clip_image_f32_ptr res(clip_image_f32_init());
|
||||
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
|
||||
res_imgs->entries.push_back(std::move(res));
|
||||
}
|
||||
|
||||
res_imgs->grid_x = instructions.grid_size.width;
|
||||
res_imgs->grid_y = instructions.grid_size.height;
|
||||
return true;
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
|
||||
) {
|
||||
clip_image_u8 resized_image;
|
||||
|
||||
Reference in New Issue
Block a user