mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-05 09:36:52 +00:00
model : add LightOnOCR-1B model (#16764)
* model : add LightOnOCR-1B model * add test
This commit is contained in:
@@ -621,7 +621,7 @@ struct clip_graph {
|
||||
}
|
||||
|
||||
// arrangement of the [IMG_BREAK] token
|
||||
{
|
||||
if (model.token_embd_img_break) {
|
||||
// not efficient, but works
|
||||
// the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
|
||||
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
|
||||
@@ -2095,6 +2095,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
res = graph.build_siglip();
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
res = graph.build_pixtral();
|
||||
} break;
|
||||
@@ -2380,6 +2381,7 @@ struct clip_model_loader {
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
hparams.rope_theta = 10000.0f;
|
||||
hparams.warmup_image_size = hparams.patch_size * 8;
|
||||
@@ -2722,6 +2724,15 @@ struct clip_model_loader {
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
||||
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
|
||||
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
||||
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
{
|
||||
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
||||
@@ -3622,7 +3633,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
return true;
|
||||
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL) {
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR
|
||||
) {
|
||||
clip_image_u8 resized_image;
|
||||
auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
|
||||
image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
|
||||
@@ -3865,12 +3878,17 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
// dynamic size
|
||||
int n_merge = params.spatial_merge_size;
|
||||
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
|
||||
if (ctx->model.token_embd_img_break) {
|
||||
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
|
||||
} else {
|
||||
n_patches = n_patches_y * n_patches_x;
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
@@ -4247,6 +4265,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
// set the 2D positions
|
||||
int n_patches_per_col = image_size_width / patch_size;
|
||||
@@ -4377,6 +4396,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
return ctx->model.mm_model_peg_0_b->ne[0];
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
return ctx->model.mm_3_b->ne[0];
|
||||
|
||||
Reference in New Issue
Block a user