mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-27 08:21:30 +00:00)
model : support vision LiquidAI LFM2-VL family (#15347)
* wip lfm2 vision model
* Fix conv weight
* Implement dynamic resolution
* Fix cuda
* support LFM2-VL-450M
* happy CI
* Remove extra `ggml_conv` and put others into the right place

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
@@ -82,6 +82,7 @@
#define TN_MVLM_PROJ_PEG    "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE    "model.image_newline"
#define TN_MM_INP_NORM      "mm.input_norm.weight"
#define TN_MM_INP_NORM_B    "mm.input_norm.bias"
#define TN_MM_INP_PROJ      "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N    "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR     "mm.model.fc.weight" // idefics3

@@ -133,6 +134,7 @@ enum projector_type {
    PROJECTOR_TYPE_QWEN2A,
    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
    PROJECTOR_TYPE_VOXTRAL,
    PROJECTOR_TYPE_LFM2,
    PROJECTOR_TYPE_UNKNOWN,
};

@@ -153,6 +155,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_QWEN2A,  "qwen2a"},
    { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
    { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
    { PROJECTOR_TYPE_LFM2,    "lfm2"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {

@@ -265,6 +265,7 @@ struct clip_model {

    // LLaVA projection
    ggml_tensor * mm_input_norm_w = nullptr;
    ggml_tensor * mm_input_norm_b = nullptr;
    ggml_tensor * mm_0_w = nullptr;
    ggml_tensor * mm_0_b = nullptr;
    ggml_tensor * mm_2_w = nullptr;

@@ -488,11 +489,17 @@ struct clip_graph {

    ggml_cgraph * build_siglip() {
        ggml_tensor * inp = build_inp();

        ggml_tensor * learned_pos_embd = model.position_embeddings;
        if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
            learned_pos_embd = resize_position_embeddings();
        }

        ggml_tensor * cur = build_vit(
                inp, n_patches,
                NORM_TYPE_NORMAL,
                hparams.ffn_op,
-               model.position_embeddings,
+               learned_pos_embd,
                nullptr);

        if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {

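Note: LFM2-VL runs at dynamic resolution (see "Implement dynamic resolution" in the commit message), so the patch grid varies per image; the fixed learned position-embedding table therefore has to be interpolated to the current grid, which is what resize_position_embeddings() (added further down) does.
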
@@ -542,6 +549,45 @@
                    bsz);

            cur = ggml_mul_mat(ctx0, model.projection, cur);
        } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
            // pixel unshuffle block
            const int scale_factor = model.hparams.proj_scale_factor;
            GGML_ASSERT(scale_factor > 1);

            const int n_embd = cur->ne[0];
            int width  = img.nx / patch_size;
            int height = img.ny / patch_size;

            // pad width and height to factor
            const int64_t pad_width  = CLIP_ALIGN(width,  scale_factor) - width;
            const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
            cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
            if (pad_width || pad_height) {
                cur     = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
                width  += pad_width;
                height += pad_height;
            }

            // unshuffle h
            cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
            cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));

            // unshuffle w
            cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
            cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));

            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);

            // projection
            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);

            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
            cur = ggml_add(ctx0, cur, model.mm_1_b);
            cur = ggml_gelu(ctx0, cur);
            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
            cur = ggml_add(ctx0, cur, model.mm_2_b);
        } else {
            GGML_ABORT("SigLIP: Unsupported projector type");
        }

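The pixel-unshuffle block trades spatial resolution for channel width: each scale_factor x scale_factor group of neighboring patch embeddings is concatenated into one wider token, cutting the token count by a factor of scale_factor^2. Below is a minimal standalone sketch of that idea in plain C++ (no ggml; the function name and the element order inside each merged token are illustrative, the reshape/permute sequence above reaches an equivalent grouping):

```cpp
#include <cstdio>
#include <vector>

// Pixel unshuffle on a row-major (height, width, n_embd) grid of patch
// embeddings: every s x s spatial block is concatenated into one output
// token of size n_embd * s * s, so (H, W) tokens become (H/s, W/s) tokens.
static std::vector<float> pixel_unshuffle(const std::vector<float> & in,
                                          int height, int width, int n_embd, int s) {
    std::vector<float> out(in.size());
    size_t o = 0;
    for (int by = 0; by < height / s; ++by) {
        for (int bx = 0; bx < width / s; ++bx) {
            // gather the s x s block belonging to output token (by, bx)
            for (int dy = 0; dy < s; ++dy) {
                for (int dx = 0; dx < s; ++dx) {
                    const int y = by * s + dy;
                    const int x = bx * s + dx;
                    for (int e = 0; e < n_embd; ++e) {
                        out[o++] = in[((size_t) y * width + x) * n_embd + e];
                    }
                }
            }
        }
    }
    return out;
}

int main() {
    // 4x4 grid of 2-dim embeddings, scale factor 2 -> 2x2 grid of 8-dim tokens
    const int H = 4, W = 4, E = 2, S = 2;
    std::vector<float> in(H * W * E);
    for (size_t i = 0; i < in.size(); ++i) in[i] = (float) i;
    std::vector<float> out = pixel_unshuffle(in, H, W, E, S);
    printf("tokens: %d -> %d, token dim: %d -> %d\n", H * W, (H / S) * (W / S), E, E * S * S);
}
```
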
@@ -1560,6 +1606,27 @@ private:
        }
    }

    // siglip2 naflex
    ggml_tensor * resize_position_embeddings() {
        ggml_tensor * pos_embd = model.position_embeddings;
        const int height = img.ny / patch_size;
        const int width  = img.nx / patch_size;

        if (!pos_embd || height * width == pos_embd->ne[1]) {
            return pos_embd;
        }

        const int n_pos_embd = std::sqrt(pos_embd->ne[1]);
        pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_pos_embd, n_pos_embd); // -> (n_embd, n_pos_embd, n_pos_embd)
        pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3);                        // -> (n_pos_embd, n_pos_embd, n_embd)
        pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, 1);   // -> (width, height, n_embd)
        pos_embd = ggml_reshape_2d(ctx0, pos_embd, height * width, n_embd);         // -> (height * width, n_embd)
        pos_embd = ggml_transpose(ctx0, pos_embd);                                  // -> (n_embd, height * width)
        pos_embd = ggml_cont(ctx0, pos_embd);

        return pos_embd;
    }

    // build vision transformer (ViT) cgraph
    // this function should cover most of the models
    // if your model has specific features, you should probably duplicate this function

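resize_position_embeddings() unflattens the learned table into its native square grid and stretches it to the current width x height patch grid via ggml_interpolate with mode 1 (bilinear). A self-contained sketch of bilinear resampling over a grid of vectors, assuming half-pixel-center sampling (ggml's exact border convention may differ):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Bilinearly resize an (n, n) grid of d-dim vectors to (h, w).
static std::vector<float> resize_grid(const std::vector<float> & in,
                                      int n, int d, int h, int w) {
    std::vector<float> out((size_t) h * w * d);
    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            // map the output cell center back into source-grid coordinates
            float sy = ((y + 0.5f) * n) / h - 0.5f;
            float sx = ((x + 0.5f) * n) / w - 0.5f;
            int y0 = (int) std::floor(sy), x0 = (int) std::floor(sx);
            float fy = sy - y0, fx = sx - x0;
            auto clampi = [n](int v) { return v < 0 ? 0 : (v >= n ? n - 1 : v); };
            int y1 = clampi(y0 + 1), x1 = clampi(x0 + 1);
            y0 = clampi(y0); x0 = clampi(x0);
            for (int e = 0; e < d; ++e) {
                auto at = [&](int yy, int xx) { return in[((size_t) yy * n + xx) * d + e]; };
                out[((size_t) y * w + x) * d + e] =
                    (1 - fy) * ((1 - fx) * at(y0, x0) + fx * at(y0, x1)) +
                         fy  * ((1 - fx) * at(y1, x0) + fx * at(y1, x1));
            }
        }
    }
    return out;
}

int main() {
    const int n = 2, d = 1;                     // 2x2 grid of scalars -> 4x3 grid
    std::vector<float> in = {0, 1, 2, 3};
    std::vector<float> out = resize_grid(in, n, d, 4, 3);
    printf("resized %dx%d -> %dx%d, first value %.3f\n", n, n, 4, 3, out[0]);
}
```
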
@@ -1966,6 +2033,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    switch (ctx->proj_type()) {
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_LFM2:
            {
                res = graph.build_siglip();
            } break;

@@ -2230,6 +2298,7 @@ struct clip_model_loader {
                }
            } break;
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_INTERNVL:
            {
                get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);

@@ -2533,6 +2602,15 @@ struct clip_model_loader {
            {
                model.projection = get_tensor(TN_MM_PROJECTOR);
            } break;
        case PROJECTOR_TYPE_LFM2:
            {
                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
                model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
                model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
            } break;
        case PROJECTOR_TYPE_PIXTRAL:
            {
                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));

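Note: together with the graph code above, these tensors define the whole LFM2 projector: an input LayerNorm (mm_input_norm_w/mm_input_norm_b) followed by a two-layer MLP with a GELU in between (mm_1_*, mm_2_*), reusing the existing TN_LLAVA_PROJ naming scheme instead of introducing new tensor names.
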
@@ -3428,6 +3506,43 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
        res_imgs->grid_y = inst.grid_size.height;
        return true;

    } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
        GGML_ASSERT(params.proj_scale_factor);

        // smart resize
        const int width  = img->nx;
        const int height = img->ny;
        const int total_factor = params.patch_size * params.proj_scale_factor;
        constexpr int min_image_tokens = 64;
        constexpr int max_image_tokens = 256;
        const float min_pixels = min_image_tokens * total_factor * total_factor;
        const float max_pixels = max_image_tokens * total_factor * total_factor;

        auto round_by_factor = [f = total_factor](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
        auto ceil_by_factor  = [f = total_factor](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
        auto floor_by_factor = [f = total_factor](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };

        int h_bar = std::max(total_factor, round_by_factor(height));
        int w_bar = std::max(total_factor, round_by_factor(width));

        if (h_bar * w_bar > max_pixels) {
            const auto beta = std::sqrt((height * width) / max_pixels);
            h_bar = std::max(total_factor, floor_by_factor(height / beta));
            w_bar = std::max(total_factor, floor_by_factor(width / beta));
        } else if (h_bar * w_bar < min_pixels) {
            const auto beta = std::sqrt(min_pixels / (height * width));
            h_bar = ceil_by_factor(height * beta);
            w_bar = ceil_by_factor(width * beta);
        }

        const std::array<uint8_t, 3> pad_color = {122, 116, 104};

        clip_image_u8 resized_img;
        image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color);
        clip_image_f32_ptr res(clip_image_f32_init());
        normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
        res_imgs->entries.push_back(std::move(res));
        return true;
    }

    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)

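The smart resize picks output dimensions that are multiples of total_factor (so the patch grid divides evenly into unshuffle blocks) while keeping the final token count between min_image_tokens and max_image_tokens. A standalone sketch of just the size computation, with the same constants as above and an arbitrary 1024x768 sample input:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    // assumptions for illustration: 16px patches, 2x2 pixel unshuffle
    const int patch_size = 16, proj_scale_factor = 2;
    const int f = patch_size * proj_scale_factor;   // total_factor = 32
    const float min_pixels = 64.0f  * f * f;        // >= 64 output tokens
    const float max_pixels = 256.0f * f * f;        // <= 256 output tokens

    auto round_by_factor = [f](float x) { return (int) std::nearbyintf(x / f) * f; };
    auto ceil_by_factor  = [f](float x) { return (int) std::ceil (x / f) * f; };
    auto floor_by_factor = [f](float x) { return (int) std::floor(x / f) * f; };

    const int width = 1024, height = 768;           // sample input image
    int h_bar = std::max(f, round_by_factor(height));
    int w_bar = std::max(f, round_by_factor(width));

    if ((float) h_bar * w_bar > max_pixels) {
        // too many tokens: shrink both sides by the same factor, rounding down
        const float beta = std::sqrt((float) height * width / max_pixels);
        h_bar = std::max(f, floor_by_factor(height / beta));
        w_bar = std::max(f, floor_by_factor(width  / beta));
    } else if ((float) h_bar * w_bar < min_pixels) {
        // too few tokens: grow both sides, rounding up
        const float beta = std::sqrt(min_pixels / ((float) height * width));
        h_bar = ceil_by_factor(height * beta);
        w_bar = ceil_by_factor(width  * beta);
    }
    printf("%dx%d -> %dx%d (%d tokens)\n", width, height, w_bar, h_bar,
           (w_bar / f) * (h_bar / f));              // 1024x768 -> 576x416, 234 tokens
}
```
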
@@ -3630,6 +3745,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                n_patches_sq /= 2;
            }
        } break;
        case PROJECTOR_TYPE_LFM2:
            {
                n_patches_sq = (img->nx / (params.patch_size * params.proj_scale_factor)) * (img->ny / (params.patch_size * params.proj_scale_factor));
            } break;
        default:
            GGML_ABORT("unsupported projector type");
    }

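Worked example: with patch_size = 16 and proj_scale_factor = 2, a 576x416 preprocessed image yields (576/32) * (416/32) = 18 * 13 = 234 output tokens, consistent with the 256-token cap enforced by the smart resize.
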
@@ -4034,6 +4153,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
            {
                // do nothing

@@ -4135,6 +4255,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->model.mm_model_proj->ne[1];
        case PROJECTOR_TYPE_QWEN2A:
            return ctx->model.mm_fc_w->ne[1];
        case PROJECTOR_TYPE_LFM2:
            return ctx->model.mm_2_w->ne[1];
        default:
            GGML_ABORT("Unknown projector type");
    }

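Note: clip_n_mmproj_embd() reports the embedding width the projector hands to the language model; for LFM2 that is ne[1] of mm_2_w, the output dimension of the final linear layer (ggml stores matmul weights with the input dimension in ne[0] and the output dimension in ne[1]).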