mtmd: improve struct initialization (#16981)

This commit is contained in:
Xuan-Son Nguyen
2025-11-05 11:26:37 +01:00
committed by GitHub
parent fd2f84f468
commit 2f0c2db43e
2 changed files with 19 additions and 17 deletions

View File

@@ -2761,6 +2761,7 @@ struct clip_model_loader {
{ {
// ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
// TODO: verify the image_min_tokens // TODO: verify the image_min_tokens
hparams.n_merge = 1; // the original pixtral does not use patch merging
hparams.rope_theta = 10000.0f; hparams.rope_theta = 10000.0f;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
hparams.set_limit_image_tokens(8, 1024); hparams.set_limit_image_tokens(8, 1024);

View File

@@ -101,16 +101,17 @@ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_
} }
mtmd_context_params mtmd_context_params_default() { mtmd_context_params mtmd_context_params_default() {
mtmd_context_params params; mtmd_context_params params {
params.use_gpu = true; /* use_gpu */ true,
params.print_timings = true; /* print_timings */ true,
params.n_threads = 4; /* n_threads */ 4,
params.verbosity = GGML_LOG_LEVEL_INFO; /* verbosity */ GGML_LOG_LEVEL_INFO,
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER; /* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
params.media_marker = mtmd_default_marker(); /* media_marker */ mtmd_default_marker(),
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
params.image_min_tokens = -1; /* image_min_tokens */ -1,
params.image_max_tokens = -1; /* image_max_tokens */ -1,
};
return params; return params;
} }
@@ -172,13 +173,13 @@ struct mtmd_context {
throw std::runtime_error("media_marker must not be empty"); throw std::runtime_error("media_marker must not be empty");
} }
clip_context_params ctx_clip_params; clip_context_params ctx_clip_params {
ctx_clip_params.use_gpu = ctx_params.use_gpu; /* use_gpu */ ctx_params.use_gpu,
ctx_clip_params.verbosity = ctx_params.verbosity; /* verbosity */ ctx_params.verbosity,
ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type); /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
// custom image token limits /* image_min_tokens */ ctx_params.image_min_tokens,
ctx_clip_params.image_min_tokens = ctx_params.image_min_tokens; /* image_max_tokens */ ctx_params.image_max_tokens,
ctx_clip_params.image_max_tokens = ctx_params.image_max_tokens; };
auto res = clip_init(mmproj_fname, ctx_clip_params); auto res = clip_init(mmproj_fname, ctx_clip_params);
ctx_v = res.ctx_v; ctx_v = res.ctx_v;