mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-27 08:21:30 +00:00
mtmd : support home-cooked Mistral Small Omni (#14928)
This commit is contained in:
@@ -30,6 +30,7 @@
|
|||||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||||
|
|
||||||
// vision-specific
|
// vision-specific
|
||||||
|
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
|
||||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||||
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
|
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
|
||||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||||
@@ -48,6 +49,7 @@
|
|||||||
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
||||||
|
|
||||||
// audio-specific
|
// audio-specific
|
||||||
|
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
|
||||||
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
||||||
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
|
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
|
||||||
|
|
||||||
|
|||||||
@@ -2221,15 +2221,27 @@ struct clip_model_loader {
|
|||||||
// projector type
|
// projector type
|
||||||
std::string proj_type;
|
std::string proj_type;
|
||||||
{
|
{
|
||||||
|
// default key
|
||||||
get_string(KEY_PROJ_TYPE, proj_type, false);
|
get_string(KEY_PROJ_TYPE, proj_type, false);
|
||||||
if (!proj_type.empty()) {
|
|
||||||
model.proj_type = clip_projector_type_from_string(proj_type);
|
// for models with mixed modalities
|
||||||
|
if (proj_type.empty()) {
|
||||||
|
if (modality == CLIP_MODALITY_VISION) {
|
||||||
|
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
|
||||||
|
} else if (modality == CLIP_MODALITY_AUDIO) {
|
||||||
|
get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
|
||||||
|
} else {
|
||||||
|
GGML_ABORT("unknown modality");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
model.proj_type = clip_projector_type_from_string(proj_type);
|
||||||
|
|
||||||
if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
|
if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
|
||||||
throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
|
throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
|
||||||
}
|
}
|
||||||
|
|
||||||
// correct arch for multimodal models
|
// correct arch for multimodal models (legacy method)
|
||||||
if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
|
if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
|
||||||
model.proj_type = modality == CLIP_MODALITY_VISION
|
model.proj_type = modality == CLIP_MODALITY_VISION
|
||||||
? PROJECTOR_TYPE_QWEN25VL
|
? PROJECTOR_TYPE_QWEN25VL
|
||||||
|
|||||||
Reference in New Issue
Block a user