model: add Janus Pro for image understanding (#16906)

* Add support for Janus Pro

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Address reviewer suggestions

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Add JANUS_PRO constant

* Update clip model handling

Co-authored-by: Xuan-Son Nguyen <son@huggingface.co>

* Update tools/mtmd/clip.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* Refactor JANUS_PRO handling in clip.cpp

Co-authored-by: Xuan-Son Nguyen <son@huggingface.co>

* Update tools/mtmd/clip.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Remove em-space whitespace

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Xuan-Son Nguyen <son@huggingface.co>
Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
Commit: 6b9a52422b (parent 2f966b8ed8)
Author: Zhiyong Wang
Date: 2025-11-02 13:08:04 -08:00
Committed by: GitHub
5 changed files with 147 additions and 1 deletion

File: convert_hf_to_gguf.py

@@ -9802,6 +9802,113 @@ class CogVLMModel(LlamaModel):
        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("JanusForConditionalGeneration")
class JanusProModel(LlamaModel):
    model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Skip vision, aligner, and generation tensors
        skip_prefixes = (
            'model.vision_model.',
            'model.aligner.',
            'model.vqmodel.',
            'model.generation_embeddings.',
            'model.generation_aligner.',
            'model.generation_head.',
        )
        if name.startswith(skip_prefixes):
            return []

        if name.startswith('model.language_model.'):
            name = name.replace('model.language_model.', 'model.')
        elif name.startswith('language_model.'):
            name = name.replace('language_model.', '')

        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("JanusForConditionalGeneration")
class JanusProVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        if "intermediate_size" not in self.hparams_vision:
            mlp_ratio = self.hparams_vision.get("mlp_ratio")
            hidden_size = self.hparams_vision.get("hidden_size")
            if mlp_ratio is not None and hidden_size is not None:
                self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        assert self.hparams_vision is not None

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))

        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
        if hidden_act == "gelu":
            self.gguf_writer.add_vision_use_gelu(True)
        elif hidden_act == "silu":
            self.gguf_writer.add_vision_use_silu(True)

    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
        """Map aligner tensors to projector format"""
        suffix = ".bias" if name.endswith(".bias") else ".weight"

        if name.startswith("model.aligner."):
            local_name = name[len("model.aligner."):]
        elif name.startswith("aligner."):
            local_name = name[len("aligner."):]
        else:
            raise ValueError(f"Unsupported Janus aligner prefix: {name}")

        if local_name.startswith("fc1."):
            mm_index = 0
        elif local_name.startswith("hidden_layers."):
            parts = local_name.split(".", 2)
            if len(parts) < 3:
                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
            mm_index = int(parts[1]) + 1
        else:
            raise ValueError(f"Unsupported Janus aligner tensor: {name}")

        tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
        return [(tensor_name, data_torch)]

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

        # Skip language model tensors as they will be handled by `JanusProModel`
        if name.startswith(('model.language_model.', 'language_model.')):
            return []

        # Skip generation-related components
        skip_generation_prefixes = (
            'model.vqmodel.',
            'vqmodel.',
            'model.generation_embeddings.',
            'generation_embeddings.',
            'model.generation_aligner.',
            'generation_aligner.',
            'model.generation_head.',
            'generation_head.',
        )
        if name.startswith(skip_generation_prefixes):
            return []

        # Handle aligner tensors
        if name.startswith(('model.aligner.', 'aligner.')):
            return list(self._map_aligner_tensor(data_torch, name))

        # Handle vision tensors
        if name.startswith(('model.vision_model.', 'vision_model.')):
            return [(self.map_tensor_name(name), data_torch)]

        return []


###### CONVERSION LOGIC ######
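
For reference, the index logic in `_map_aligner_tensor` above is easy to sanity-check in isolation: `fc1` lands on `mm.0` and `hidden_layers.N` on `mm.(N+1)`. A minimal standalone sketch follows (the helper name `map_aligner_name` is invented for illustration; the real converter goes through `format_tensor_name`):

def map_aligner_name(name: str) -> str:
    # standalone sketch of JanusProVisionModel._map_aligner_tensor:
    # fc1.* -> mm.0.*, hidden_layers.N.* -> mm.(N+1).*
    suffix = ".bias" if name.endswith(".bias") else ".weight"
    local = name.removeprefix("model.aligner.").removeprefix("aligner.")
    if local.startswith("fc1."):
        mm_index = 0
    elif local.startswith("hidden_layers."):
        mm_index = int(local.split(".")[1]) + 1
    else:
        raise ValueError(f"not a Janus aligner tensor: {name}")
    return f"mm.{mm_index}{suffix}"

assert map_aligner_name("model.aligner.fc1.weight") == "mm.0.weight"
assert map_aligner_name("aligner.hidden_layers.0.bias") == "mm.1.bias"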

File: gguf-py/gguf/constants.py

@@ -3186,6 +3186,7 @@ class VisionProjectorType:
    KIMIVL = "kimivl"
    LIGHTONOCR = "lightonocr"
    COGVLM = "cogvlm"
    JANUS_PRO = "janus_pro"


# Items here are (block size, type size)

File: gguf-py/gguf/tensor_mapping.py

@@ -1183,6 +1183,7 @@ class TensorNameMap:
"model.mm_projector.mlp.mlp.{bid}", "model.mm_projector.mlp.mlp.{bid}",
"vision_model.vision_adapter.mlp.fc{bid}", # llama 4 "vision_model.vision_adapter.mlp.fc{bid}", # llama 4
"mlp1.{bid}", # InternVL "mlp1.{bid}", # InternVL
"model.aligner.fc1.hidden_layers.{bid}", # Janus Pro
), ),
MODEL_TENSOR.V_MMPROJ_PEG: ( MODEL_TENSOR.V_MMPROJ_PEG: (
@@ -1291,6 +1292,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1 "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.out_proj", "vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4 "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral

File: tools/mtmd/clip-impl.h

@@ -155,6 +155,7 @@ enum projector_type {
    PROJECTOR_TYPE_KIMIVL,
    PROJECTOR_TYPE_LIGHTONOCR,
    PROJECTOR_TYPE_COGVLM,
    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_UNKNOWN,
};
@@ -180,6 +181,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
    { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
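
As a quick post-conversion sanity check, the projector id registered above can be read back from the mmproj file with gguf-py's GGUFReader. A sketch, assuming `mmproj-model.gguf` is a placeholder path for the converter output and that `add_clip_projector_type` writes the `clip.projector_type` key:

from gguf import GGUFReader

reader = GGUFReader("mmproj-model.gguf")  # placeholder path
field = reader.fields.get("clip.projector_type")
if field is not None:
    raw = field.parts[field.data[0]]
    print(bytes(raw).decode("utf-8"))  # expected: "janus_pro"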

File: tools/mtmd/clip.cpp

@@ -588,6 +588,15 @@ struct clip_graph {
        cur = ggml_gelu(ctx0, cur);
        cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
        cur = ggml_add(ctx0, cur, model.mm_2_b);
    } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
        // two-layer MLP projector: mm.0 -> activation (hparams.ffn_op) -> mm.1
        cur = build_ffn(cur,
            model.mm_0_w, model.mm_0_b,
            nullptr, nullptr,
            model.mm_1_w, model.mm_1_b,
            hparams.ffn_op,
            -1);
    } else {
        GGML_ABORT("SigLIP: Unsupported projector type");
    }
@@ -1729,7 +1738,6 @@ struct clip_graph {
        return gf;
    }

    // whisper encoder with custom projector
    ggml_cgraph * build_whisper_enc() {
        const int n_frames = img.nx;
@@ -2457,6 +2465,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                res = graph.build_kimivl();
            } break;
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // Janus Pro reuses the SigLIP vision graph
                res = graph.build_siglip();
            } break;
        case PROJECTOR_TYPE_COGVLM:
            {
                res = graph.build_cogvlm();
            } break;
@@ -3158,6 +3170,13 @@ struct clip_model_loader {
                    model.mm_boi = get_tensor(TN_TOK_BOI);
                    model.mm_eoi = get_tensor(TN_TOK_EOI);
                } break;
            case PROJECTOR_TYPE_JANUS_PRO:
                {
                    // aligner MLP exported by the converter as mm.0.* / mm.1.*
                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
                } break;
            default:
                GGML_ASSERT(false && "unknown projector type");
        }
@@ -4219,6 +4238,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->entries.push_back(std::move(img_f32)); res_imgs->entries.push_back(std::move(img_f32));
} break; } break;
case PROJECTOR_TYPE_JANUS_PRO:
{
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
const std::array<uint8_t, 3> pad_color = {127, 127, 127};
clip_image_u8 resized_image;
int sz = params.image_size;
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR: case PROJECTOR_TYPE_LIGHTONOCR:
{ {
@@ -4395,6 +4426,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
    switch (proj) {
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // do nothing
            } break;
@@ -4905,6 +4937,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
            {
                // do nothing
            } break;
@@ -4993,6 +5026,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_model_mlp_3_w->ne[1]; return ctx->model.mm_model_mlp_3_w->ne[1];
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_JANUS_PRO:
return ctx->model.mm_1_b->ne[0]; return ctx->model.mm_1_b->ne[0];
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
// main path + deepstack paths // main path + deepstack paths
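
For intuition, the PROJECTOR_TYPE_JANUS_PRO preprocessing branch above boils down to the following NumPy sketch. The 384 default, the shared mean/std scalars, and the centered padding are assumptions for illustration; the real values come from the GGUF metadata, and clip.cpp resizes bilinearly where this sketch uses nearest-neighbor for brevity:

import numpy as np

def janus_pro_preprocess(img: np.ndarray, image_size: int = 384,
                         mean: float = 0.5, std: float = 0.5) -> np.ndarray:
    # pad an HxWx3 uint8 image to a square with gray(127),
    # resize to image_size x image_size, then normalize to float32
    h, w, _ = img.shape
    side = max(h, w)
    canvas = np.full((side, side, 3), 127, dtype=np.uint8)
    top, left = (side - h) // 2, (side - w) // 2  # centered padding (assumption)
    canvas[top:top + h, left:left + w] = img
    # nearest-neighbor stand-in for clip.cpp's bilinear resize
    idx = (np.arange(image_size) * side) // image_size
    resized = canvas[idx][:, idx]
    # mirror normalize_image_u8_to_f32: scale to [0,1], then (x - mean) / std
    return (resized.astype(np.float32) / 255.0 - mean) / std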