mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-08 10:07:01 +00:00
support qwen3vl series.
Co-authored-by: Thireus ☠ <Thireus@users.noreply.github.com> Co-authored-by: yairpatch <yairpatch@users.noreply.github.com> Co-authored-by: LETS-BEE <LETS-BEE@users.noreply.github.com>
This commit is contained in:
@@ -111,6 +111,7 @@ class Keys:
|
||||
EXPERTS_PER_GROUP = "{arch}.experts_per_group"
|
||||
MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
|
||||
NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
|
||||
NUM_DEEPSTACK_LAYERS = "{arch}.n_deepstack_layers"
|
||||
POOLING_TYPE = "{arch}.pooling_type"
|
||||
LOGIT_SCALE = "{arch}.logit_scale"
|
||||
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
||||
@@ -277,6 +278,7 @@ class Keys:
|
||||
USE_GELU = "clip.use_gelu"
|
||||
USE_SILU = "clip.use_silu"
|
||||
N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
|
||||
DEEPSTACK_LAYERS = "clip.vision.deepstack_layers"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "clip.vision.attention.head_count"
|
||||
@@ -350,6 +352,8 @@ class MODEL_ARCH(IntEnum):
|
||||
QWEN2VL = auto()
|
||||
QWEN3 = auto()
|
||||
QWEN3MOE = auto()
|
||||
QWEN3VL = auto()
|
||||
QWEN3VLMOE = auto()
|
||||
PHI2 = auto()
|
||||
PHI3 = auto()
|
||||
PHIMOE = auto()
|
||||
@@ -430,6 +434,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
GLM_EDGE = auto()
|
||||
MERGER = auto()
|
||||
GEMMA3 = auto()
|
||||
QWEN3VL = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -640,6 +645,9 @@ class MODEL_TENSOR(IntEnum):
|
||||
V_RESMPL_QUERY = auto() # minicpmv
|
||||
V_TOK_EMBD_IMG_BREAK = auto() # pixtral
|
||||
V_MM_PATCH_MERGER = auto() # mistral small 3.1
|
||||
V_DS_NORM = auto() # qwen3vl
|
||||
V_DS_FC1 = auto() # qwen3vl
|
||||
V_DS_FC2 = auto() # qwen3vl
|
||||
# audio (mtmd)
|
||||
A_ENC_EMBD_POS = auto()
|
||||
A_ENC_CONV1D = auto()
|
||||
@@ -695,6 +703,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.QWEN2VL: "qwen2vl",
|
||||
MODEL_ARCH.QWEN3: "qwen3",
|
||||
MODEL_ARCH.QWEN3MOE: "qwen3moe",
|
||||
MODEL_ARCH.QWEN3VL: "qwen3vl",
|
||||
MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe",
|
||||
MODEL_ARCH.PHI2: "phi2",
|
||||
MODEL_ARCH.PHI3: "phi3",
|
||||
MODEL_ARCH.PHIMOE: "phimoe",
|
||||
@@ -986,6 +996,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
|
||||
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
|
||||
MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1
|
||||
MODEL_TENSOR.V_DS_NORM: "v.deepstack.{bid}.norm",
|
||||
MODEL_TENSOR.V_DS_FC1: "v.deepstack.{bid}.fc1",
|
||||
MODEL_TENSOR.V_DS_FC2: "v.deepstack.{bid}.fc2",
|
||||
# audio (mtmd)
|
||||
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
|
||||
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
|
||||
@@ -1054,6 +1067,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.V_RESMPL_QUERY,
|
||||
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
|
||||
MODEL_TENSOR.V_MM_PATCH_MERGER,
|
||||
MODEL_TENSOR.V_DS_NORM,
|
||||
MODEL_TENSOR.V_DS_FC1,
|
||||
MODEL_TENSOR.V_DS_FC2,
|
||||
# audio
|
||||
MODEL_TENSOR.A_ENC_EMBD_POS,
|
||||
MODEL_TENSOR.A_ENC_CONV1D,
|
||||
@@ -1495,6 +1511,40 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.QWEN3VL: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.QWEN3VLMOE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.PLAMO: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
@@ -3055,6 +3105,7 @@ class VisionProjectorType:
|
||||
LLAMA4 = "llama4"
|
||||
QWEN2VL = "qwen2vl_merger"
|
||||
QWEN25VL = "qwen2.5vl_merger"
|
||||
QWEN3VL = "qwen3vl_merger"
|
||||
ULTRAVOX = "ultravox"
|
||||
INTERNVL = "internvl"
|
||||
QWEN2A = "qwen2a" # audio
|
||||
|
||||
Reference in New Issue
Block a user