diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ce83f24695..15bb1cf233 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3741,6 +3741,28 @@ class Qwen3MoeModel(Qwen2MoeModel): super().set_vocab() +@ModelBase.register("Qwen3NextForCausalLM") +class Qwen3NextModel(Qwen3MoeModel): + model_arch = gguf.MODEL_ARCH.QWEN3NEXT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["linear_conv_kernel_dim"])) + self.gguf_writer.add_ssm_state_size(self.find_hparam(["linear_key_head_dim"])) + self.gguf_writer.add_ssm_group_count(self.find_hparam(["linear_num_key_heads"])) + self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["linear_num_value_heads"])) + self.gguf_writer.add_ssm_inner_size(self.find_hparam(["hidden_size"]) * (self.find_hparam(["linear_num_value_heads"]) // self.find_hparam(["linear_num_key_heads"]))) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(".A_log"): + data_torch = -torch.exp(data_torch) + elif name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + elif "conv1d" in name: + data_torch = data_torch.squeeze() + + return Qwen2MoeModel.modify_tensors(self, data_torch, name, bid) + @ModelBase.register("GPT2LMHeadModel") class GPT2Model(TextModel): diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index b7b472c56e..f10f41b4be 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -539,7 +539,8 @@ extern "C" { GGML_OP_RWKV_WKV6, GGML_OP_GATED_LINEAR_ATTN, GGML_OP_RWKV_WKV7, - + GGML_OP_DELTA_NET, + GGML_OP_UNARY, GGML_OP_MAP_CUSTOM1, @@ -2278,6 +2279,31 @@ extern "C" { struct ggml_tensor * state, float scale); + // Delta-Net linear layer activation + // Implements the complete Delta-Net gated linear attention mechanism + // This includes causal convolution preprocessing and gated delta rule computation + // k, v, q, g: [S, H, n_tokens, n_seqs] - key, value, query, gate tensors + // conv_weight: [conv_dim, 1, conv_kernel_size] - convolution kernel weights + // conv_bias: [conv_dim] - convolution bias (optional, can be NULL) + // beta: [H, n_tokens, n_seqs] - beta parameter for delta rule + // state: [S, S, H, n_seqs] - recurrent state tensor + // chunk_size: chunk size for chunked computation (0 for recurrent mode) + // use_qk_l2norm: whether to apply L2 normalization to query and key + // scale: attention scaling factor + GGML_API struct ggml_tensor * ggml_delta_net( + struct ggml_context * ctx, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * q, + struct ggml_tensor * g, + struct ggml_tensor * conv_weight, + struct ggml_tensor * conv_bias, + struct ggml_tensor * beta, + struct ggml_tensor * state, + int chunk_size, + bool use_qk_l2norm, + float scale); + GGML_API struct ggml_tensor * ggml_rwkv_wkv7( struct ggml_context * ctx, struct ggml_tensor * r, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index c131290849..a714f7137b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1656,6 +1656,172 @@ static void ggml_compute_forward_mul_mat_id( } } +// ggml_compute_forward_delta_net + +static void ggml_compute_forward_delta_net( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; // query + const struct ggml_tensor * src1 = dst->src[1]; // key + const struct ggml_tensor * src2 = dst->src[2]; // value + const 
struct ggml_tensor * src3 = dst->src[3]; // gate + const struct ggml_tensor * src4 = dst->src[4]; // beta + const struct ggml_tensor * src5 = dst->src[5]; // state + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(src2->type == GGML_TYPE_F32); + GGML_ASSERT(src3->type == GGML_TYPE_F32); + GGML_ASSERT(src4->type == GGML_TYPE_F32); + GGML_ASSERT(src5->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_TENSOR_TERNARY_OP_LOCALS; + GGML_TENSOR_LOCALS(int64_t, ne3, src3, ne); + GGML_TENSOR_LOCALS(size_t, nb3, src3, nb); + GGML_TENSOR_LOCALS(int64_t, ne4, src4, ne); + GGML_TENSOR_LOCALS(size_t, nb4, src4, nb); + GGML_TENSOR_LOCALS(int64_t, ne5, src5, ne); + GGML_TENSOR_LOCALS(size_t, nb5, src5, nb); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t S = src0->ne[0]; // head dimension + const int64_t H = src0->ne[1]; // number of heads + const int64_t n_tokens = src0->ne[2]; + const int64_t n_seqs = src0->ne[3]; + + GGML_ASSERT(ne00 == S && ne01 == H && ne02 == n_tokens && ne03 == n_seqs); + GGML_ASSERT(ne10 == S && ne11 == H && ne12 == n_tokens && ne13 == n_seqs); + GGML_ASSERT(ne20 == S && ne21 == H && ne22 == n_tokens && ne23 == n_seqs); + GGML_ASSERT(ne30 == S && ne31 == H && ne32 == n_tokens && ne33 == n_seqs); + GGML_ASSERT(ne40 == H && ne41 == n_tokens && ne42 == n_seqs && ne43 == 1); + GGML_ASSERT(ne50 == S && ne51 == S && ne52 == H && ne53 == n_seqs); + + // Get operation parameters + bool use_qk_l2norm = ggml_get_op_params_i32(dst, 1) != 0; + float scale; + memcpy(&scale, ((int32_t*)dst->op_params) + 4, sizeof(float)); + + GGML_ASSERT(ne0 == S * H); + GGML_ASSERT(ne1 == n_tokens + S * n_seqs); + + // Parallelize over sequences and heads + const int64_t n_total = n_seqs * H; + const int64_t n_per_thread = (n_total + nth - 1) / nth; + const int64_t n_start = ith * n_per_thread; + const int64_t n_end = MIN(n_start + n_per_thread, n_total); + + for (int64_t n = n_start; n < n_end; ++n) { + const int64_t seq_idx = n / H; + const int64_t head_idx = n % H; + + // Get pointers to current sequence and head + float * q_ptr = (float *)((char *)src0->data + seq_idx * nb03 + head_idx * nb01); + float * k_ptr = (float *)((char *)src1->data + seq_idx * nb13 + head_idx * nb11); + float * v_ptr = (float *)((char *)src2->data + seq_idx * nb23 + head_idx * nb21); + float * g_ptr = (float *)((char *)src3->data + seq_idx * nb33 + head_idx * nb31); + float * beta_ptr = (float *)((char *)src4->data + seq_idx * nb43); + float * state_ptr = (float *)((char *)src5->data + seq_idx * nb53 + head_idx * nb51); + + float * out_ptr = (float *)((char *)dst->data + n * ne0 * sizeof(float)); + float * new_state_ptr = out_ptr + n_tokens * S; + + // Apply L2 normalization if requested + if (use_qk_l2norm) { + // Normalize query and key + for (int64_t t = 0; t < n_tokens; ++t) { + float q_sum = 0.0f, k_sum = 0.0f; + for (int64_t s = 0; s < S; ++s) { + float q_val = q_ptr[t * nb02 / sizeof(float) + s]; + float k_val = k_ptr[t * nb12 / sizeof(float) + s]; + q_sum += q_val * q_val; + k_sum += k_val * k_val; + } + float q_norm = sqrtf(q_sum + 1e-6f); + float k_norm = sqrtf(k_sum + 1e-6f); + + for (int64_t s = 0; s < S; ++s) { + q_ptr[t * nb02 / sizeof(float) + s] /= q_norm; + k_ptr[t * nb12 / sizeof(float) + s] /= k_norm; + } + } + } + + // Apply scaling to query + for (int64_t i = 0; i < n_tokens * S; ++i) { + q_ptr[i] *= scale; + } + + // Apply sigmoid to beta + float * beta_sigmoid = (float *)alloca(n_tokens * 
sizeof(float)); + for (int64_t t = 0; t < n_tokens; ++t) { + beta_sigmoid[t] = 1.0f / (1.0f + expf(-beta_ptr[t * nb42 / sizeof(float)])); + } + + // Complete implementation of gated delta rule + // Based on torch_recurrent_gated_delta_rule from the reference implementation + + // Process each token sequentially for recurrent computation + for (int64_t t = 0; t < n_tokens; ++t) { + // Get pointers to current token data + float * q_t = q_ptr + t * (nb02 / sizeof(float)); + float * k_t = k_ptr + t * (nb12 / sizeof(float)); + float * v_t = v_ptr + t * (nb22 / sizeof(float)); + float * g_t = g_ptr + t * (nb32 / sizeof(float)); + + // Apply exponential to gate and multiply by beta + float g_exp = expf(g_t[0]); // g is per-head, not per-dimension + float beta_t = beta_sigmoid[t]; + + // Update recurrent state: state = state * g_exp + for (int64_t i = 0; i < S * S; ++i) { + state_ptr[i] *= g_exp; + } + + // Compute kv_mem = (state * k_t^T).sum(dim=-1) + // This is a matrix-vector multiplication: state[S×S] @ k_t[S] + float kv_mem[S]; + for (int64_t i = 0; i < S; ++i) { + kv_mem[i] = 0.0f; + for (int64_t j = 0; j < S; ++j) { + kv_mem[i] += state_ptr[i * S + j] * k_t[j]; + } + } + + // Compute delta = (v_t - kv_mem) * beta_t + float delta[S]; + for (int64_t i = 0; i < S; ++i) { + delta[i] = (v_t[i] - kv_mem[i]) * beta_t; + } + + // Update state: state = state + k_t * delta^T + // This is an outer product: k_t[S] ⊗ delta[S] + for (int64_t i = 0; i < S; ++i) { + for (int64_t j = 0; j < S; ++j) { + state_ptr[i * S + j] += k_t[i] * delta[j]; + } + } + + // Compute output: out = (state * q_t^T).sum(dim=-1) + // This is a matrix-vector multiplication: state[S×S] @ q_t[S] + float * out_t = out_ptr + t * S; + for (int64_t i = 0; i < S; ++i) { + out_t[i] = 0.0f; + for (int64_t j = 0; j < S; ++j) { + out_t[i] += state_ptr[i * S + j] * q_t[j]; + } + } + } + + // Copy final state to new_state + memcpy(new_state_ptr, state_ptr, S * S * sizeof(float)); + } +} + + ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -1998,6 +2164,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rwkv_wkv7(params, tensor); } break; + case GGML_OP_DELTA_NET: + { + ggml_compute_forward_delta_net(params, tensor); + } break; case GGML_OP_MAP_CUSTOM1: { ggml_compute_forward_map_custom1(params, tensor); @@ -2291,6 +2461,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_RWKV_WKV6: case GGML_OP_GATED_LINEAR_ATTN: case GGML_OP_RWKV_WKV7: + case GGML_OP_DELTA_NET: { n_tasks = n_threads; } break; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3584827dca..0b171ffd31 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1002,6 +1002,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "RWKV_WKV6", "GATED_LINEAR_ATTN", "RWKV_WKV7", + "DELTA_NET", "UNARY", @@ -1019,7 +1020,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90"); +static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1106,6 +1107,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rwkv_wkv6(k, v, r, tf, td, s)", "gated_linear_attn(k, v, q, gate, s)", "rwkv_wkv7(r, w, k, v, a, b, s)", + "delta_net(k, v, q, g, conv_w, conv_b, beta, state, chunk_size, use_qk_l2norm, scale)", "unary(x)", @@ -1123,7 +1125,7 @@ static const char * 
GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90"); +static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -5417,6 +5419,124 @@ struct ggml_tensor * ggml_gated_linear_attn( return result; } +// ggml_delta_net + +struct ggml_tensor * ggml_delta_net( + struct ggml_context * ctx, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * q, + struct ggml_tensor * g, + struct ggml_tensor * conv_weight, + struct ggml_tensor * conv_bias, + struct ggml_tensor * beta, + struct ggml_tensor * state, + int chunk_size, + bool use_qk_l2norm, + float scale) { + GGML_ASSERT(ggml_is_contiguous(k)); + GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(q)); + GGML_ASSERT(ggml_is_contiguous(g)); + GGML_ASSERT(ggml_is_contiguous(beta)); + GGML_ASSERT(ggml_is_contiguous(state)); + + const int64_t S = k->ne[0]; + const int64_t H = k->ne[1]; + const int64_t n_tokens = k->ne[2]; + const int64_t n_seqs = state->ne[1]; + + // Validate dimensions + GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens); + GGML_ASSERT(q->ne[0] == S && q->ne[1] == H && q->ne[2] == n_tokens); + GGML_ASSERT(g->ne[0] == S && g->ne[1] == H && g->ne[2] == n_tokens); + GGML_ASSERT(beta->ne[0] == H && beta->ne[1] == n_tokens && beta->ne[2] == n_seqs); + GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs); + + // Apply L2 normalization to query and key if requested + struct ggml_tensor * q_norm = q; + struct ggml_tensor * k_norm = k; + if (use_qk_l2norm) { + q_norm = ggml_l2_norm(ctx, q, 1e-6f); + k_norm = ggml_l2_norm(ctx, k, 1e-6f); + } + + // Apply scaling to query + q_norm = ggml_scale(ctx, q_norm, scale); + + // Apply sigmoid to beta for gating + struct ggml_tensor * beta_sigmoid = ggml_sigmoid(ctx, beta); + + // Apply causal 1D convolution preprocessing to mixed QKV + // Concatenate q, k, v along the feature dimension + int64_t concat_ne[4] = { q->ne[0], q->ne[1], q->ne[2], q->ne[3] * 3 }; + struct ggml_tensor * mixed_qkv = ggml_concat(ctx, q_norm, k_norm, 3); + mixed_qkv = ggml_concat(ctx, mixed_qkv, v, 3); + + // Transpose for convolution: [S, H, n_tokens, n_seqs*3] -> [S, n_tokens, H, n_seqs*3] + mixed_qkv = ggml_permute(ctx, mixed_qkv, 0, 2, 1, 3); + + // Apply causal 1D convolution + struct ggml_tensor * conv_out = ggml_conv_1d( + ctx, + conv_weight, + mixed_qkv, + 1, // stride + conv_weight->ne[2] - 1, // padding (kernel_size - 1) + 1 // dilation + ); + + // Apply bias if provided + if (conv_bias) { + conv_out = ggml_add(ctx, conv_out, conv_bias); + } + + // Apply SiLU activation + conv_out = ggml_silu(ctx, conv_out); + + // Transpose back: [S, n_tokens, H, n_seqs*3] -> [S, H, n_tokens, n_seqs*3] + conv_out = ggml_permute(ctx, conv_out, 0, 2, 1, 3); + + // Split the convolved output back into q, k, v components + // Split along the last dimension (3 * original size) + int64_t split_size = q->ne[3]; + struct ggml_tensor * q_conv = ggml_view_4d(ctx, conv_out, q->ne[0], q->ne[1], q->ne[2], split_size, + conv_out->nb[0], conv_out->nb[1], conv_out->nb[2], 0); + + struct ggml_tensor * k_conv = ggml_view_4d(ctx, conv_out, k->ne[0], k->ne[1], k->ne[2], split_size, + conv_out->nb[0], conv_out->nb[1], conv_out->nb[2], + split_size * ggml_type_size(q->type)); + + struct ggml_tensor * v_conv = ggml_view_4d(ctx, conv_out, v->ne[0], v->ne[1], v->ne[2], split_size, + conv_out->nb[0], conv_out->nb[1], conv_out->nb[2], + 2 * split_size * ggml_type_size(q->type)); + + 
// concat output and new_state + const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + // Set operation parameters for the delta rule computation + int32_t params[8] = { + chunk_size, + use_qk_l2norm ? 1 : 0, + 0, 0, // reserved + 0, 0, 0, 0 // scale and other params + }; + memcpy(params + 4, &scale, sizeof(float)); + ggml_set_op_params(result, params, sizeof(params)); + + // Use custom operation for the gated delta rule computation + result->op = GGML_OP_DELTA_NET; + result->src[0] = q_conv; + result->src[1] = k_conv; + result->src[2] = v_conv; + result->src[3] = g; + result->src[4] = beta_sigmoid; + result->src[5] = state; + + return result; +} + // ggml_rwkv_wkv7 struct ggml_tensor * ggml_rwkv_wkv7( diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7e16cbcbde..a16b26f618 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -335,6 +335,7 @@ class MODEL_ARCH(IntEnum): QWEN2VL = auto() QWEN3 = auto() QWEN3MOE = auto() + QWEN3NEXT = auto() PHI2 = auto() PHI3 = auto() PHIMOE = auto() @@ -481,6 +482,7 @@ class MODEL_TENSOR(IntEnum): SSM_D = auto() SSM_NORM = auto() SSM_OUT = auto() + SSM_BETA_ALPHA = auto() TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() @@ -671,6 +673,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.QWEN2VL: "qwen2vl", MODEL_ARCH.QWEN3: "qwen3", MODEL_ARCH.QWEN3MOE: "qwen3moe", + MODEL_ARCH.QWEN3NEXT: "qwen3next", MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI3: "phi3", MODEL_ARCH.PHIMOE: "phimoe", @@ -818,6 +821,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba", MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -1462,6 +1466,34 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.QWEN3NEXT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_INP_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.SSM_A, + MODEL_TENSOR.SSM_CONV1D, + MODEL_TENSOR.SSM_DT, + MODEL_TENSOR.SSM_NORM, + MODEL_TENSOR.SSM_IN, + MODEL_TENSOR.SSM_BETA_ALPHA, + MODEL_TENSOR.SSM_OUT + ], MODEL_ARCH.PLAMO: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 8fd9e454e0..77f81a36a9 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -628,10 +628,11 @@ class TensorNameMap: ), MODEL_TENSOR.SSM_IN: ( - "model.layers.{bid}.in_proj", # mamba-hf - "backbone.layers.{bid}.mixer.in_proj", # mamba - "model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid - "model.layers.layers.{bid}.mixer.in_proj", # plamo2 + "model.layers.{bid}.in_proj", # mamba-hf + "backbone.layers.{bid}.mixer.in_proj", # mamba + "model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid + 
"model.layers.layers.{bid}.mixer.in_proj", # plamo2 + "model.layers.{bid}.linear_attn.in_proj_qkvz", # qwen3next ), MODEL_TENSOR.SSM_CONV1D: ( @@ -639,6 +640,7 @@ class TensorNameMap: "backbone.layers.{bid}.mixer.conv1d", # mamba "model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 granite-hybrid "model.layers.layers.{bid}.mixer.conv1d", # plamo2 + "model.layers.{bid}.linear_attn.conv1d", # qwen3next ), MODEL_TENSOR.SSM_X: ( @@ -653,6 +655,7 @@ class TensorNameMap: "backbone.layers.{bid}.mixer.dt_proj", # mamba "model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid "model.layers.layers.{bid}.mixer.dt_proj", # plamo2 + "model.layers.{bid}.linear_attn.dt_proj", # qwen3next ), MODEL_TENSOR.SSM_DT_NORM: ( @@ -665,6 +668,7 @@ class TensorNameMap: "backbone.layers.{bid}.mixer.A_log", # mamba "model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid "model.layers.layers.{bid}.mixer.A_log", # plamo2 + "model.layers.{bid}.linear_attn.A_log", # qwen3next ), MODEL_TENSOR.SSM_B_NORM: ( @@ -687,17 +691,23 @@ class TensorNameMap: ), MODEL_TENSOR.SSM_NORM: ( - "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid - "backbone.layers.{bid}.mixer.norm", # mamba2 + "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid + "model.layers.{bid}.linear_attn.norm", # qwen3next + "backbone.layers.{bid}.mixer.norm", # mamba2 ), MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", # mamba-hf "backbone.layers.{bid}.mixer.out_proj", # mamba "model.layers.{bid}.mamba.out_proj", # jamba falcon-h1 granite-hybrid + "model.layers.{bid}.linear_attn.out_proj", # qwen3next "model.layers.layers.{bid}.mixer.out_proj", # plamo2 ), + MODEL_TENSOR.SSM_BETA_ALPHA: ( + "model.layers.{bid}.linear_attn.in_proj_ba", # qwen3next + ), + MODEL_TENSOR.TIME_MIX_W0: ( "model.layers.{bid}.attention.w0", # rwkv7 ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index a4d2973ada..4c9652c3a3 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -31,6 +31,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN2VL, "qwen2vl" }, { LLM_ARCH_QWEN3, "qwen3" }, { LLM_ARCH_QWEN3MOE, "qwen3moe" }, + { LLM_ARCH_QWEN3NEXT, "qwen3next" }, { LLM_ARCH_PHI2, "phi2" }, { LLM_ARCH_PHI3, "phi3" }, { LLM_ARCH_PHIMOE, "phimoe" }, @@ -754,6 +755,38 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, }, }, + { + LLM_ARCH_QWEN3NEXT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, + { 
LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" }, + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, + { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, + }, + }, { LLM_ARCH_PHI2, { @@ -2275,6 +2308,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -2438,6 +2472,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_GRANITE_HYBRID: case LLM_ARCH_LFM2: case LLM_ARCH_NEMOTRON_H: + case LLM_ARCH_QWEN3NEXT: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index d181ce6784..b9abe3c096 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -35,6 +35,7 @@ enum llm_arch { LLM_ARCH_QWEN2VL, LLM_ARCH_QWEN3, LLM_ARCH_QWEN3MOE, + LLM_ARCH_QWEN3NEXT, LLM_ARCH_PHI2, LLM_ARCH_PHI3, LLM_ARCH_PHIMOE, @@ -334,6 +335,7 @@ enum llm_tensor { LLM_TENSOR_SSM_D, LLM_TENSOR_SSM_NORM, LLM_TENSOR_SSM_OUT, + LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W2, diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 8182a9adf5..9d3d41c355 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -811,6 +811,7 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx } struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list & ne, size_t offset, bool required) { + LLAMA_LOG_DEBUG("%s: loading tensor %s as view\n", __func__, name.c_str()); const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); if (cur == NULL) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2be807a6a9..009e03625e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -112,6 +112,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_A13B: return "A13B"; case LLM_TYPE_21B_A3B: return "21B.A3B"; case LLM_TYPE_30B_A3B: return "30B.A3B"; + case LLM_TYPE_80B_A3B: return "80B.A3B"; case LLM_TYPE_106B_A12B: return "106B.A12B"; case LLM_TYPE_235B_A22B: return "235B.A22B"; case LLM_TYPE_300B_A47B: return "300B.A47B"; @@ -1809,6 +1810,29 @@ void llama_model::load_hparams(llama_model_loader & ml) { // For Granite MoE Shared ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false); } break; + case LLM_ARCH_QWEN3NEXT: + { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + // Load linear attention (gated delta net) parameters + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); + + // Mark recurrent layers (linear attention layers) + for (uint32_t i = 0; i < hparams.n_layer; ++i) 
{ + hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval" + } + + switch (hparams.n_layer) { + case 80: type = LLM_TYPE_80B_A3B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_CHAMELEON: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -2360,6 +2384,76 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; + case LLM_ARCH_QWEN3NEXT: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + } + + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; + + // Calculate projection sizes + const int64_t qkvz_projection_size = key_dim * 2 + value_dim * 2; + const int64_t ba_projection_size = n_v_heads * 2; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + + if ((i + 1) % 4 == 0) { // TODO: magic 4 + // Attention layers + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_ff }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + + // Q/K normalization for attention layers + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + + } else { + // Linear attention (gated delta net) specific tensors + // Create tensors with calculated dimensions + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_projection_size }, 0); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0); + layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_projection_size }, 0); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { n_ff, n_embd }, 0); + } + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, 
n_ff_exp, n_expert }, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + + // Shared experts + layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0); + } + } + break; case LLM_ARCH_LLADA: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); @@ -6075,7 +6169,8 @@ void llama_model::print_info() const { arch == LLM_ARCH_FALCON_H1 || arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_GRANITE_HYBRID || - arch == LLM_ARCH_NEMOTRON_H) { + arch == LLM_ARCH_NEMOTRON_H || + arch == LLM_ARCH_QWEN3NEXT) { LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); @@ -18827,6 +18922,329 @@ struct llm_build_smallthinker : public llm_graph_context{ } }; +struct llm_build_qwen3next : public llm_graph_context_mamba { + llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + auto * inp = build_inp_mem_hybrid(); + + ggml_tensor * inp_pos = build_inp_pos(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // Pre-norm for attention/linear attention + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // Determine layer type and build appropriate attention mechanism + if (hparams.is_recurrent(il)) { + // Linear attention layer (gated delta net) + cur = build_qwen3next_linear_attn_layer(inp->get_recr(), cur, model, ubatch, il); + } else { + // Full attention layer + cur = build_qwen3next_attention_layer( + cur, inp_pos, inp->get_attn(), model, + n_embd_head, il); + } + + // Post-attention norm + cur = build_norm(cur, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Residual connection + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "attn_residual", il); + + // FFN layer (MoE or dense) + cur = build_layer_ffn(cur, model, il); + + // Input for next layer + inpL = cur; + } + + cur = inpL; + + // Final norm + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // LM head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + +private: + ggml_tensor * build_qwen3next_attention_layer( + ggml_tensor * cur, + ggml_tensor * inp_pos, + llm_graph_input_attn_kv * inp_attn, + const llama_model & model, + const 
int64_t n_embd_head, + const int il) { + + // QKV projection with gating + ggml_tensor * qkv_g = build_lora_mm(model.layers[il].wq, cur); + cb(qkv_g, "qkv_g", il); + + // Split into Q and gate + const int64_t n_embd_q = hparams.n_head(il) * n_embd_head; + ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv_g, n_embd_head, hparams.n_head(il), n_tokens, + n_embd_head * sizeof(float), qkv_g->nb[1], 0); + ggml_tensor * gate = ggml_view_3d(ctx0, qkv_g, n_embd_head, hparams.n_head(il), n_tokens, + n_embd_head * sizeof(float), qkv_g->nb[1], n_embd_q * ggml_element_size(qkv_g)); + + // K and V projections + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens); + + // Apply Q/K normalization + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + + // Apply RoPE + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // Attention computation + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + cur = build_attn(inp_attn, + model.layers[il].wo, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + + // Apply gating + gate = ggml_reshape_2d(ctx0, gate, n_embd_q, n_tokens); + cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); + cb(cur, "attn_gated", il); + + return cur; + } + + ggml_tensor * build_qwen3next_linear_attn_layer(llm_graph_input_rs * inp, + ggml_tensor * cur, + const llama_model & model, + const llama_ubatch & ubatch, + int il) { + // Gated Delta Net implementation using the new ggml_delta_net function + const auto * mctx_cur = inp->mctx; + const auto kv_head = mctx_cur->get_head(); + + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t n_heads = hparams.ssm_dt_rank; + const int64_t head_dim = d_inner / n_heads; + const int64_t n_seqs = ubatch.n_seqs; + + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + // Input projection for QKV and beta/alpha + ggml_tensor * qkvz_ba = build_lora_mm(model.layers[il].ssm_in, cur); + cb(qkvz_ba, "linear_attn_in_proj", il); + + // Split into QKV and beta/alpha components + const int64_t qkv_size = d_inner * 2 + d_state * 2; + + ggml_tensor * qkv = + ggml_view_3d(ctx0, qkvz_ba, qkv_size, n_tokens, 1, qkv_size * sizeof(float), qkvz_ba->nb[1], 0); + ggml_tensor * ba = ggml_view_2d(ctx0, qkvz_ba, n_embd, n_tokens, + qkvz_ba->nb[1], qkv_size * sizeof(float)); + + // Reshape QKV for processing + qkv = ggml_reshape_3d(ctx0, qkv, head_dim, n_heads * 2 + d_state * 2 / head_dim, n_tokens); + + // Split into individual components + ggml_tensor * query = + ggml_view_3d(ctx0, qkv, head_dim, n_heads, 
n_tokens, head_dim * sizeof(float), qkv->nb[1], 0); + ggml_tensor * key = ggml_view_3d(ctx0, qkv, head_dim, n_heads, n_tokens, head_dim * sizeof(float), qkv->nb[1], + n_heads * head_dim * sizeof(float)); + ggml_tensor * value = ggml_view_3d(ctx0, qkv, head_dim, n_heads, n_tokens, head_dim * sizeof(float), qkv->nb[1], + n_heads * head_dim * 2 * sizeof(float)); + + // Process beta and alpha parameters (corrected dimensions) + ggml_tensor * beta_alpha = build_lora_mm(model.layers[il].ssm_beta_alpha, ba); + ggml_tensor * beta = + ggml_view_3d(ctx0, beta_alpha, n_heads, n_tokens, n_seqs, n_heads * sizeof(float), beta_alpha->nb[1], 0); + ggml_tensor * alpha = ggml_view_3d(ctx0, beta_alpha, n_heads, n_tokens, n_seqs, n_heads * sizeof(float), + beta_alpha->nb[1], n_heads * sizeof(float)); + + // Apply sigmoid to beta (exactly like reference: beta = b.sigmoid()) + beta = ggml_sigmoid(ctx0, beta); + + ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); // a + dt_bias + ggml_tensor * alpha_exp = ggml_exp(ctx0, alpha_biased); // exp(a + dt_bias) + ggml_tensor * one_tensor = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); // Create scalar tensor + one_tensor = ggml_exp(ctx0, one_tensor); // e^0 = 1 + ggml_tensor * one_plus_exp = ggml_add1(ctx0, alpha_exp, one_tensor); // 1 + exp(a + dt_bias) + ggml_tensor * alpha_softplus = ggml_log(ctx0, one_plus_exp); // log(1 + exp(...)) + ggml_tensor * A_log_exp = ggml_exp(ctx0, model.layers[il].ssm_a); // A_log.exp() + ggml_tensor * gate_scaled = ggml_mul(ctx0, alpha_softplus, A_log_exp); // A_log.exp() * softplus + ggml_tensor * gate = ggml_neg(ctx0, gate_scaled); // - (A_log.exp() * softplus) + + // Get convolution weights and bias + ggml_tensor * conv_weight = model.layers[il].ssm_conv1d; + ggml_tensor * conv_bias = nullptr; // Add if your model has conv bias + + // Get recurrent states (conv_states not needed as it's handled internally by ggml_delta_net) + ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); + + // Reshape tensors to match ggml_delta_net expectations + // [S, H, n_tokens, n_seqs] format + query = ggml_reshape_4d(ctx0, query, head_dim, n_heads, n_tokens, n_seqs); + key = ggml_reshape_4d(ctx0, key, head_dim, n_heads, n_tokens, n_seqs); + value = ggml_reshape_4d(ctx0, value, head_dim, n_heads, n_tokens, n_seqs); + + // Beta tensor + beta = ggml_reshape_3d(ctx0, beta, n_heads, n_tokens, n_seqs); + + // Get current state slice + ggml_tensor * state = ggml_view_4d(ctx0, ssm_states_all, head_dim, head_dim, n_heads, n_seqs, + ssm_states_all->nb[0], ssm_states_all->nb[1], ssm_states_all->nb[2], + kv_head * head_dim * head_dim * n_heads * ggml_element_size(ssm_states_all)); + state = ggml_cont(ctx0, state); + gate = ggml_repeat(ctx0, gate, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1, n_heads, n_tokens, n_seqs)); + + // Call the new ggml_delta_net function + ggml_tensor * output = ggml_delta_net(ctx0, + key, // k tensor + value, // v tensor + query, // q tensor + gate, // g tensor + conv_weight, // conv_weight tensor + conv_bias, // conv_bias tensor (can be nullptr) + beta, // beta tensor + state, // state tensor + 64, // chunk_size (adjust as needed) + true, // use_qk_l2norm + 1.0f // scale (adjust based on your model) + ); + cb(output, "delta_net_output", il); + + // Extract the output part (first half of the concatenated result) + ggml_tensor * attn_out = ggml_view_4d(ctx0, output, head_dim, n_heads, n_tokens, n_seqs, output->nb[0], + output->nb[1], output->nb[2], 0); + + // Extract the new state (second half of the concatenated 
result) + ggml_tensor * new_state = + ggml_view_4d(ctx0, output, head_dim, head_dim, n_heads, n_seqs, output->nb[0], output->nb[1], output->nb[2], + n_tokens * head_dim * n_heads * sizeof(float)); + + // Update the recurrent states + ggml_build_forward_expand( + gf, ggml_cpy(ctx0, new_state, + ggml_view_1d( + ctx0, ssm_states_all, head_dim * head_dim * n_heads * n_seqs, + kv_head * n_seqs * head_dim * head_dim * n_heads * ggml_element_size(ssm_states_all)))); + + // Apply normalization and gating + attn_out = build_norm(attn_out, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il); + + // Output projection + cur = build_lora_mm(model.layers[il].wo, attn_out); + cb(cur, "linear_attn_out", il); + + // Reshape back to original dimensions + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + + return cur; + } + ggml_tensor * build_layer_ffn(ggml_tensor * cur, const llama_model & model, const int il) { + // Check if this is an MoE layer + if (model.layers[il].ffn_gate_inp != nullptr) { + // MoE branch + ggml_tensor * moe_out = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(moe_out, "ffn_moe_out", il); + + // Add shared experts if present + if (model.layers[il].ffn_up_shexp != nullptr) { + ggml_tensor * ffn_shexp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } else { + cur = moe_out; + } + } else { + // Dense FFN branch + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + // Residual connection + cur = ggml_add(ctx0, cur, cur); // This should be the residual from before FFN + cb(cur, "ffn_residual", il); + + return cur; + } +}; + + llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { llama_memory_i * res; @@ -19349,6 +19767,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique>(*this, params); } } break; + case LLM_ARCH_QWEN3NEXT: + { + llm = std::make_unique(*this, params); + } break; default: GGML_ABORT("fatal error"); } @@ -19524,6 +19946,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN3: case LLM_ARCH_QWEN3MOE: + case LLM_ARCH_QWEN3NEXT: case LLM_ARCH_LLADA_MOE: case LLM_ARCH_OLMO2: case LLM_ARCH_OLMOE: diff --git a/src/llama-model.h b/src/llama-model.h index 10b1767f27..62ff8348f3 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -104,6 +104,7 @@ enum llm_type { LLM_TYPE_A13B, LLM_TYPE_21B_A3B, // Ernie MoE small LLM_TYPE_30B_A3B, + LLM_TYPE_80B_A3B, // Qwen3 Next LLM_TYPE_106B_A12B, // GLM-4.5-Air LLM_TYPE_235B_A22B, LLM_TYPE_300B_A47B, // Ernie MoE big @@ -292,6 +293,9 @@ struct llama_layer { struct ggml_tensor * ssm_conv1d_b = nullptr; struct ggml_tensor * ssm_dt_b = nullptr; + // qwen3next + struct ggml_tensor * ssm_beta_alpha = nullptr; + // rwkv struct ggml_tensor * time_mix_w1 = nullptr; struct ggml_tensor * time_mix_w2 = nullptr;
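
Note (not part of the patch): below is a minimal PyTorch sketch of the per-head recurrence that ggml_compute_forward_delta_net evaluates token by token — decay the S×S state by exp(g), read the memory for k_t, apply the beta-gated delta-rule correction as a rank-1 update, then query with q_t. It is only an illustration of the math under the per-head shapes used in the kernel (q, k, v: [n_tokens, S]; g, beta: [n_tokens]; state: [S, S], single head and single sequence); the helper name recurrent_gated_delta_rule_ref is hypothetical and the convolution preprocessing and L2 normalization handled in ggml_delta_net are omitted.

import torch

def recurrent_gated_delta_rule_ref(q, k, v, g, beta, state, scale):
    # q, k, v: [T, S]; g, beta: [T]; state: [S, S] (one head, one sequence)
    T, S = q.shape
    out = torch.empty(T, S)
    q = q * scale                                         # the kernel scales the query up front
    for t in range(T):
        state = state * torch.exp(g[t])                   # gated decay of the associative memory
        kv_mem = state @ k[t]                             # what the memory currently returns for k_t
        delta = (v[t] - kv_mem) * torch.sigmoid(beta[t])  # delta-rule correction, gated by beta
        state = state + torch.outer(k[t], delta)          # write the correction back (rank-1 update)
        out[t] = state @ q[t]                             # read out with the scaled query
    return out, state                                     # the kernel stores the final state after the T output rows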