diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 2dd828171f..84188d16cf 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -14676,6 +14676,8 @@ struct llm_build_falcon_h1 : public llm_graph_context {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -14740,8 +14742,6 @@ struct llm_build_falcon_h1 : public llm_graph_context {
             cb(cur, "layer_out", il);
 
             if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -14831,7 +14831,6 @@ struct llm_build_falcon_h1 : public llm_graph_context {
         ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
         cb(zxBCdt, "zxBCdt", il);
 
-        // check if the models has ssm_multipliers (MuP)
         if (hparams.ssm_has_mup) {
             struct ggml_tensor * mup_vec = model.layers[il].ssm_mup_vec;
@@ -14850,7 +14849,6 @@ struct llm_build_falcon_h1 : public llm_graph_context {
         // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
         ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
 
-        // copy last (d_conv - 1) columns back into the state cache
         ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_ssm + 2*n_group*d_state, n_seqs,
                 conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
@@ -14889,7 +14887,6 @@ struct llm_build_falcon_h1 : public llm_graph_context {
         // {n_head, n_seq_tokens, n_seqs}
         dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
 
-        ggml_tensor * A = model.layers[il].ssm_a;
 
         // use the states and the indices provided by build_rs