From 03568c93587c971bf87b3638017a9de81357cd26 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Thu, 3 Jul 2025 15:10:18 +0400
Subject: [PATCH] fix

---
 src/llama-model.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 4f8333cb69..fb1850b490 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -14674,8 +14674,6 @@ struct llm_build_falcon_h1 : public llm_graph_context {
         // Build the inputs in the recurrent & kv cache
         auto * inp = build_inp_mem_hybrid();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
-
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
         for (int il = 0; il < n_layer; ++il) {
@@ -14718,7 +14716,7 @@ struct llm_build_falcon_h1 : public llm_graph_context {
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            ggml_tensor * attn_out = build_attn(inp_attn, gf,
+            ggml_tensor * attn_out = build_attn(inp, gf,
                     model.layers[il].wo, NULL,
                     Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
            attn_out = ggml_scale(ctx0, attn_out, hparams.attention_out_multiplier);
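
Context for the change above: Falcon-H1 is a hybrid (recurrent + attention)
architecture, so the graph builder already obtains a combined memory input via
build_inp_mem_hybrid(). The extra build_attn_inp_kv_unified() call created a
second, unified-KV attention input alongside it; the patch drops that call and
passes the hybrid input `inp` to build_attn() directly, so both branches of a
layer read from the same hybrid cache. The sketch below is a minimal standalone
illustration of that sharing pattern; the types and functions (hybrid_input,
build_attn_sketch, build_recurrent_sketch) are hypothetical stand-ins, not
llama.cpp's real API.

    #include <cstdio>

    // Stand-in for the object returned by build_inp_mem_hybrid(): a single
    // input that carries both the attention view and the recurrent-state
    // view of the hybrid cache. Hypothetical type, for illustration only.
    struct hybrid_input {
        int attn_view = 1; // attention (KV) view of the hybrid memory
        int recr_view = 2; // recurrent-state view of the hybrid memory
    };

    // Stand-in for build_attn(): the attention branch reads its view from
    // the shared hybrid input, not from a separate unified-KV input.
    static void build_attn_sketch(const hybrid_input * inp) {
        std::printf("attention branch uses view %d\n", inp->attn_view);
    }

    // Stand-in for the recurrent (SSM) branch of the same layer.
    static void build_recurrent_sketch(const hybrid_input * inp) {
        std::printf("recurrent branch uses view %d\n", inp->recr_view);
    }

    int main() {
        hybrid_input storage;
        hybrid_input * inp = &storage; // analogue of: auto * inp = build_inp_mem_hybrid();

        // Both branches share the one hybrid input, mirroring the patch:
        // build_attn(inp, ...) instead of build_attn(inp_attn, ...).
        build_attn_sketch(inp);
        build_recurrent_sketch(inp);
        return 0;
    }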