diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp
index 99962af111..99aead5328 100644
--- a/src/models/ernie4-5.cpp
+++ b/src/models/ernie4-5.cpp
@@ -1,7 +1,5 @@
 #include "models.h"
 
-
-
 llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -19,6 +17,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
 
     auto * inp_attn = build_attn_inp_kv();
 
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
@@ -67,9 +67,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
         }
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            ggml_tensor * inp_out_ids = build_inp_out_ids();
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
         cb(ffn_inp, "ffn_inp", il);
diff --git a/src/models/openai-moe-iswa.cpp b/src/models/openai-moe-iswa.cpp
index 3c0c0eecf5..96596709ee 100644
--- a/src/models/openai-moe-iswa.cpp
+++ b/src/models/openai-moe-iswa.cpp
@@ -11,6 +11,8 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
 
     auto * inp_attn = build_attn_inp_kv_iswa();
 
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
@@ -69,7 +71,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
         }
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
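
Both hunks only hoist the `build_inp_out_ids()` call out of the layer loop; the pruning itself is unchanged, since `ggml_get_rows` on the last layer still keeps just the columns of the hidden state listed in `inp_out_ids` (the tokens whose outputs are actually needed). Below is a minimal standalone sketch of that selection step, not taken from the patch: the file name, sizes, and values are made up, and it assumes a recent ggml tree where the CPU helper `ggml_graph_compute_with_ctx` is declared in `ggml-cpu.h` (older trees declare it in `ggml.h`).

```cpp
// get_rows_demo.cpp (hypothetical) -- toy illustration of the inp_out_ids selection
#include "ggml.h"
#include "ggml-cpu.h"   // assumption: recent ggml split; older trees have this in ggml.h

#include <cstdio>

int main() {
    // small scratch context; with no_alloc = false, tensor data lives inside it
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(ip);

    const int n_embd = 4, n_tokens = 3;

    // toy "hidden states": one column of n_embd floats per token
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
    for (int t = 0; t < n_tokens; ++t) {
        for (int i = 0; i < n_embd; ++i) {
            ((float *) cur->data)[t*n_embd + i] = (float) (10*t + i);
        }
    }

    // indices of the tokens whose outputs are needed; here only the last one
    struct ggml_tensor * inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 1);
    ((int32_t *) inp_out_ids->data)[0] = n_tokens - 1;

    // same op the builders apply on the final layer: keep only the selected columns
    struct ggml_tensor * out = ggml_get_rows(ctx0, cur, inp_out_ids);

    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx0, gf, /*n_threads =*/ 1);

    // prints 20 21 22 23 -- the embedding of the last token only
    for (int i = 0; i < n_embd; ++i) {
        printf("%g ", ((float *) out->data)[i]);
    }
    printf("\n");

    ggml_free(ctx0);
    return 0;
}
```

In both files the `build_inp_out_ids()` call now happens once before the loop instead of inside the last-layer branch; the selection it feeds is the same either way.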