mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-10 10:27:03 +00:00)
cont : migrate the rest of the inputs out of llama_context
ggml-ci
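In short: the per-context input structs (`inp`) and the monolithic `input_set()` overrides are removed; the `build_inp_*` builders now receive the `llama_graph_result * res` produced by the graph build and become `const`, so the inputs are owned by the graph result rather than by a mutating `llama_context`. Below is a minimal sketch of the ownership pattern this enables, with simplified stand-in types; only the `set_input()` idea comes from the removed TODO ("replace by llama_graph_input_i->set_input()"), the rest is hypothetical glue for illustration:

#include <memory>
#include <vector>

struct ggml_tensor_stub {};              // stand-in for ggml_tensor
struct ubatch_stub { const int * pos; }; // stand-in for llama_ubatch

// stand-in for llama_graph_input_i: each input fills its own tensor
struct graph_input_i {
    virtual ~graph_input_i() = default;
    virtual void set_input(const ubatch_stub * ubatch) = 0;
};

struct graph_input_pos : graph_input_i {
    ggml_tensor_stub * pos = nullptr; // I32 [n_batch]
    void set_input(const ubatch_stub * ubatch) override {
        (void) ubatch; // here: copy ubatch->pos into the pos tensor
    }
};

// stand-in for llama_graph_result: owns the inputs created during build
struct graph_result {
    std::vector<std::unique_ptr<graph_input_i>> inputs;
};

// the builder can be const: it writes into res, not into the context
struct graph_builder {
    ggml_tensor_stub * build_inp_pos(graph_result * res, int n_tokens) const {
        (void) n_tokens; // would size the tensor, e.g. via ggml_new_tensor_1d(...)
        auto inp = std::make_unique<graph_input_pos>();
        inp->pos = new ggml_tensor_stub();
        ggml_tensor_stub * cur = inp->pos;
        res->inputs.push_back(std::move(inp));
        return cur;
    }
};

Setting the inputs for a micro-batch then becomes a loop over the inputs registered on the result (see the sketch after the diff) instead of one context-wide `input_set()`.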
[File diff suppressed because it is too large]
@@ -248,24 +248,6 @@ protected:
 
     virtual int64_t n_pos_per_token() const; // vision
 
-    // when the compute graph is built, it creates the input tensors that it needs
-    // the contents of the input tensors are set by the input_set() function
-
-    // TODO: remove, replace by llama_graph_input_i->set_input()
-    virtual void input_set(const llama_ubatch & ubatch);
-
-private:
-    // TODO: remove, implement as llama_graph_input_xxx
-    struct {
-        // base input tensors
-        ggml_tensor * pos;        // I32 [n_batch]
-        ggml_tensor * pos_bucket; // I32 [n_batch, n_batch]
-        ggml_tensor * out_ids;    // I32 [n_outputs]
-        ggml_tensor * mean;       // F32 [n_batch, n_batch]
-        ggml_tensor * cls;        // I32 [n_batch]
-    } inp;
-
-protected:
     //
     // output
     //
@@ -309,35 +291,35 @@ public:
             ggml_tensor * cur,
             const char * name,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
     // apply control vector for layer il
     ggml_tensor * build_cvec(
             ggml_context * ctx0,
             ggml_tensor * cur,
-            int il) override;
+            int il) const override;
 
     // do mat_mul, while optionally apply lora
     ggml_tensor * build_lora_mm(
             ggml_context * ctx0,
             ggml_tensor * w,
-            ggml_tensor * cur) override;
+            ggml_tensor * cur) const override;
 
     // do mat_mul_id, while optionally apply lora
     ggml_tensor * build_lora_mm_id(
             ggml_context * ctx0,
             ggml_tensor * w,   // struct ggml_tensor * as
             ggml_tensor * cur, // struct ggml_tensor * b
-            ggml_tensor * ids) override;
+            ggml_tensor * ids) const override;
 
-    ggml_tensor * build_rope_factors(int il) override;
+    ggml_tensor * build_rope_factors(int il) const override;
 
     ggml_tensor * build_rope_shift(
             ggml_context * ctx0,
             ggml_tensor * cur,
             ggml_tensor * shift,
             ggml_tensor * factors,
-            ggml_backend_buffer * bbuf) override;
+            ggml_backend_buffer * bbuf) const override;
 
     ggml_tensor * build_inp_embd(
             llama_graph_result * res,
@@ -346,23 +328,28 @@ public:
             const llama_ubatch & ubatch) const override;
 
     ggml_tensor * build_inp_pos(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) override;
+            int32_t n_tokens) const override;
 
     ggml_tensor * build_inp_pos_bucket(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) override;
+            int32_t n_tokens) const override;
 
     ggml_tensor * build_inp_out_ids(
-            ggml_context * ctx0) override;
+            llama_graph_result * res,
+            ggml_context * ctx0) const override;
 
     ggml_tensor * build_inp_mean(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) override;
+            int32_t n_tokens) const override;
 
     ggml_tensor * build_inp_cls(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) override;
+            int32_t n_tokens) const override;
 
     llama_graph_input_attn_ptr build_attn_inp(
             llama_graph_result * res,
@@ -394,18 +381,6 @@ protected:
             bool v_trans,
             float kq_scale) const;
 
-    virtual ggml_tensor * build_inp_self_k_shift(
-            ggml_context * ctx0);
-
-    virtual void build_kv_self_shift(
-            ggml_context * ctx0,
-            ggml_cgraph * gf);
-
-    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-    virtual void build_kv_self_defrag(
-            ggml_context * ctx0,
-            ggml_cgraph * gf);
-
 public:
     //
     // perf
@@ -552,19 +527,6 @@ public:
     int encode(llama_batch & inp_batch) override;
     int decode(llama_batch & inp_batch) override;
 
-protected:
-    //
-    // input
-    //
-
-    void input_set(const llama_ubatch & ubatch) override;
-
-private:
-    struct {
-        ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch]
-        ggml_tensor * self_k_shift;    // I32 [kv_size]
-    } inp;
-
 protected:
     //
     // graph
@@ -578,8 +540,9 @@ public:
     //
 
     ggml_tensor * build_inp_pos_bucket(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) override;
+            int32_t n_tokens) const override;
 
     llama_graph_input_attn_ptr build_attn_inp(
             llama_graph_result * res,
@@ -600,16 +563,14 @@ public:
             int il) const override;
 
 protected:
-    ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override;
-
-    void build_kv_self_shift(
+    llama_graph_result_ptr graph_build_kv_self_shift(
             ggml_context * ctx0,
-            ggml_cgraph * gf) override;
+            ggml_cgraph * gf) const;
 
     // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-    void build_kv_self_defrag(
+    llama_graph_result_ptr graph_build_kv_self_defrag(
             ggml_context * ctx0,
-            ggml_cgraph * gf) override;
+            ggml_cgraph * gf) const;
 
     //
     // state save/load
@@ -651,19 +612,6 @@ public:
     int encode(llama_batch & inp_batch) override;
     int decode(llama_batch & inp_batch) override;
 
-protected:
-    //
-    // input
-    //
-
-    void input_set(const llama_ubatch & ubatch) override;
-
-private:
-    struct {
-        ggml_tensor * s_copy; // I32 [kv_size]
-        ggml_tensor * s_mask; // F32 [1, n_kv]
-    } inp;
-
 protected:
     //
     // graph
@@ -677,10 +625,12 @@ public:
     //
 
     ggml_tensor * build_inp_s_copy(
-            ggml_context * ctx0) override;
+            llama_graph_result * res,
+            ggml_context * ctx0) const override;
 
     ggml_tensor * build_inp_s_mask(
-            ggml_context * ctx0) override;
+            llama_graph_result * res,
+            ggml_context * ctx0) const override;
 
     ggml_tensor * build_copy_mask_state(
             ggml_context * ctx0,
@@ -689,7 +639,7 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             int32_t n_state,
-            int32_t n_seqs) override;
+            int32_t n_seqs) const override;
 
     ggml_tensor * build_mamba_layer(
             ggml_context * ctx0,
@@ -698,7 +648,7 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
     ggml_tensor * build_rwkv_token_shift_load(
             ggml_context * ctx0,
@@ -706,13 +656,13 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
     ggml_tensor * build_rwkv_token_shift_store(
             ggml_context * ctx0,
             ggml_tensor * token_shift,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
     ggml_tensor * build_rwkv6_time_mix(
             ggml_context * ctx0,
@@ -722,7 +672,7 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
 protected:
     //
@@ -774,18 +724,6 @@ public:
 protected:
     void reserve() override;
 
-    //
-    // input
-    //
-
-    void input_set(const llama_ubatch & ubatch) override;
-
-private:
-    struct {
-        ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc]
-    } inp;
-
-protected:
     //
     // graph
     //
@@ -793,7 +731,8 @@ protected:
     ggml_cgraph * graph_init() override;
 
     ggml_tensor * build_inp_cross_embd(
-            ggml_context * ctx0) override;
+            llama_graph_result * res,
+            ggml_context * ctx0) const override;
 
     llama_graph_input_attn_ptr build_attn_inp(
             llama_graph_result * res,

@@ -68,25 +68,19 @@ ggml_tensor * llama_graph_i::build_attn_cross(
 }
 
 ggml_tensor * llama_graph_i::build_inp_cross_embd(
-        ggml_context * ctx0) {
+        llama_graph_result * res,
+        ggml_context * ctx0) const {
+    GGML_UNUSED(res);
     GGML_UNUSED(ctx0);
 
     LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
     return nullptr;
 }
 
-ggml_tensor * llama_graph_i::build_inp_cross_kq_mask(
-        ggml_context * ctx0,
-        int32_t n_tokens) {
-    GGML_UNUSED(ctx0);
-    GGML_UNUSED(n_tokens);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-    return nullptr;
-}
-
 ggml_tensor * llama_graph_i::build_inp_s_copy (
-        ggml_context * ctx0) {
+        llama_graph_result * res,
+        ggml_context * ctx0) const {
+    GGML_UNUSED(res);
     GGML_UNUSED(ctx0);
 
     LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -95,7 +89,9 @@ ggml_tensor * llama_graph_i::build_inp_s_copy (
 }
 
 ggml_tensor * llama_graph_i::build_inp_s_mask(
-        ggml_context * ctx0) {
+        llama_graph_result * res,
+        ggml_context * ctx0) const {
+    GGML_UNUSED(res);
     GGML_UNUSED(ctx0);
 
     LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -110,7 +106,7 @@ ggml_tensor * llama_graph_i::build_copy_mask_state(
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         int32_t n_state,
-        int32_t n_seqs) {
+        int32_t n_seqs) const {
     GGML_UNUSED(ctx0);
     GGML_UNUSED(gf);
     GGML_UNUSED(s);
@@ -131,7 +127,7 @@ ggml_tensor * llama_graph_i::build_mamba_layer(
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     GGML_UNUSED(ctx0);
     GGML_UNUSED(gf);
     GGML_UNUSED(cur);
@@ -151,7 +147,7 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_load(
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     GGML_UNUSED(ctx0);
     GGML_UNUSED(gf);
     GGML_UNUSED(state_copy);
@@ -168,7 +164,7 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_store(
         ggml_context * ctx0,
         ggml_tensor * token_shift,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     GGML_UNUSED(ctx0);
     GGML_UNUSED(token_shift);
     GGML_UNUSED(ubatch);
@@ -187,7 +183,7 @@ ggml_tensor * llama_graph_i::build_rwkv6_time_mix(
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     GGML_UNUSED(ctx0);
     GGML_UNUSED(gf);
     GGML_UNUSED(cur);

@@ -93,6 +93,7 @@ public:
 //
 
 // TODO: can become more granular in the future
+// TODO: move all methods that do not require things from llama_context to llm_build_context
 class llama_graph_i {
 public:
     llama_graph_i(llama_graph_type type);
@@ -109,28 +110,28 @@ public:
             ggml_tensor * cur,
             const char * name,
             const llama_ubatch & ubatch,
-            int il) = 0;
+            int il) const = 0;
 
     // apply control vector for layer il
     virtual ggml_tensor * build_cvec(
             ggml_context * ctx0,
             ggml_tensor * cur,
-            int il) = 0;
+            int il) const = 0;
 
     // do mat_mul, while optionally apply lora
     virtual ggml_tensor * build_lora_mm(
             ggml_context * ctx0,
             ggml_tensor * w,
-            ggml_tensor * cur) = 0;
+            ggml_tensor * cur) const = 0;
 
     // do mat_mul_id, while optionally apply lora
     virtual ggml_tensor * build_lora_mm_id(
             ggml_context * ctx0,
             ggml_tensor * w,   // struct ggml_tensor * as
             ggml_tensor * cur, // struct ggml_tensor * b
-            ggml_tensor * ids) = 0;
+            ggml_tensor * ids) const = 0;
 
-    virtual ggml_tensor * build_rope_factors(int il) = 0;
+    virtual ggml_tensor * build_rope_factors(int il) const = 0;
 
     // note: optionally set the backend to be the same as the bbuf's backend
     virtual ggml_tensor * build_rope_shift(
@@ -138,7 +139,7 @@ public:
             ggml_tensor * cur,
             ggml_tensor * shift,
             ggml_tensor * factors,
-            ggml_backend_buffer * bbuf) = 0;
+            ggml_backend_buffer * bbuf) const = 0;
 
     // graph build API (context-specific)
 
@@ -146,26 +147,31 @@ public:
             llama_graph_result * res,
             ggml_context * ctx0,
             ggml_tensor * tok_embd,
-            const llama_ubatch & ubatch) const = 0; // note these methods will become const, i.e. they don't mutate the llama_context that implements them
+            const llama_ubatch & ubatch) const = 0;
 
     virtual ggml_tensor * build_inp_pos(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) = 0;
+            int32_t n_tokens) const = 0;
 
     virtual ggml_tensor * build_inp_pos_bucket(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) = 0;
+            int32_t n_tokens) const = 0;
 
     virtual ggml_tensor * build_inp_out_ids(
-            ggml_context * ctx0) = 0;
+            llama_graph_result * res,
+            ggml_context * ctx0) const = 0;
 
     virtual ggml_tensor * build_inp_mean(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) = 0;
+            int32_t n_tokens) const = 0;
 
     virtual ggml_tensor * build_inp_cls(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) = 0;
+            int32_t n_tokens) const = 0;
 
     virtual llama_graph_input_attn_ptr build_attn_inp(
             llama_graph_result * res,
@@ -197,17 +203,16 @@ public:
             int il) const;
 
     virtual ggml_tensor * build_inp_cross_embd(
-            ggml_context * ctx0);
-
-    virtual ggml_tensor * build_inp_cross_kq_mask(
-            ggml_context * ctx0,
-            int32_t n_tokens);
+            llama_graph_result * res,
+            ggml_context * ctx0) const;
 
     virtual ggml_tensor * build_inp_s_copy(
-            ggml_context * ctx0);
+            llama_graph_result * res,
+            ggml_context * ctx0) const;
 
     virtual ggml_tensor * build_inp_s_mask(
-            ggml_context * ctx0);
+            llama_graph_result * res,
+            ggml_context * ctx0) const;
 
     virtual ggml_tensor * build_copy_mask_state(
             ggml_context * ctx0,
@@ -216,7 +221,7 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             int32_t n_state,
-            int32_t n_seqs);
+            int32_t n_seqs) const;
 
     virtual ggml_tensor * build_mamba_layer(
             ggml_context * ctx0,
@@ -225,7 +230,7 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il);
+            int il) const;
 
     virtual ggml_tensor * build_rwkv_token_shift_load(
             ggml_context * ctx0,
@@ -233,13 +238,13 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il);
+            int il) const;
 
     virtual ggml_tensor * build_rwkv_token_shift_store(
             ggml_context * ctx0,
             ggml_tensor * token_shift,
             const llama_ubatch & ubatch,
-            int il);
+            int il) const;
 
     virtual ggml_tensor * build_rwkv6_time_mix(
             ggml_context * ctx0,
@@ -249,5 +254,5 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il);
+            int il) const;
 };

@@ -3910,7 +3910,7 @@ struct llm_build_context {
 
     // TODO: tmp
     struct ggml_tensor * build_inp_pos() {
-        ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens);
         cb(cur, "inp_pos", -1);
 
         return cur;
@@ -3918,7 +3918,7 @@ struct llm_build_context {
 
     // TODO: tmp
     struct ggml_tensor * build_inp_out_ids() {
-        ggml_tensor * cur = lgf->build_inp_out_ids(ctx0);
+        ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0);
         cb(cur, "inp_out_ids", -1);
 
         return cur;
@@ -3926,7 +3926,7 @@ struct llm_build_context {
 
     // TODO: tmp
    struct ggml_tensor * build_inp_mean() {
-        ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens);
         cb(cur, "inp_mean", -1);
 
         return cur;
@@ -3934,7 +3934,7 @@ struct llm_build_context {
 
     // TODO: tmp
     struct ggml_tensor * build_inp_cls() {
-        ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens);
         cb(cur, "inp_cls", -1);
 
         return cur;
@@ -3957,7 +3957,7 @@ struct llm_build_context {
 
     // TODO: tmp
     struct ggml_tensor * build_pos_bucket() {
-        ggml_tensor * cur = lgf->build_inp_pos_bucket(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens);
         cb(cur, "pos_bucket", -1);
 
         return cur;
@@ -3965,20 +3965,12 @@ struct llm_build_context {
 
     // TODO: tmp
     struct ggml_tensor * build_inp_cross_embd() {
-        ggml_tensor * cur = lgf->build_inp_cross_embd(ctx0);
+        ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0);
         cb(cur, "embd_enc", -1);
 
         return cur;
     }
 
-    // TODO: tmp
-    struct ggml_tensor * build_inp_cross_kq_mask() {
-        ggml_tensor * cur = lgf->build_inp_cross_kq_mask(ctx0, n_tokens);
-        cb(cur, "KQ_mask_cross", -1);
-
-        return cur;
-    }
-
     struct ggml_tensor * build_norm(
             struct ggml_tensor * cur,
             struct ggml_tensor * mw,
@@ -3987,7 +3979,7 @@ struct llm_build_context {
             int il) {
         switch (type) {
             case LLM_NORM:       cur = ggml_norm     (ctx0, cur, hparams.f_norm_eps);     break;
-            case LLM_NORM_RMS:   cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break;
+            case LLM_NORM_RMS:   cur = ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps); break;
             case LLM_NORM_GROUP:
                 {
                     cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]);
@@ -8070,8 +8062,8 @@ struct llm_build_context {
         // {n_embd, n_tokens}
         inpL = build_inp_embd(model.tok_embd);
 
-        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
-        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
+        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
+        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);
 
         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -10443,8 +10435,8 @@ struct llm_build_context {
         inpL = build_inp_embd(model.tok_embd);
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
 
-        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
-        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
+        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
+        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -10535,8 +10527,8 @@ struct llm_build_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
-        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
+        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
+        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
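With the `inp` structs and `input_set()` overrides gone, filling the input tensors for a micro-batch presumably reduces to iterating the inputs collected on the graph result. A hedged sketch, reusing the stand-in types from the sketch above (the driver name and `inputs` container are assumptions; only `set_input()` is suggested by the TODO comments in the diff):

// hypothetical driver: fill every input registered during graph build,
// standing in for the removed llama_context::input_set(ubatch) overrides
void set_graph_inputs(graph_result * res, const ubatch_stub & ubatch) {
    for (auto & inp : res->inputs) {
        inp->set_input(&ubatch); // each input fills its own tensor(s)
    }
}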