graph : remove the build_kv_... API from llama_graph_i

ggml-ci
2025-11-08 10:07:01 +00:00 · 2025-02-23 19:39:22 +02:00
parent 372fa3a894
commit 6378112cb5
4 changed files with 50 additions and 43 deletions
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1842,6 +1842,25 @@ ggml_tensor * llama_context::build_attn(
    return cur;
 }
 // Stub: this implementation adds nothing to the graph. It only logs an error
 // saying the function is not implemented; both arguments are ignored.
 void llama_context::build_kv_self_shift(
        ggml_context * ctx0,
        ggml_cgraph * gf) {
    // report the missing implementation under the current function's name
    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
    // neither argument is consumed here
    GGML_UNUSED(gf);
    GGML_UNUSED(ctx0);
 }
 // Stub: this implementation adds nothing to the graph. It only logs an error
 // saying the function is not implemented; both arguments are ignored.
 void llama_context::build_kv_self_defrag(
        ggml_context * ctx0,
        ggml_cgraph * gf) {
    // report the missing implementation under the current function's name
    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
    // neither argument is consumed here
    GGML_UNUSED(gf);
    GGML_UNUSED(ctx0);
 }
 //
 // perf
 //
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -171,7 +171,7 @@ protected:
    // graph
    //
-    // zero-out inputs and create the ctx_context for the compute graph
+    // zero-out inputs and create the ctx_compute for the compute graph
    virtual ggml_cgraph * graph_init();
    // TODO: add encode/decode graphs
@@ -187,73 +187,74 @@ protected:
    ggml_context_ptr ctx_compute;
 public:
    //
-    // graph build API (generic)
+    // graph build
    //
    virtual void build_cb(
             ggml_tensor * cur,
              const char * name,
      const llama_ubatch & ubatch,
-                     int   il);
+                     int   il) override;
    // apply control vector for layer il
    virtual ggml_tensor * build_cvec(
            ggml_context * ctx0,
             ggml_tensor * cur,
-                     int   il);
+                     int   il) override;
    // do mat_mul, while optionally apply lora
    virtual ggml_tensor * build_lora_mm(
            ggml_context * ctx0,
             ggml_tensor * w,
-             ggml_tensor * cur);
+             ggml_tensor * cur) override;
    // do mat_mul_id, while optionally apply lora
    virtual ggml_tensor * build_lora_mm_id(
            ggml_context * ctx0,
             ggml_tensor * w,   // struct ggml_tensor * as
             ggml_tensor * cur, // struct ggml_tensor * b
-             ggml_tensor * ids);
+             ggml_tensor * ids) override;
-    virtual ggml_tensor * build_rope_factors(int il);
+    virtual ggml_tensor * build_rope_factors(int il) override;
    virtual ggml_tensor * build_rope_shift(
            ggml_context * ctx0,
             ggml_tensor * cur,
             ggml_tensor * shift,
             ggml_tensor * factors,
-             ggml_backend_buffer * bbuf);
+             ggml_backend_buffer * bbuf) override;
    virtual ggml_tensor * build_inp_embd(
            ggml_context * ctx0,
             ggml_tensor * tok_embd,
-      const llama_ubatch & ubatch);
+      const llama_ubatch & ubatch) override;
    virtual ggml_tensor * build_inp_pos(
            ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
    virtual ggml_tensor * build_inp_pos_bucket(
            ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
    virtual ggml_tensor * build_inp_out_ids(
-            ggml_context * ctx0);
+            ggml_context * ctx0) override;
    virtual ggml_tensor * build_inp_mean(
            ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
    virtual ggml_tensor * build_inp_cls(
            ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
    virtual void build_attn_inp(
            ggml_context * ctx0,
                 int32_t   n_tokens,
                    bool   causal,
-                    bool   swa);
+                    bool   swa) override;
    virtual ggml_tensor * build_attn(
            ggml_context * ctx0,
@@ -266,7 +267,17 @@ protected:
             ggml_tensor * kq_b,
                 int32_t   n_tokens,
                 float     kq_scale,
-                 int       il);
+                 int       il) override;
 protected:
    // takes the compute context and the graph being built
    // the llama_context definition is a stub that only logs "not implemented"
    virtual void build_kv_self_shift(
            ggml_context * ctx0,
            ggml_cgraph * gf);
    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
    // the llama_context definition is a stub that only logs "not implemented"
    virtual void build_kv_self_defrag(
            ggml_context * ctx0,
            ggml_cgraph * gf);
 public:
    //
@@ -434,6 +445,7 @@ protected:
    virtual ggml_cgraph * graph_init() override;
 public:
    //
    // graph build
    //
@@ -463,6 +475,7 @@ protected:
                 float     kq_scale,
                 int       il) override;
 protected:
    // NOTE(review): presumably overrides llama_context::build_kv_self_shift, whose
    // definition only logs "not implemented" - confirm against the enclosing class
    virtual void build_kv_self_shift(
            ggml_context * ctx0,
            ggml_cgraph * gf) override;
@@ -548,6 +561,7 @@ protected:
    virtual ggml_cgraph * graph_init() override;
 public:
    //
    // graph build
    //
@@ -600,6 +614,7 @@ protected:
      const llama_ubatch & ubatch,
                     int   il) override;
 protected:
    //
    // state save/load
    //
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -32,24 +32,6 @@ ggml_tensor * llama_graph_i::build_attn(
    return nullptr;
 }
 // Default stub: adds nothing to the graph. It only logs an error saying the
 // function is not implemented; both arguments are ignored.
 void llama_graph_i::build_kv_self_shift(
        ggml_context * ctx0,
        ggml_cgraph * gf) {
    // report the missing implementation under the current function's name
    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
    // neither argument is consumed here
    GGML_UNUSED(gf);
    GGML_UNUSED(ctx0);
 }
 // Default stub: adds nothing to the graph. It only logs an error saying the
 // function is not implemented; both arguments are ignored.
 void llama_graph_i::build_kv_self_defrag(
        ggml_context * ctx0,
        ggml_cgraph * gf) {
    // report the missing implementation under the current function's name
    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
    // neither argument is consumed here
    GGML_UNUSED(gf);
    GGML_UNUSED(ctx0);
 }
 ggml_tensor * llama_graph_i::build_inp_self_k_shift(
        ggml_context * ctx0) {
    GGML_UNUSED(ctx0);
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -117,15 +117,6 @@ public:
                 float     kq_scale,
                 int       il);
    // takes the compute context and the graph being built
    // the default llama_graph_i definition is a stub that only logs "not implemented"
    virtual void build_kv_self_shift(
            ggml_context * ctx0,
            ggml_cgraph * gf);
    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
    // the default llama_graph_i definition is a stub that only logs "not implemented"
    virtual void build_kv_self_defrag(
            ggml_context * ctx0,
            ggml_cgraph * gf);
    virtual ggml_tensor * build_inp_self_k_shift(
            ggml_context * ctx0);