context : rename to llama_context_kv_self

Georgi Gerganov
2025-02-12 17:16:44 +02:00
parent 6ee86e5e0f
commit fbe6a07256
5 changed files with 102 additions and 98 deletions


@@ -82,6 +82,8 @@ struct llama_context : public llama_graph_i {
int32_t il_start,
int32_t il_end);
+ // graph build API (generic)
virtual void build_cb(
ggml_tensor * cur,
const char * name,
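
This first hunk moves the "graph build API (generic)" section header up, in front of build_cb (the declaration is cut off by the hunk, so only its first parameters are visible above). As a rough sketch of what such a callback typically does during graph construction, assuming only the tensor/name/layer arguments shown or implied here, it tags each tensor with a layer-qualified name so later passes can locate it:

#include "ggml.h"

// Sketch only: per-tensor naming in the style of a build_cb implementation.
// The function name is a placeholder, not part of the commit.
static void name_tensor_cb(ggml_tensor * cur, const char * name, int il) {
    if (il >= 0) {
        ggml_format_name(cur, "%s-%d", name, il); // e.g. "ffn_out-12"
    } else {
        ggml_set_name(cur, name);
    }
}
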
@@ -91,6 +93,27 @@ struct llama_context : public llama_graph_i {
// TODO: add encode/decode graphs
virtual ggml_cgraph * build_graph(const llama_ubatch & ubatch, bool worst_case);
+ // apply control vector for layer il
+ virtual ggml_tensor * build_cvec(
+ ggml_context * ctx0,
+ ggml_tensor * cur,
+ int il);
+ // do mat_mul, while optionally apply lora
+ virtual ggml_tensor * build_lora_mm(
+ ggml_context * ctx0,
+ ggml_tensor * w,
+ ggml_tensor * cur);
+ // do mat_mul_id, while optionally apply lora
+ virtual ggml_tensor * build_lora_mm_id(
+ ggml_context * ctx0,
+ ggml_tensor * w, // struct ggml_tensor * as
+ ggml_tensor * cur, // struct ggml_tensor * b
+ ggml_tensor * ids);
+ virtual ggml_tensor * build_rope_factors(int il);
// decode a batch of tokens by evaluating the transformer
// in case of unsuccessful decoding (error or warning),
// the kv_cache state will be returned to its original state
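
The block added above gives the base llama_context overridable implementations of the generic graph-build helpers instead of pure virtuals. As a sketch of the operation build_lora_mm describes ("mat_mul, while optionally apply lora"), the result is the base projection plus a scaled low-rank update; the free-standing function and its explicit adapter parameters below are illustrative stand-ins, not the actual member implementation:

#include "ggml.h"

// Sketch only: W @ cur plus the LoRA delta scale * (B @ (A @ cur)).
// lora_a/lora_b/scale stand in for whatever adapter lookup the context performs.
static ggml_tensor * lora_mm_sketch(
        ggml_context * ctx0,
        ggml_tensor  * w,       // base weight
        ggml_tensor  * cur,     // activations
        ggml_tensor  * lora_a,  // low-rank factor A (null if no adapter is active)
        ggml_tensor  * lora_b,  // low-rank factor B
        float          scale) { // adapter scale (alpha/rank times the user scale)
    ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);

    if (lora_a && lora_b) {
        ggml_tensor * ab = ggml_mul_mat(ctx0, lora_b, ggml_mul_mat(ctx0, lora_a, cur));
        res = ggml_add(ctx0, res, ggml_scale(ctx0, ab, scale));
    }

    return res;
}
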
@@ -116,29 +139,6 @@ struct llama_context : public llama_graph_i {
//
virtual int encode(llama_batch & inp_batch) = 0;
- // graph build API (generic)
- // apply control vector for layer il
- virtual ggml_tensor * build_cvec(
- ggml_context * ctx0,
- ggml_tensor * cur,
- int il);
- // do mat_mul, while optionally apply lora
- virtual ggml_tensor * build_lora_mm(
- ggml_context * ctx0,
- ggml_tensor * w,
- ggml_tensor * cur);
- // do mat_mul_id, while optionally apply lora
- virtual ggml_tensor * build_lora_mm_id(
- ggml_context * ctx0,
- ggml_tensor * w, // struct ggml_tensor * as
- ggml_tensor * cur, // struct ggml_tensor * b
- ggml_tensor * ids);
- virtual ggml_tensor * build_rope_factors(int il);
// state save/load
virtual size_t state_get_size() = 0;
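
This hunk removes the now-duplicated declarations from the pure-virtual section, leaving the state save/load interface (state_get_size and friends) in place. That interface backs the public state API in llama.h; a minimal usage sketch, assuming the usual llama_state_get_size / llama_state_get_data / llama_state_set_data entry points:

#include "llama.h"

#include <cstdint>
#include <vector>

// Snapshot the full context state (KV cache, logits, etc.) into a buffer.
static std::vector<uint8_t> snapshot_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_state_get_size(ctx));
    llama_state_get_data(ctx, buf.data(), buf.size());
    return buf;
}

// Restore a previously captured snapshot into the same or a compatible context.
static void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
    llama_state_set_data(ctx, buf.data(), buf.size());
}
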
@@ -217,16 +217,16 @@ protected:
mutable int32_t n_eval = 0; // number of eval calls
};
// TODO: make implementation details private
- class llama_context_unified : public llama_context {
+ // transformer with a self-attention KV cache
+ class llama_context_kv_self : public llama_context {
public:
struct batch_manager;
- llama_context_unified(
+ llama_context_kv_self(
const llama_model & model,
const llama_context_params & params);
- virtual ~llama_context_unified();
+ virtual ~llama_context_kv_self();
virtual uint32_t n_seq_max() const override;
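
The final hunk is the rename itself: the concrete implementation becomes llama_context_kv_self, the context for transformer models backed by a self-attention KV cache, with llama_context kept as the abstract base. As an illustration of where the split could lead, a hypothetical factory (not part of this commit; all names except llama_context_kv_self are assumptions) could later pick a different subclass per architecture:

// Hypothetical factory: dispatch on the model when more context types exist
// (e.g. a recurrent-state context); today the KV-cache context is the only one.
static llama_context * llama_context_create(
        const llama_model          & model,
        const llama_context_params & params) {
    return new llama_context_kv_self(model, params);
}
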