context : add cache-less llama_context

ggml-ci
Georgi Gerganov
2025-02-20 15:18:45 +02:00
parent 072280ea6b
commit b1554be1d7
8 changed files with 1073 additions and 355 deletions


@@ -99,34 +99,29 @@ public:
             int32_t   n_tokens,
             float     kq_scale,
             int       il,
-            bool      worst_case) = 0;
-
-    virtual ggml_tensor * build_attn_soft_max(
-            ggml_context * ctx0,
-            ggml_tensor  * kq,
-            float          kq_scale) = 0;
+            bool      worst_case);
 
     virtual void build_kv_self_shift(
             ggml_context * ctx0,
-            ggml_cgraph  * gf) = 0;
+            ggml_cgraph  * gf);
 
     // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
     virtual void build_kv_self_defrag(
             ggml_context * ctx0,
-            ggml_cgraph  * gf) = 0;
+            ggml_cgraph  * gf);
 
-    virtual ggml_tensor * build_inp_k_shift(
-            ggml_context * ctx0) = 0;
+    virtual ggml_tensor * build_inp_self_k_shift(
+            ggml_context * ctx0);
 
     virtual ggml_tensor * build_inp_embd_enc(
             ggml_context * ctx0,
             int32_t   n_tokens,
-            bool      worst_case) = 0;
+            bool      worst_case);
 
-    virtual ggml_tensor * build_inp_KQ_mask_cross(
+    virtual ggml_tensor * build_inp_kq_mask_cross(
             ggml_context * ctx0,
             int32_t   n_tokens,
-            bool      worst_case) = 0;
+            bool      worst_case);
 
     virtual ggml_tensor * build_inp_s_copy(
             ggml_context * ctx0,
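
The recurring change in this hunk is the dropped `= 0`: the KV-cache-specific graph-builder hooks go from pure virtual to plain virtual, so the new cache-less llama_context can implement the interface without overriding hooks it never uses. A minimal sketch of that pattern, with illustrative class and method names rather than the actual llama.cpp declarations:

    #include <cstdio>
    #include <cstdlib>

    // Illustrative base class: the mandatory hook stays pure virtual,
    // while the KV-cache hook gets a default body that traps if it is
    // ever reached by a context that does not support it.
    struct graph_builder {
        virtual ~graph_builder() = default;

        virtual void build_core() = 0; // every context must provide this

        virtual void build_kv_self_shift() {
            fprintf(stderr, "%s: not supported by this context\n", __func__);
            abort();
        }
    };

    // A cache-less context overrides only the mandatory hook; the
    // KV-cache default is inherited and simply never invoked for it.
    struct cacheless_builder : graph_builder {
        void build_core() override {
            // build the attention graph without reading or writing a KV cache
        }
    };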
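
The comment on build_kv_self_defrag describes the compaction strategy: scan for holes from the front of the cache and fill them with cells moved from the back. A hedged sketch of that planning step, assuming a simple occupancy vector; the kv_move type and plan_defrag helper are hypothetical, not llama.cpp internals:

    #include <cstdint>
    #include <vector>

    struct kv_move { uint32_t src, dst; };

    // Walk a two-pointer scan over the occupancy map: lo advances past
    // used cells to the first hole, hi retreats past free cells to the
    // last occupied cell, and each iteration plans one back-to-front move.
    std::vector<kv_move> plan_defrag(const std::vector<bool> & used) {
        std::vector<kv_move> moves;
        uint32_t lo = 0;
        uint32_t hi = (uint32_t) used.size();
        while (true) {
            while (lo < hi &&  used[lo])     { lo++; } // first hole from the front
            while (hi > lo && !used[hi - 1]) { hi--; } // last occupied cell from the back
            if (hi <= lo + 1) {
                break; // cache is compact up to lo
            }
            moves.push_back({hi - 1, lo}); // fill the hole with the tail cell
            lo++;
            hi--;
        }
        return moves;
    }

Each planned move only touches cells the scan has not yet visited, so the plan can be computed once against the original occupancy map and then applied in order.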