llama: consistent ctx <-> buf order for KV cache (#16746)

2025-11-12 10:47:01 +00:00 · 2025-10-28 11:23:54 +01:00
parent 280d97be96
commit 7a0e900e36
5 changed files with 41 additions and 33 deletions
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -217,8 +217,8 @@ private:
    // this is the SWA type of the cache - not to be confused with the model SWA type
    const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

-    std::vector<ggml_context_ptr>        ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    // ggml contexts for the KV cache along with the allocated backend buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;

    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method