llama: consistent ctx <-> buf order for KV cache (#16746)

This commit is contained in:
Johannes Gäßler
2025-10-28 11:23:54 +01:00
committed by GitHub
parent 280d97be96
commit 7a0e900e36
5 changed files with 41 additions and 33 deletions

View File

@@ -217,8 +217,8 @@ private:
// this is the SWA type of the cache - not to be confused with the model SWA type
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
// ggml contexts for the KV cache along with the allocated backend buffers:
std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
// the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
// note: this is not part of the KV state and it's only used to speed up the find_slot() method