kv-cache : better estimate of n_kv for multi-sequence batches (#15610)

ggml-ci
Author: Georgi Gerganov
Date: 2025-08-27 13:55:12 +03:00
Committed by: GitHub
Parent: 1e7489745a
Commit: 1bded5a3b3
2 changed files with 15 additions and 16 deletions

@@ -771,8 +771,8 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
GGML_ASSERT(ubatch.seq_id [s*n_tokens][0] == seq_id);
}
-res.s0 = std::min<llama_seq_id>(res.s0, seq_to_stream[seq_id]);
-res.s1 = std::max<llama_seq_id>(res.s1, seq_to_stream[seq_id]);
+res.s0 = std::min<uint32_t>(res.s0, seq_to_stream[seq_id]);
+res.s1 = std::max<uint32_t>(res.s1, seq_to_stream[seq_id]);
res.strm[s] = seq_to_stream[seq_id];
res.idxs[s].reserve(n_tokens);
@@ -964,11 +964,11 @@ bool llama_kv_cache::get_has_shift() const {
return result;
}
-uint32_t llama_kv_cache::get_n_kv() const {
+uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
uint32_t result = 0;
-for (uint32_t s = 0; s < n_stream; ++s) {
-const auto & cells = v_cells[s];
+for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+const auto & cells = v_cells[sinfo.strm[s]];
result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
}
@@ -1985,8 +1985,7 @@ bool llama_kv_cache_context::apply() {
}
kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
-n_kv = kv->get_n_kv();
+n_kv = kv->get_n_kv(sinfos[i_cur]);
return true;
}
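
In short, get_n_kv() used to take the padded upper bound over every KV stream in the cache, while get_n_kv(sinfo) only looks at the streams the current ubatch actually occupies, so a multi-sequence batch no longer inherits the n_kv of an unrelated long sequence. Below is a minimal standalone sketch of that difference; StreamCells, SlotInfo, pad_to, the function names and the sample numbers are simplified stand-ins, not the actual llama.cpp types.

// Standalone sketch of the change (not the actual llama.cpp code).
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// minimal stand-in for one KV stream's cell bookkeeping
struct StreamCells {
    uint32_t size;        // total number of cells in the stream
    uint32_t used_max_p1; // highest used cell index + 1
};

// minimal stand-in for slot_info: the streams touched by the current ubatch
struct SlotInfo {
    std::vector<uint32_t> strm;
    uint32_t n_stream() const { return (uint32_t) strm.size(); }
};

// round x up to a multiple of pad (same idea as GGML_PAD)
static uint32_t pad_to(uint32_t x, uint32_t pad) {
    return ((x + pad - 1) / pad) * pad;
}

// before: the bound is taken over every stream in the cache
static uint32_t n_kv_all_streams(const std::vector<StreamCells> & v_cells, uint32_t n_pad) {
    uint32_t result = 0;
    for (const auto & cells : v_cells) {
        result = std::max(std::min(cells.size, std::max(n_pad, pad_to(cells.used_max_p1, n_pad))), result);
    }
    return result;
}

// after: only the streams referenced by the slot contribute to the bound
static uint32_t n_kv_slot_streams(const std::vector<StreamCells> & v_cells, const SlotInfo & sinfo, uint32_t n_pad) {
    uint32_t result = 0;
    for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
        const auto & cells = v_cells[sinfo.strm[s]];
        result = std::max(std::min(cells.size, std::max(n_pad, pad_to(cells.used_max_p1, n_pad))), result);
    }
    return result;
}

int main() {
    const uint32_t n_pad = 32;

    // stream 0 holds a long sequence, streams 1 and 2 hold short ones
    std::vector<StreamCells> v_cells = { {4096, 3000}, {4096, 100}, {4096, 60} };

    // the current ubatch only touches streams 1 and 2
    SlotInfo sinfo;
    sinfo.strm = {1, 2};

    std::printf("n_kv over all streams : %u\n", n_kv_all_streams(v_cells, n_pad));         // 3008
    std::printf("n_kv over slot streams: %u\n", n_kv_slot_streams(v_cells, sinfo, n_pad)); // 128
    return 0;
}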

@@ -38,8 +38,8 @@ public:
using idx_vec_t = std::vector<uint32_t>;
// number of streams: ns = s1 - s0 + 1
-llama_seq_id s0;
-llama_seq_id s1;
+uint32_t s0;
+uint32_t s1;
std::vector<llama_seq_id> strm; // [ns]
std::vector<idx_vec_t> idxs; // [ns]
@@ -139,7 +139,7 @@ public:
// graph_build API
//
-uint32_t get_n_kv() const;
+uint32_t get_n_kv(const slot_info & sinfo) const;
// TODO: temporary
bool get_supports_set_rows() const;
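
Note on the header changes: s0 and s1 track the lowest and highest stream touched by the slot (they are assigned from seq_to_stream in find_slot, and the comment defines ns = s1 - s0 + 1), which is presumably why they switch from llama_seq_id to uint32_t, matching the std::min<uint32_t>/std::max<uint32_t> calls above. The get_n_kv(const slot_info & sinfo) declaration is the counterpart of the new definition, which llama_kv_cache_context::apply() now calls with sinfos[i_cur].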