From e776168267d1e5e9451f5325d56cd213fde37e76 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 28 Oct 2025 11:09:59 +0200
Subject: [PATCH] cont : restore padding for n_kv tensor shape

---
 src/llama-kv-cache.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 0b97d8a344..c017d15464 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -957,10 +957,14 @@ bool llama_kv_cache::get_has_shift() const {
 uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
     uint32_t result = 0;
 
+    // pad the n_kv value so that the graph remains constant across batches and can be reused
+    // note: this also helps some backends with performance (f.ex https://github.com/ggml-org/llama.cpp/pull/16812#issuecomment-3455112220)
+    const uint32_t n_pad_cur = std::max(n_pad, 256u);
+
     for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
         const auto & cells = v_cells[sinfo.strm[s]];
 
-        result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
+        result = std::max(std::min(cells.size(), std::max(n_pad_cur, GGML_PAD(cells.used_max_p1(), n_pad_cur))), result);
    }
 
     return result;
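
Note (not part of the patch): a minimal standalone sketch of the padded n_kv computation, assuming GGML_PAD(x, n) rounds x up to the next multiple of n; the cache size, padding requirement, and used-cell count below are hypothetical example values, not taken from llama.cpp.

```cpp
// Sketch of the padded n_kv computation from the hunk above.
// Assumption: GGML_PAD(x, n) rounds x up to the next multiple of n.
#include <algorithm>
#include <cstdint>
#include <cstdio>

// stand-in for GGML_PAD under the assumption stated above
static uint32_t pad_up(uint32_t x, uint32_t n) {
    return (x + n - 1) / n * n;
}

int main() {
    const uint32_t n_pad       = 32;   // hypothetical backend padding requirement
    const uint32_t cells_size  = 4096; // hypothetical total number of KV cache cells
    const uint32_t used_max_p1 = 777;  // hypothetical highest used cell index + 1

    // mirror of the patched logic: enforce a minimum padding of 256
    const uint32_t n_pad_cur = std::max(n_pad, 256u);

    // round the used range up to a multiple of n_pad_cur, then clamp to the cache size
    const uint32_t n_kv = std::min(cells_size, std::max(n_pad_cur, pad_up(used_max_p1, n_pad_cur)));

    // with these example values: 777 -> 1024, so n_kv = 1024
    printf("n_kv = %u\n", n_kv);
    return 0;
}
```

Because n_kv only changes when the used range crosses a 256-cell boundary, the resulting tensor shapes (and thus the compute graph) stay identical across most consecutive batches, which is what allows the graph to be reused.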