mirror of https://github.com/ggml-org/llama.cpp.git
cont : restore padding for n_kv tensor shape
@@ -957,10 +957,14 @@ bool llama_kv_cache::get_has_shift() const {
 uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
     uint32_t result = 0;
 
+    // pad the n_kv value so that the graph remains constant across batches and can be reused
+    // note: this also helps some backends with performance (f.ex https://github.com/ggml-org/llama.cpp/pull/16812#issuecomment-3455112220)
+    const uint32_t n_pad_cur = std::max(n_pad, 256u);
+
     for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
         const auto & cells = v_cells[sinfo.strm[s]];
 
-        result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
+        result = std::max(std::min(cells.size(), std::max(n_pad_cur, GGML_PAD(cells.used_max_p1(), n_pad_cur))), result);
     }
 
     return result;
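For context, the change rounds the effective KV window up to a multiple of at least 256 cells, so that small fluctuations in cache usage between batches no longer change the tensor shapes of the compute graph. The following standalone sketch is not llama.cpp source: pad_up, n_kv_for_stream, and the sample numbers are illustrative stand-ins for GGML_PAD and the per-stream loop in the diff above, showing the effect of the new n_pad_cur floor.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Round x up to the next multiple of n (what GGML_PAD is used for above).
static uint32_t pad_up(uint32_t x, uint32_t n) {
    return (x + n - 1) / n * n;
}

// Illustrative stand-in for the body of the per-stream loop in get_n_kv():
// pad the highest used cell index + 1, then clamp to the total cache size.
static uint32_t n_kv_for_stream(uint32_t used_max_p1, uint32_t cache_size, uint32_t n_pad) {
    const uint32_t n_pad_cur = std::max(n_pad, 256u); // the new 256-cell floor
    return std::min(cache_size, std::max(n_pad_cur, pad_up(used_max_p1, n_pad_cur)));
}

int main() {
    // With n_pad = 32 and no floor, usages of 100/200/250 would pad to
    // 128/224/256 -- three different graph shapes. With the floor, all
    // three yield n_kv = 256, so one graph can be reused across batches.
    for (uint32_t used : {100u, 200u, 250u, 300u}) {
        std::printf("used_max_p1 = %3u -> n_kv = %u\n", used, n_kv_for_stream(used, 4096, 32));
    }
    return 0;
}

The trade-off is that small batches now attend over a somewhat larger (masked) KV window, in exchange for a graph shape that stays stable until usage crosses the next 256-cell boundary.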