kv-cache : remove LLAMA_SET_ROWS checks (#15505)

ggml-ci
Author:    Georgi Gerganov
Date:      2025-08-28 12:27:02 +03:00
Committed: GitHub
Parent:    64387f6e95
Commit:    8a4280ce43

7 changed files with 23 additions and 141 deletions
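
For context, the deleted code used the common getenv-based override pattern: an optional environment variable flips a boolean default when the context is constructed, the same pattern the surviving LLAMA_GRAPH_REUSE_DISABLE block still uses. A minimal standalone sketch of that pattern (illustrative only, not part of this commit or of llama.cpp):

    #include <cstdio>
    #include <cstdlib>

    int main() {
        // Stand-in for the llama_context member; after this commit the
        // override is gone and ggml_set_rows() support is always assumed.
        bool supports_set_rows = true;

        // Hypothetical standalone equivalent of the removed LLAMA_SET_ROWS check:
        // a non-zero value enables the flag, zero disables it.
        if (const char * env = std::getenv("LLAMA_SET_ROWS")) {
            supports_set_rows = std::atoi(env) != 0;
        }

        std::printf("supports_set_rows = %d\n", supports_set_rows);
        return 0;
    }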


@@ -102,16 +102,6 @@ llama_context::llama_context(
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
 
-    {
-        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
-
-        if (!supports_set_rows && !cparams.kv_unified) {
-            LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
-            cparams.kv_unified = true;
-        }
-    }
-
     {
         const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
         graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
@@ -890,12 +880,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }
 
-    if (!supports_set_rows) {
-        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
-        // overlap with device computation.
-        ggml_backend_sched_reset(sched.get());
-    }
-
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;
@@ -1226,12 +1210,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();
 
-    if (!supports_set_rows) {
-        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
-        // overlap with device computation.
-        ggml_backend_sched_reset(sched.get());
-    }
-
     return 0;
 }