memory : remove KV cache size padding (#16812)

* memory : remove KV cache size padding

* cont : restore padding for n_kv tensor shape

* server : use slot context size instead of training context size

* server : simplify context limit logic
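For context on the first bullet: before this change the KV cache size was rounded up to a fixed multiple of the requested context length. A minimal sketch of that arithmetic using ggml's GGML_PAD macro (the example value and the padding multiple of 256 are assumptions for illustration, not taken from this commit):

#include <cstdint>
#include <cstdio>

// GGML's padding macro: round x up to the next multiple of n (n must be a power of two).
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    const uint32_t n_ctx   = 8292; // requested context size (arbitrary example value)
    const uint32_t padding = 256;  // assumed padding multiple used by the old sizing code

    // Before: the KV cache was sized to the padded context length.
    printf("padded   kv_size = %u\n", GGML_PAD(n_ctx, padding)); // 8448

    // After this commit: the KV cache is sized to the requested context length directly.
    printf("unpadded kv_size = %u\n", n_ctx);                    // 8292
    return 0;
}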
Georgi Gerganov
2025-10-28 20:19:44 +02:00
committed by GitHub
parent a8ca18b4b8
commit 85a7d8677b
6 changed files with 14 additions and 54 deletions


@@ -500,9 +500,8 @@ struct llama_model {
     ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
 
-    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
     ggml_cgraph * build_graph(const llm_graph_params & params) const;
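With the padding gone, create_memory no longer needs to write an adjusted context size back into cparams, which is why the parameter can become const and the "can mutate `cparams`" note is dropped (this hunk appears to be from llama-model.h). The server bullets follow the same idea: limits are checked against the slot's own context size rather than the model's training context. A minimal sketch of that kind of check (the struct and field names here are hypothetical, for illustration only, not the actual llama.cpp server code):

// Illustrative only: hypothetical names, not the real server implementation.
struct server_slot_sketch {
    int n_ctx;           // context size available to this slot
    int n_prompt_tokens; // number of tokens in the incoming prompt
};

// Simplified limit check: compare against the slot's context size
// instead of the model's training context size (n_ctx_train).
static bool prompt_fits(const server_slot_sketch & slot) {
    return slot.n_prompt_tokens <= slot.n_ctx;
}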