memory : remove KV cache size padding (#16812)

* memory : remove KV cache size padding

* cont : restore padding for n_kv tensor shape

* server : use slot context size instead of training context size

* server : simplify context limit logic
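For context on the first bullet: before this change the KV cache size was rounded up to a fixed multiple of the requested context length. A minimal sketch of that arithmetic using ggml's GGML_PAD macro (the example value and the padding multiple of 256 are assumptions for illustration, not taken from this commit):

#include <cstdint>
#include <cstdio>

// GGML's padding macro: round x up to the next multiple of n (n must be a power of two).
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    const uint32_t n_ctx   = 8292; // requested context size (arbitrary example value)
    const uint32_t padding = 256;  // assumed padding multiple used by the old sizing code

    // Before: the KV cache was sized to the padded context length.
    printf("padded   kv_size = %u\n", GGML_PAD(n_ctx, padding)); // 8448

    // After this commit: the KV cache is sized to the requested context length directly.
    printf("unpadded kv_size = %u\n", n_ctx);                    // 8292
    return 0;
}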
Georgi Gerganov
2025-10-28 20:19:44 +02:00
committed by GitHub
parent a8ca18b4b8
commit 85a7d8677b
6 changed files with 14 additions and 54 deletions


@@ -500,9 +500,8 @@ struct llama_model {
     ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
 
-    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
     ggml_cgraph * build_graph(const llm_graph_params & params) const;
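With the padding gone, create_memory no longer needs to write an adjusted context size back into cparams, which is why the parameter can become const and the "can mutate `cparams`" note is dropped (this hunk appears to be from llama-model.h). The server bullets follow the same idea: limits are checked against the slot's own context size rather than the model's training context. A minimal sketch of that kind of check (the struct and field names here are hypothetical, for illustration only, not the actual llama.cpp server code):

// Illustrative only: hypothetical names, not the real server implementation.
struct server_slot_sketch {
    int n_ctx;           // context size available to this slot
    int n_prompt_tokens; // number of tokens in the incoming prompt
};

// Simplified limit check: compare against the slot's context size
// instead of the model's training context size (n_ctx_train).
static bool prompt_fits(const server_slot_sketch & slot) {
    return slot.n_prompt_tokens <= slot.n_ctx;
}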