memory : remove KV cache size padding (#16812)
* memory : remove KV cache size padding
* cont : restore padding for n_kv tensor shape
* server : use slot context size instead of training context size
* server : simplify context limit logic
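The first two points describe a size/shape split: the KV cache buffer is now allocated at the exact requested size, while only the n_kv dimension used for tensor shapes keeps the round-up. Below is a minimal standalone C++ sketch of that idea; the pad_to helper (modeled on ggml's GGML_PAD macro) and the padding value of 256 are illustrative assumptions, not the actual llama.cpp code:

#include <cstdint>
#include <cstdio>

// Round x up to the next multiple of n (the idea behind ggml's GGML_PAD).
static uint32_t pad_to(uint32_t x, uint32_t n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    const uint32_t n_ctx   = 4000; // requested context size
    const uint32_t padding = 256;  // illustrative alignment value

    const uint32_t kv_size_old = pad_to(n_ctx, padding); // before: cache padded to 4096 cells
    const uint32_t kv_size_new = n_ctx;                  // after: cache is exactly 4000 cells

    // "cont : restore padding for n_kv tensor shape": only the tensor
    // dimension keeps the round-up, not the cache allocation itself.
    const uint32_t n_kv = pad_to(kv_size_new, padding);  // 4096

    printf("kv_size old=%u new=%u, n_kv=%u\n", kv_size_old, kv_size_new, n_kv);
    return 0;
}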
@@ -500,9 +500,8 @@ struct llama_model {
 
     ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
 
-    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
     ggml_cgraph * build_graph(const llm_graph_params & params) const;
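The const-qualification in the hunk above pairs with dropping the "// note: can mutate `cparams`" comment: with the size padding gone, create_memory presumably no longer needs to adjust the caller's context parameters in place. A self-contained sketch of what the stricter signature buys; cparams_t and both function names are illustrative stand-ins, not the real API:

#include <cstdint>

struct cparams_t { uint32_t n_ctx; }; // stand-in for llama_cparams

// Old-style signature: the callee could silently pad the caller's n_ctx.
void create_memory_old(cparams_t & cparams) {
    cparams.n_ctx = ((cparams.n_ctx + 255) / 256) * 256; // mutate in place
}

// New-style signature: const ref, so the same mutation is a compile error
// and callers can rely on n_ctx staying exactly what they requested.
void create_memory_new(const cparams_t & cparams) {
    // cparams.n_ctx = 0; // error: assignment of member in read-only object
    (void) cparams;
}

int main() {
    cparams_t cp { 4000 };
    create_memory_old(cp); // cp.n_ctx becomes 4096
    create_memory_new(cp); // cp.n_ctx stays 4096
    return 0;
}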