kv-cache : pad the cache size to 256 for performance (#17046)
* kv-cache : pad the size of the small SWA cache for performance
* context : pad the total context to 256
* cont : future-proof the swa pad
* server : adjust test params to new logic
@@ -114,10 +114,14 @@ llama_context::llama_context(
         }
     }
 
+    // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+
     if (cparams.kv_unified) {
         cparams.n_ctx_seq = cparams.n_ctx;
     } else {
         cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+        cparams.n_ctx_seq = GGML_PAD(cparams.n_ctx_seq, 256);
 
         if (cparams.n_ctx_seq == 0) {
             throw std::runtime_error("n_ctx_seq == 0");
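For reference, the rounding the diff relies on can be shown in isolation. The sketch below is a minimal, self-contained approximation: the GGML_PAD macro here mirrors the round-up-to-a-multiple semantics of ggml's macro, and the standalone pad_ctx helper plus the sample values are illustrative only, not part of the commit.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <stdexcept>

// round x up to the next multiple of n (same round-up semantics as ggml's GGML_PAD)
#define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

// illustrative helper: apply the same padding logic the commit adds in llama_context
static void pad_ctx(uint32_t & n_ctx, uint32_t & n_ctx_seq, uint32_t n_seq_max, bool kv_unified) {
    // pad the total context to a multiple of 256
    n_ctx = GGML_PAD(n_ctx, 256);

    if (kv_unified) {
        n_ctx_seq = n_ctx;
    } else {
        // split the context across sequences, then pad each per-sequence slice to 256 as well
        n_ctx_seq = n_ctx / n_seq_max;
        n_ctx_seq = GGML_PAD(n_ctx_seq, 256);

        if (n_ctx_seq == 0) {
            throw std::runtime_error("n_ctx_seq == 0");
        }
    }
}

int main() {
    uint32_t n_ctx = 1000, n_ctx_seq = 0;

    pad_ctx(n_ctx, n_ctx_seq, /*n_seq_max=*/4, /*kv_unified=*/false);

    // 1000 rounds up to 1024; 1024 / 4 = 256 is already a multiple of 256
    printf("n_ctx = %u, n_ctx_seq = %u\n", n_ctx, n_ctx_seq);
    assert(n_ctx == 1024 && n_ctx_seq == 256);

    return 0;
}

With these values, a requested context of 1000 tokens over 4 sequences ends up as a padded total of 1024 and a per-sequence context of 256, which keeps both sizes aligned to the 256-token granularity the commit targets for performance.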