server : add SWA checkpoints (#15293)

* server : add SWA checkpoints

ggml-ci

* cont : server clean-up

* server : handle state restore fails

* llama : add extended llama_state_seq_ API

* server : do not make checkpoints if --swa-full

ggml-ci

* llama : remove flags value for NONE

* server : configure number of SWA checkpoints with CLI arg

ggml-ci

* args : fix scope of new argument
This commit is contained in:
Georgi Gerganov
2025-08-14 14:59:50 +03:00
committed by GitHub
parent 3973163bff
commit d32e03f449
15 changed files with 206 additions and 54 deletions

View File

@@ -1507,6 +1507,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.swa_full = true;
}
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--swa-checkpoints"}, "N",
string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
[](common_params & params, int value) {
params.n_swa_checkpoints = value;
}
).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--kv-unified", "-kvu"},
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"