server : context checkpointing for hybrid and recurrent models (#16382)
* initial commit for branch 3
* generalize `swa_checkpoint` to `ctx_checkpoint`

  this extends `llama-server`'s SWA checkpointing logic to include hybrid/recurrent models such as Jamba, Granite

* oops
* disable debug prints
* keep backwards compat with `--swa-checkpoints`

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* update prompt re-processing message
* fix off-by-one error per GG
* keep `seq_rm` log per GG

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* server : fix checkpoint logic to support recurrent caches
* server : cleanup and fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
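In short, the server's checkpointing is no longer tied to SWA alone: a checkpoint is worth taking whenever the model carries state that cannot be recreated by trimming the regular KV cache, i.e. an SWA window, a recurrent cache, or the recurrent part of a hybrid model. A minimal sketch of that condition, using the model predicates shown in the header diff below (the helper name and the `has_swa` parameter are hypothetical and only illustrate the idea, not the actual llama-server code):

// Hypothetical helper illustrating the generalized checkpoint condition;
// this is an illustration, not the actual llama-server implementation.
#include <stdbool.h>

#include "llama.h"

// A context checkpoint is useful whenever part of the model's state cannot be
// rebuilt by simply truncating the regular KV cache: an SWA window, a pure
// recurrent cache (Mamba, RWKV), or the recurrent half of a hybrid model
// (Jamba, Granite).
static bool ctx_checkpoints_useful(const struct llama_model * model, bool has_swa) {
    return has_swa                         ||
           llama_model_is_recurrent(model) ||
           llama_model_is_hybrid(model);
}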
@@ -543,6 +543,9 @@ extern "C" {
    // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
    LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

    // Returns true if the model is hybrid (like Jamba, Granite, etc.)
    LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model);

    // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
    LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
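These predicates also let downstream code report what kind of per-sequence state a model keeps. A minimal sketch, assuming a placeholder model path ("model.gguf") and the current `llama_model_load_from_file` / `llama_model_default_params` loading API:

// Minimal sketch: classify a model's architecture family after loading it.
// "model.gguf" is a placeholder path, not part of the commit.
#include <stdio.h>

#include "llama.h"

int main(void) {
    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    if (llama_model_is_hybrid(model)) {
        printf("hybrid model (attention + recurrent layers, e.g. Jamba, Granite)\n");
    } else if (llama_model_is_recurrent(model)) {
        printf("recurrent model (e.g. Mamba, RWKV)\n");
    } else if (llama_model_is_diffusion(model)) {
        printf("diffusion-based model (e.g. LLaDA, Dream)\n");
    } else {
        printf("standard attention model\n");
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}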
@@ -791,8 +794,12 @@ extern "C" {
            size_t   n_token_capacity,
            size_t * n_token_count_out);

    // for backwards-compat
    #define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1

    // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
    #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1

    typedef uint32_t llama_state_seq_flags;

    LLAMA_API size_t llama_state_seq_get_size_ext(
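The `llama_state_seq_get_size_ext` declaration above is cut off in this excerpt. As a rough sketch of how the new flag could be used, the snippet below checkpoints only the partial state of one sequence. It assumes the `llama_state_seq_*_ext` functions take `(ctx, [buf, size,] seq_id, flags)` and that `llama_state_seq_get_data_ext` / `llama_state_seq_set_data_ext` exist alongside `llama_state_seq_get_size_ext`, as in the upstream header; verify against the actual llama.h.

// Hypothetical sketch: serialize only the "partial" state (SWA KV cache or
// recurrent cache) of one sequence, so it can later be restored cheaply
// instead of re-processing the prompt. Signatures are assumed to match the
// upstream llama.h; this is not code from the commit.
#include <stdint.h>
#include <stdlib.h>

#include "llama.h"

static uint8_t * checkpoint_partial_state(struct llama_context * ctx, llama_seq_id seq_id, size_t * out_size) {
    const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY;

    // query how much memory the partial state of this sequence needs
    const size_t size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
    if (size == 0) {
        return NULL;
    }

    uint8_t * buf = malloc(size);
    if (buf == NULL) {
        return NULL;
    }

    // serialize the partial state (SWA window or recurrent cache) into buf
    const size_t written = llama_state_seq_get_data_ext(ctx, buf, size, seq_id, flags);
    if (written == 0) {
        free(buf);
        return NULL;
    }

    *out_size = written;
    // later restored with llama_state_seq_set_data_ext(ctx, buf, written, seq_id, flags)
    return buf;
}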