llama : fix defrag bugs + add parameter (#5735)

* llama : fix defrag bugs + enable by default
  ggml-ci
* llama : add defrag_thold parameter
  ggml-ci
* llama : cont
* llama : disable log message
  ggml-ci
* llama : fix graph size check during defrag
2025-11-01 09:01:57 +00:00 · 2024-02-27 14:35:51 +02:00
parent cbbd1efa06
commit 9d533a77d0
5 changed files with 82 additions and 30 deletions
--- a/llama.h
+++ b/llama.h
@@ -245,6 +245,7 @@ extern "C" {
        float    yarn_beta_fast;   // YaRN low correction dim
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size
+        float    defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)

        ggml_backend_sched_eval_callback cb_eval;
        void * cb_eval_user_data;