llama : reuse compute graphs

ggml-ci
Georgi Gerganov
2025-07-01 15:59:43 +03:00
parent bac8bed248
commit 76681e3c73
17 changed files with 458 additions and 187 deletions

include/llama.h

@@ -374,6 +374,8 @@ extern "C" {
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                           // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                           //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+
+        bool graph_reuse; // reuse previous compute graphs when possible
     };

     // model quantization parameters
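The new flag is opt-in via `llama_context_params`. A minimal sketch of how a caller might enable it follows; the model path is a placeholder and the surrounding setup is ordinary llama.h usage, not part of this commit:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.graph_reuse = true; // opt in: reuse the previous compute graph when possible

    llama_context * ctx = llama_init_from_model(model, cparams);

    // ... tokenize, llama_decode(), sampling, etc. ...

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```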
@@ -1429,6 +1431,7 @@ extern "C" {
         int32_t n_p_eval;
         int32_t n_eval;
+        int32_t n_reused; // number of times a compute graph has been reused
     };

     struct llama_perf_sampler_data {
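The `n_reused` counter is exposed through the existing perf API. A small sketch of reading it via `llama_perf_context()`, which returns this struct; the helper name is illustrative:

```cpp
#include <cstdio>
#include "llama.h"

// Report how often the context reused a previous compute graph.
// `ctx` is assumed to be a llama_context that has already decoded some batches.
static void print_graph_reuse_stats(const llama_context * ctx) {
    const llama_perf_context_data data = llama_perf_context(ctx);
    printf("n_p_eval = %d, n_eval = %d, n_reused = %d\n",
           data.n_p_eval, data.n_eval, data.n_reused);
}
```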