llama : reuse compute graphs

ggml-ci
Georgi Gerganov
2025-07-01 15:59:43 +03:00
parent bac8bed248
commit 76681e3c73
17 changed files with 458 additions and 187 deletions

include/llama.h

@@ -374,6 +374,8 @@ extern "C" {
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                           // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                           //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+
+        bool graph_reuse; // reuse previous compute graphs when possible
     };

     // model quantization parameters
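The new flag is opt-in via `llama_context_params`. A minimal sketch of how a caller might enable it follows; the model path is a placeholder and the surrounding setup is ordinary llama.h usage, not part of this commit:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.graph_reuse = true; // opt in: reuse the previous compute graph when possible

    llama_context * ctx = llama_init_from_model(model, cparams);

    // ... tokenize, llama_decode(), sampling, etc. ...

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```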
@@ -1429,6 +1431,7 @@ extern "C" {
         int32_t n_p_eval;
         int32_t n_eval;
+        int32_t n_reused; // number of times a compute graph has been reused
     };

     struct llama_perf_sampler_data {
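The `n_reused` counter is exposed through the existing perf API. A small sketch of reading it via `llama_perf_context()`, which returns this struct; the helper name is illustrative:

```cpp
#include <cstdio>
#include "llama.h"

// Report how often the context reused a previous compute graph.
// `ctx` is assumed to be a llama_context that has already decoded some batches.
static void print_graph_reuse_stats(const llama_context * ctx) {
    const llama_perf_context_data data = llama_perf_context(ctx);
    printf("n_p_eval = %d, n_eval = %d, n_reused = %d\n",
           data.n_p_eval, data.n_eval, data.n_reused);
}
```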