llama : separate compute buffer reserve from fattn check (#15696)

Exposes ggml_backend_sched_split_graph() so that a graph can be split without allocating compute buffers, and uses it to split the graph for the automatic Flash Attention check.
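A minimal sketch of how the newly exposed call might be driven, assuming it is exported from ggml-backend.h with the same (scheduler, graph) signature as the formerly internal function; ggml_backend_sched_reset() is existing public API.

// Sketch only: split a measure graph across backends without allocating
// compute buffers. Assumes ggml_backend_sched_split_graph() is exported
// with a (sched, graph) signature matching the previously internal function.
#include "ggml-backend.h"

static void split_without_alloc(ggml_backend_sched_t sched, ggml_cgraph * gf) {
    ggml_backend_sched_reset(sched);           // clear previous node assignments
    ggml_backend_sched_split_graph(sched, gf); // compute the splits only, no buffer allocation
}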
Diego Devesa
2025-08-31 06:49:03 -07:00
committed by GitHub
parent 7d3c9f2b21
commit 9777032dcc
4 changed files with 64 additions and 58 deletions


@@ -196,7 +196,7 @@ public:
     ggml_status graph_compute(ggml_cgraph * gf, bool batched);
 
     // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
+    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
 
 private:
     llm_graph_params graph_params(
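For illustration only, a sketch of how the new split_only parameter could be used by the automatic Flash Attention check: reserve the graph in split-only mode so the scheduler assigns nodes to backends without allocating compute buffers, inspect the result, then do the real reserve once. Apart from graph_reserve() and its parameter list, every name below is a hypothetical assumption, not the actual implementation.

// Hypothetical sketch, not the actual llama.cpp code. graph_reserve() and its
// split_only parameter come from the diff above; the member names, the
// graph_supports_fattn() helper, and the surrounding class body are assumptions.
void llama_context::check_flash_attn_auto() {
    // Split-only pass: node-to-backend assignment happens, compute buffers do not.
    ggml_cgraph * gf = graph_reserve(n_tokens, n_seqs, n_outputs, mctx, /*split_only=*/true);

    // Decide whether the attention nodes landed on backends that support fattn.
    cparams.flash_attn = gf != nullptr && graph_supports_fattn(gf);

    // A single full reserve afterwards allocates the compute buffers for real.
    graph_reserve(n_tokens, n_seqs, n_outputs, mctx);
}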