From c0389dba43d50695f9d3f57dd1f1a14cbefc100c Mon Sep 17 00:00:00 2001 From: hipudding Date: Thu, 11 Sep 2025 15:59:37 +0800 Subject: [PATCH] CANN: Disable acl_graph for prefill stage (#15933) Since the prefill length is not fixed, graphs constructed for the prefill stage cannot be reused. For this reason, ACL graph execution is disabled by default during prefill. --- docs/backend/CANN.md | 4 ++++ ggml/src/ggml-cann/ggml-cann.cpp | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 35b189bb95..e45fc7dd28 100755 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -318,3 +318,7 @@ Operators are executed using ACL graph execution, rather than in op-by-op (eager ### GGML_CANN_GRAPH_CACHE_CAPACITY Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. When the number of cached graphs exceeds this capacity, the least recently used graph will be evicted. + +### GGML_CANN_PREFILL_USE_GRAPH + +Enable ACL graph execution during the prefill stage. The default is false. This option only takes effect when flash attention (FA) is enabled. diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index d148174f1e..19a18a281d 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2360,6 +2360,21 @@ static enum ggml_status ggml_backend_cann_graph_compute( bool use_cann_graph = true; bool cann_graph_update_required = false; + static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or("")); + if (!prefill_use_graph) { + // Do not use acl_graph for prefill. + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + // TODO: Optimize here. Currently, we can only + // get seq_len by FA's input. + if (node->op == GGML_OP_FLASH_ATTN_EXT) { + // Q -> src[0], shape: [B, S, N, D] + use_cann_graph = (node->src[0]->ne[1] == 1); + break; + } + } + } + if (!cann_ctx->acl_graph_mode) { use_cann_graph = false; }