CANN: implement LRU cache for ACL graphs (#15814)

* CANN: implement LRU cache for ACL graphs in CANN backend

- Introduce ggml_cann_graph_lru_cache to store multiple ggml_cann_graph objects.
- Graphs are loaded on demand and evicted using LRU policy when capacity is exceeded.
- Updated push, move_to_front, and clear methods to manage cached graphs efficiently.
- Ensures reuse of graphs, reducing graph reconstruction overhead in CANN backend.

* fix typo

* The LRU cache capacity can be configured via an env variable

Signed-off-by: noemotiovon <757486878@qq.com>

* refactory acl graph

* refactory && fix review comments

Signed-off-by: noemotiovon <757486878@qq.com>

---------

Signed-off-by: noemotiovon <757486878@qq.com>
This commit is contained in:
Chenguang Li
2025-09-10 15:29:12 +08:00
committed by GitHub
parent 86587da03b
commit 28b5f190ef
3 changed files with 164 additions and 51 deletions

View File

@@ -314,3 +314,7 @@ Converting the matmul weight format from ND to NZ to improve performance. Enable
### GGML_CANN_ACL_GRAPH ### GGML_CANN_ACL_GRAPH
Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default. Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
### GGML_CANN_GRAPH_CACHE_CAPACITY
Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. When the number of cached graphs exceeds this capacity, the least recently used graph will be evicted.

View File

@@ -38,6 +38,7 @@
#include <unistd.h> #include <unistd.h>
#include <functional> #include <functional>
#include <optional> #include <optional>
#include <list>
#include "../include/ggml-cann.h" #include "../include/ggml-cann.h"
#include "../include/ggml.h" #include "../include/ggml.h"
@@ -106,6 +107,7 @@ int32_t ggml_cann_get_device();
std::optional<std::string> get_env(const std::string& name); std::optional<std::string> get_env(const std::string& name);
bool parse_bool(const std::string& value); bool parse_bool(const std::string& value);
int parse_integer(const std::string& value);
/** /**
* @brief Abstract base class for memory pools used by CANN. * @brief Abstract base class for memory pools used by CANN.
@@ -350,7 +352,7 @@ struct ggml_graph_node_properties {
struct ggml_cann_graph { struct ggml_cann_graph {
~ggml_cann_graph() { ~ggml_cann_graph() {
if (graph != nullptr) { if (graph != nullptr) {
aclmdlRIDestroy(graph); ACL_CHECK(aclmdlRIDestroy(graph));
} }
} }
@@ -358,6 +360,64 @@ struct ggml_cann_graph {
std::vector<ggml_graph_node_properties> ggml_graph_properties; std::vector<ggml_graph_node_properties> ggml_graph_properties;
}; };
/**
* @brief LRU cache for managing ggml_cann_graph objects.
*
* This class maintains a list of shared_ptr to ggml_cann_graph objects
* and enforces a maximum capacity. It provides methods to push new graphs,
* move existing graphs to the front (most recently used), and clear the cache.
*/
struct ggml_cann_graph_lru_cache {
size_t capacity; /**< Maximum number of graphs in the cache. */
std::list<ggml_cann_graph*> cache_list; /**< List storing cached graphs as raw pointers. */
ggml_cann_graph_lru_cache() {
capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12"));
}
/**
* @brief Push a new graph to the front of the cache.
* If the cache exceeds capacity, the least recently used graph is deleted.
* @param new_node Pointer to the new ggml_cann_graph to cache.
* Ownership is transferred to the cache (cache will delete it).
*/
void push(ggml_cann_graph* new_node) {
if (cache_list.size() >= capacity) {
ggml_cann_graph* old = cache_list.back();
cache_list.pop_back();
delete old; // free the old graph
}
cache_list.push_front(new_node);
}
/**
* @brief Move an existing graph to the front of the cache.
* @param node Pointer to the ggml_cann_graph to move.
*/
void move_to_front(ggml_cann_graph* node) {
cache_list.remove(node);
cache_list.push_front(node);
}
/**
* @brief Clear all graphs from the cache (also frees memory).
*/
void clear() {
for (auto ptr : cache_list) {
delete ptr;
}
cache_list.clear();
}
/**
* @brief Destructor that clears the cache and frees all cached graphs.
*/
~ggml_cann_graph_lru_cache() {
clear();
}
};
#endif // USE_ACL_GRAPH #endif // USE_ACL_GRAPH
struct ggml_cann_rope_cache { struct ggml_cann_rope_cache {
@@ -394,7 +454,7 @@ struct ggml_backend_cann_context {
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
#ifdef USE_ACL_GRAPH #ifdef USE_ACL_GRAPH
/// Cached CANN ACL graph used for executing the current ggml computation graph. /// Cached CANN ACL graph used for executing the current ggml computation graph.
std::unique_ptr<ggml_cann_graph> cann_graph; ggml_cann_graph_lru_cache graph_lru_cache;
bool acl_graph_mode = true; bool acl_graph_mode = true;
#endif #endif
cann_task_queue task_queue; cann_task_queue task_queue;

View File

@@ -116,6 +116,24 @@ bool parse_bool(const std::string& value) {
return valid_values.find(value) != valid_values.end(); return valid_values.find(value) != valid_values.end();
} }
/**
 * @brief Parse a string as an integer, returning 0 if invalid.
 *
 * This function attempts to convert the input string `value` to an `int`.
 * Leading and trailing whitespace is tolerated. If the string is empty, is
 * not a valid integer, contains any other trailing characters (e.g. "12abc"
 * or "12.5"), or is out of the `int` range, it returns 0.
 *
 * @param value The string to parse.
 * @return The parsed integer, or 0 if conversion fails.
 */
int parse_integer(const std::string& value) {
    try {
        std::size_t consumed = 0;
        const int parsed = std::stoi(value, &consumed);
        // std::stoi stops at the first character it cannot use and still
        // succeeds, so reject anything but whitespace after the number.
        if (value.find_first_not_of(" \t\n\r\f\v", consumed) != std::string::npos) {
            return 0;
        }
        return parsed;
    } catch (...) {
        // Deliberate best-effort: std::stoi throws on non-numeric or
        // out-of-range input, both of which map to the documented 0.
        return 0;
    }
}
/** /**
* @brief Initialize the CANN device information. * @brief Initialize the CANN device information.
* *
@@ -2131,30 +2149,52 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
#ifdef USE_ACL_GRAPH #ifdef USE_ACL_GRAPH
/** /**
* @brief Populate the internal CANN graph node properties from the ggml computation graph. * @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph.
* *
* This function copies all node attributes (operation type, dimensions, strides, input sources, * This function creates a new ggml_cann_graph object and fills its node properties
* and operation parameters) into the cached CANN graph structure for later reuse or comparison. * (operation type, dimensions, strides, input sources, and operation parameters)
* based on the current ggml computation graph.
* *
* @param cann_ctx The CANN backend context. * Each node in the ggml graph is mapped to a property entry in the new CANN graph:
* @param cgraph The ggml computational graph. * - node address
* - operation type
* - shape (ne) and strides (nb)
* - source tensor addresses
* - operation parameters
*
* After initialization, the new graph is pushed into the LRU cache owned by the
* CANN backend context. The cache takes ownership of the graph and manages its
* lifetime (including deletion upon eviction).
*
* @param cann_ctx The CANN backend context containing the graph cache.
* @param cgraph The current ggml computation graph.
*/ */
static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { static void add_lru_matched_graph_node_properties(
for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) { ggml_backend_cann_context * cann_ctx,
ggml_tensor * node = cgraph->nodes[node_idx]; ggml_cgraph * cgraph) {
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data; // Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op; ggml_cann_graph * new_graph = new ggml_cann_graph();
new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
for (int dim = 0; dim < GGML_MAX_DIMS; dim++) { for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim]; ggml_tensor * node = cgraph->nodes[node_idx];
cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim]; auto & prop = new_graph->ggml_graph_properties[node_idx];
prop.node_address = node->data;
prop.node_op = node->op;
std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
for (int src = 0; src < GGML_MAX_SRC; ++src) {
prop.src_address[src] = node->src[src] ? node->src[src]->data : nullptr;
} }
for (int src = 0; src < GGML_MAX_SRC; src++) {
cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] = memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
node->src[src] ? node->src[src]->data : nullptr;
}
memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS);
} }
// Insert into the LRU cache (cache takes ownership and will delete it when evicted).
cann_ctx->graph_lru_cache.push(new_graph);
} }
/** /**
@@ -2199,30 +2239,45 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
} }
/** /**
* @brief Determine if the CANN graph needs to be rebuilt due to graph changes. * @brief Check whether there is a cached CANN graph that matches the current ggml graph.
* *
* This checks whether the number or properties of ggml graph nodes have changed * This function iterates through the cached CANN graphs stored in the LRU cache and
* compared to the last captured CANN graph. If so, the CANN graph must be re-captured. * compares them against the given ggml computation graph. A match requires that the
* number of nodes is the same and that each node's properties (operation type,
* dimensions, strides, inputs, and operation parameters) are identical.
* *
* @param cann_ctx The CANN backend context. * If a matching graph is found, it is promoted to the front of the LRU cache and the
* function returns true. Otherwise, the function returns false, indicating that a new
* CANN graph needs to be captured.
*
* @param cann_ctx The CANN backend context containing the graph cache.
* @param cgraph The current ggml computation graph. * @param cgraph The current ggml computation graph.
* @return true if an update is required; false otherwise. * @return true if a matching cached graph exists; false otherwise.
*/ */
static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
// The number of nodes is different, so the graph needs to be reconstructed. ggml_cann_graph_lru_cache &lru_cache = cann_ctx->graph_lru_cache;
if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) { for (auto &graph_ptr : lru_cache.cache_list) {
cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes); // Skip graphs with a different number of nodes.
return true; if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
} continue;
}
// The number of nodes is the same; iterate over each node to check whether they match. // Check if all nodes match.
for (int i = 0; i < cgraph->n_nodes; i++) { bool all_match = true;
bool has_matching_properties = ggml_graph_node_has_matching_properties( for (int i = 0; i < cgraph->n_nodes; ++i) {
cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]); if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) {
if(!has_matching_properties) { all_match = false;
break;
}
}
if (all_match) {
// Match found: promote it to the front of the LRU list and report success.
lru_cache.move_to_front(graph_ptr);
return true; return true;
} }
} }
return false; return false;
} }
#endif // USE_ACL_GRAPH #endif // USE_ACL_GRAPH
@@ -2241,17 +2296,13 @@ static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx,
* @param cann_graph_update_required Whether graph capture is needed due to graph changes. * @param cann_graph_update_required Whether graph capture is needed due to graph changes.
*/ */
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph, static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
bool & use_cann_graph, bool & cann_graph_update_required) { bool & use_cann_graph, bool & cann_graph_update_required) {
#ifdef USE_ACL_GRAPH #ifdef USE_ACL_GRAPH
ggml_cann_graph* matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
if (use_cann_graph && cann_graph_update_required) { if (use_cann_graph && cann_graph_update_required) {
if (cann_ctx->cann_graph->graph != nullptr) {
ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));
cann_ctx->cann_graph->graph = nullptr;
}
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL)); ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
} }
#endif // USE_ACL_GRAPH #endif // USE_ACL_GRAPH
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph. // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
// With the use of CANN graphs, the execution will be performed by the graph launch. // With the use of CANN graphs, the execution will be performed by the graph launch.
if (!use_cann_graph || cann_graph_update_required) { if (!use_cann_graph || cann_graph_update_required) {
@@ -2272,12 +2323,12 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
#ifdef USE_ACL_GRAPH #ifdef USE_ACL_GRAPH
if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph)); ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
} }
if (use_cann_graph) { if (use_cann_graph) {
// Execute graph // Execute graph
ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream())); ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
} }
#endif // USE_ACL_GRAPH #endif // USE_ACL_GRAPH
} }
@@ -2311,19 +2362,17 @@ static enum ggml_status ggml_backend_cann_graph_compute(
} }
if (use_cann_graph) { if (use_cann_graph) {
if (cann_ctx->cann_graph == nullptr) { // If no matching graph is found, the graph needs to be recaptured.
cann_ctx->cann_graph.reset(new ggml_cann_graph()); cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph);
cann_graph_update_required = true; if (cann_graph_update_required) {
// If no matching graph is found, add a new ACL graph.
add_lru_matched_graph_node_properties(cann_ctx, cgraph);
} }
cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph);
set_ggml_graph_node_properties(cann_ctx, cgraph);
} }
#else #else
bool use_cann_graph = false; bool use_cann_graph = false;
bool cann_graph_update_required = false; bool cann_graph_update_required = false;
#endif // USE_ACL_GRAPH #endif // USE_ACL_GRAPH
evaluate_and_capture_cann_graph( evaluate_and_capture_cann_graph(
cann_ctx, cann_ctx,
cgraph, cgraph,