mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-12 10:47:01 +00:00
ggml: add ggml_can_fuse_subgraph (#16662)
* ggml: add ggml_can_fuse_subgraph * ggml-cuda: use ggml_can_fuse_subgraph for topk-moe * format * 1. remove inputs from signature as they are transient nodes 2. add check for views: view_src should be part of the subgraph * - combine check into one loop - check all view_src parents - other minor review comments * remove redundant if test * - rename and other minor review comments * add assert about count < 32
This commit is contained in:
@@ -2821,15 +2821,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
std::initializer_list<enum ggml_op> topk_moe_ops = ggml_cuda_topk_moe_ops(false);
|
||||
std::initializer_list<enum ggml_op> topk_moe_ops_with_norm = ggml_cuda_topk_moe_ops(true);
|
||||
|
||||
if (ops.size() == topk_moe_ops_with_norm.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops_with_norm.begin())) {
|
||||
|
||||
if (node_idx + topk_moe_ops_with_norm.size() > (size_t)cgraph->n_nodes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < topk_moe_ops_with_norm.size(); i++) {
|
||||
if (cgraph->nodes[node_idx + i]->op != topk_moe_ops_with_norm.begin()[i]) return false;
|
||||
}
|
||||
if (ops.size() == topk_moe_ops_with_norm.size() &&
|
||||
ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx+8];
|
||||
|
||||
@@ -2838,16 +2831,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
}
|
||||
}
|
||||
|
||||
if (ops.size() == topk_moe_ops.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops.begin())) {
|
||||
|
||||
if (node_idx + topk_moe_ops.size() > (size_t)cgraph->n_nodes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < topk_moe_ops.size(); i++) {
|
||||
if (cgraph->nodes[node_idx + i]->op != topk_moe_ops.begin()[i]) return false;
|
||||
}
|
||||
|
||||
if (ops.size() == topk_moe_ops.size() &&
|
||||
ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx+4];
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
|
||||
|
||||
Reference in New Issue
Block a user