mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-28 08:31:25 +00:00
ggml-cuda: use passed ops instead of hardcoded ops (#16712)
This commit is contained in:
@@ -2826,7 +2826,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
 ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);

 if (ops.size() == topk_moe_ops_with_norm.size() &&
-ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
+ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
 ggml_tensor * softmax = cgraph->nodes[node_idx];
 ggml_tensor * weights = cgraph->nodes[node_idx+8];

@@ -2836,7 +2836,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
 }

 if (ops.size() == topk_moe_ops.size() &&
-ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
+ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
 ggml_tensor * softmax = cgraph->nodes[node_idx];
 ggml_tensor * weights = cgraph->nodes[node_idx+4];
 if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
@@ -2845,7 +2845,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
 }

 if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
-ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_delayed_softmax, { node_idx + 2, node_idx + 5 })) {
+ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
 ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
 ggml_tensor * weights = cgraph->nodes[node_idx + 5];

|
|||||||
Reference in New Issue
Block a user