Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-12 10:47:01 +00:00)
CUDA: Fix bug in topk-moe for gpt-oss (#16821)
* CUDA: Fix bug in topk-moe for gpt-oss

  When using ggml_can_fuse_subgraph, the output nodes that are passed are wrong. This causes `test-backend-ops` to still fuse the nodes (because in the test graph those nodes are not used anywhere else), but the fusion does not actually happen in the real gpt-oss graph.

* fix for qwen3 too

* change ifndef to ifdef
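The failure mode described above comes down to the list of declared output nodes handed to the fusion check: an intermediate node that is consumed outside the candidate subgraph but not declared as an output must block fusion, and an isolated test graph has no such external consumer, so a wrong list can still pass there. Below is a minimal, self-contained sketch of that idea; the `Node` type and the `can_fuse_subgraph` helper are hypothetical stand-ins for illustration, not the real ggml API.

```cpp
#include <cstdio>
#include <vector>

struct Node {
    std::vector<int> srcs; // indices of nodes this node reads from
};

// A candidate subgraph [start, start + len) may be fused only if every
// intermediate node is consumed exclusively inside the subgraph; nodes that
// are read from outside must be declared in `outputs`.
static bool can_fuse_subgraph(const std::vector<Node> & graph, int start, int len,
                              const std::vector<int> & outputs) {
    for (int i = start; i < start + len; ++i) {
        bool is_output = false;
        for (int o : outputs) {
            if (o == i) { is_output = true; break; }
        }
        if (is_output) {
            continue; // declared outputs are allowed to have external consumers
        }
        // any use of node i outside the candidate range blocks the fusion
        for (size_t j = 0; j < graph.size(); ++j) {
            if ((int) j >= start && (int) j < start + len) {
                continue;
            }
            for (int s : graph[j].srcs) {
                if (s == i) {
                    return false;
                }
            }
        }
    }
    return true;
}

int main() {
    // nodes 0..5 form the candidate subgraph; node 6 (outside) reads node 5
    std::vector<Node> graph(7);
    for (int i = 1; i <= 5; ++i) graph[i].srcs = { i - 1 };
    graph[6].srcs = { 5 };

    // Correct output list: node 5 is declared, so its external use is allowed.
    printf("outputs {5}: %s\n", can_fuse_subgraph(graph, 0, 6, { 5 }) ? "fuse" : "no fuse");

    // Wrong output list: node 5 is undeclared, so its external consumer (node 6)
    // blocks the fusion in the real graph. In an isolated test graph with no
    // node 6, the same wrong list would still pass, which is how the bug hid
    // from test-backend-ops.
    printf("outputs {4}: %s\n", can_fuse_subgraph(graph, 0, 6, { 4 }) ? "fuse" : "no fuse");
    return 0;
}
```

This mirrors the fix in the hunks below: the indices passed as the output set are corrected so they point at the tensors that the rest of the model graph actually reads.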
```diff
@@ -2978,7 +2978,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
 
     if (ops.size() == topk_moe_ops_with_norm.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx + 9];
 
```
```diff
@@ -2997,7 +2997,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     }
 
     if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
         ggml_tensor * weights = cgraph->nodes[node_idx + 5];
 
```
```diff
@@ -3118,9 +3118,20 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     // With the use of CUDA graphs, the execution will be performed by the graph launch.
     if (!use_cuda_graph || cuda_graph_update_required) {
 
+        [[maybe_unused]] int prev_i = 0;
+
         for (int i = 0; i < cgraph->n_nodes; i++) {
             ggml_tensor * node = cgraph->nodes[i];
 
+
+#ifdef GGML_CUDA_DEBUG
+            const int nodes_fused = i - prev_i - 1;
+            prev_i = i;
+            if (nodes_fused > 0) {
+                GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
+            }
+#endif
+
             if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
                 continue;
             }
```
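The `GGML_CUDA_DEBUG` block added in the last hunk logs how many graph nodes were folded away between two nodes that the evaluation loop actually reaches, since a fused launch advances `i` past the absorbed nodes. The snippet below is a minimal standalone model of that counting logic only; the `evaluated` list is a made-up stand-in for the node indices the real loop visits, not backend code.

```cpp
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical example: in a 10-node graph, only these node indices reach
    // the top of the evaluation loop; everything in between was absorbed into
    // the preceding fused kernel launch.
    const std::vector<int> evaluated = { 0, 1, 5, 6, 9 };

    int prev_i = 0;
    for (int i : evaluated) {
        const int nodes_fused = i - prev_i - 1; // nodes skipped since the previous visit
        prev_i = i;
        if (nodes_fused > 0) {
            printf("nodes_fused: %d\n", nodes_fused); // prints 3 (nodes 2..4) and 2 (nodes 7..8)
        }
    }
    return 0;
}
```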