CUDA: skip fusion for repeating adds in bias (#17080)

This commit is contained in:
Aman Gupta
2025-11-08 16:58:05 +08:00
committed by GitHub
parent b8a5cfd11a
commit c1b187688d
3 changed files with 18 additions and 6 deletions

View File

@@ -3152,8 +3152,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
#ifdef GGML_CUDA_DEBUG
const int nodes_fused = i - prev_i - 1;
prev_i = i;
@@ -3302,6 +3300,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}
// we don't support repeating adds
if (bias_op == GGML_OP_ADD &&
(!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
!ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
continue;
}
const ggml_tensor * src0 = up_n->src[0];
const ggml_tensor * src1 = up_n->src[1];
const ggml_tensor * ids = up_n->src[2];
@@ -3411,6 +3416,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}
if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
continue;
}
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.x_bias = bias_tensor;