diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 3024775135..67af1d8ccc 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -124,6 +124,7 @@ if (CUDAToolkit_FOUND)

     if (GGML_CUDA_DEBUG)
         list(APPEND CUDA_FLAGS -lineinfo)
+        add_compile_definitions(GGML_CUDA_DEBUG)
     endif()

     if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 049aece1b5..2d4314fba4 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3152,8 +3152,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
-
-#ifdef GGML_CUDA_DEBUG
         const int nodes_fused = i - prev_i - 1;
         prev_i = i;

@@ -3302,6 +3300,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                continue;
            }

+            // we don't support repeating adds
+            if (bias_op == GGML_OP_ADD &&
+                (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
+                 !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
+                continue;
+            }
+
            const ggml_tensor * src0 = up_n->src[0];
            const ggml_tensor * src1 = up_n->src[1];
            const ggml_tensor * ids  = up_n->src[2];
@@ -3411,6 +3416,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                continue;
            }

+            if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
+                continue;
+            }
+
            ggml_cuda_mm_fusion_args_host fusion_data{};
            fusion_data.x_bias = bias_tensor;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 80216f6f3a..31625bcc7a 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4984,8 +4984,10 @@ struct test_mul_mat_vec_fusion : public test_case {

     ggml_tensor * build_graph(ggml_context * ctx) override {
         if (!use_id) {
-            std::array<int64_t, 4> ne  = {k, m, 1, 1};
-            std::array<int64_t, 4> ne0 = {k, n, 1, 1};
+            const int channels = 4;
+            const int samples  = 2;
+            std::array<int64_t, 4> ne  = { k, m, channels, samples };
+            std::array<int64_t, 4> ne0 = { k, n, channels, samples };
             ggml_tensor * cur  = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
             ggml_tensor * gate = with_gate ? ggml_new_tensor(ctx, type, 4, ne0.data()) : nullptr;

@@ -4993,14 +4995,14 @@ struct test_mul_mat_vec_fusion : public test_case {

             ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur);
             if (with_bias) {
-                std::array<int64_t, 4> bias_ne = {ffn_up->ne[0], 1, 1, 1};
+                std::array<int64_t, 4> bias_ne = { ffn_up->ne[0], 1, channels, samples };
                 ggml_tensor * up_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
                 ffn_up = ggml_add(ctx, ffn_up, up_bias);
             }

             ggml_tensor * ffn_gate = with_gate ? ggml_mul_mat(ctx, gate, cur) : nullptr;
             if (with_bias && with_gate) {
-                std::array<int64_t, 4> bias_ne = {ffn_gate->ne[0], 1, 1, 1};
+                std::array<int64_t, 4> bias_ne = { ffn_gate->ne[0], 1, channels, samples };
                 ggml_tensor * gate_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
                 ffn_gate = ggml_add(ctx, ffn_gate, gate_bias);
             }
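
Note on the new guards: ggml_add also accepts a src1 that is broadcast ("repeated") across src0, but the fused mul_mat bias path indexes the bias with the destination's coordinates, so the patch skips fusion whenever the add's operands differ in shape and lets the separate ggml_add kernel handle the repeat. A minimal sketch of the distinction the guard draws, not part of the patch, assuming only the public ggml.h API (tensor sizes are illustrative):

    // repeating_add_sketch.cpp - shows which bias adds the guard rejects.
    #include "ggml.h"
    #include <stdio.h>

    int main() {
        ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true };
        ggml_context * ctx = ggml_init(params);

        // stand-in for a mul_mat result: [n, m, channels, samples]
        ggml_tensor * ffn_up = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 4, 2);

        // elementwise bias: same shape as ffn_up -> fusion still allowed
        ggml_tensor * bias_full  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 4, 2);
        // repeating bias: ne[1] == 1 is broadcast over the 16 rows -> fusion now skipped
        ggml_tensor * bias_bcast = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256,  1, 4, 2);

        printf("elementwise: %d\n", ggml_are_same_shape(ffn_up, bias_full));  // 1 -> fuse
        printf("repeating:   %d\n", ggml_are_same_shape(ffn_up, bias_bcast)); // 0 -> plain ggml_add

        ggml_free(ctx);
        return 0;
    }

The updated test mirrors this: with m == 1 (the mul_mat_vec case) a bias of shape { ne0, 1, channels, samples } matches the mul_mat output exactly and stays on the fused path, while any mismatched shape now falls through to the unfused add.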