CANN: Refactor to reduce duplicate code (#12731)

* CANN: Refactor to reduce duplicate code
* CANN: fix review comment
[File diff suppressed because it is too large]

@@ -31,20 +31,25 @@
  * IN THE SOFTWARE.
  */
 
-#include <aclnnop/aclnn_add.h>
+#include <aclnnop/aclnn_abs.h>
+#include <aclnnop/aclnn_neg.h>
+#include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_arange.h>
 #include <aclnnop/aclnn_argsort.h>
 #include <aclnnop/aclnn_cat.h>
 #include <aclnnop/aclnn_clamp.h>
-#include <aclnnop/aclnn_div.h>
 #include <aclnnop/aclnn_gelu.h>
+#include <aclnnop/aclnn_gelu_v2.h>
+#include <aclnnop/aclnn_sigmoid.h>
 #include <aclnnop/aclnn_hardsigmoid.h>
 #include <aclnnop/aclnn_hardswish.h>
 #include <aclnnop/aclnn_leaky_relu.h>
-#include <aclnnop/aclnn_mul.h>
 #include <aclnnop/aclnn_relu.h>
 #include <aclnnop/aclnn_silu.h>
 #include <aclnnop/aclnn_tanh.h>
+#include <aclnnop/aclnn_sqrt.h>
+#include <aclnnop/aclnn_sin.h>
+#include <aclnnop/aclnn_cos.h>
 #include "acl_tensor.h"
 #include "common.h"
 
@@ -63,23 +68,6 @@
  */
 void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
-/**
- * @brief   Adds two ggml tensors using the CANN backend.
- *
- * @details This function performs an element-wise addition of two tensors. In
- *          case the tensors do not have the same shape, one or both tensors
- *          will be broadcasted to match the shape of the other before the
- *          addition is performed.The formula for the operation is given by:
- *          \f[
- *              \text{dst} = \text{acl_src0} + \alpha \cdot \text{acl_src1}
- *          \f]
- *
- * @param ctx The CANN context used for operations.
- * @param dst The ggml tensor representing the destination, result of the
- *            addition is stored at dst->data, and dst->op is `GGML_OP_ADD`
- */
-void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
 /**
  * @brief   Applies the Leaky ReLU activation function to a tensor using the CANN
  *          backend.
@@ -131,19 +119,6 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  */
 void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
-/**
- * @brief   Computes the square of the elements of a ggml tensor using the CANN
- *          backend.
- * @details The function sets the second source tensor of the destination
- *          tensor `dst` to be equal to the first source tensor. This is
- *          effectively squaring the elements since the multiplication becomes
- *          `element * element`.
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the squared values will be stored,
- *            which dst->op is `GGML_OP_SQR`.
- */
-void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
 /**
  * @brief   Applies a clamp operation to the elements of a ggml tensor using the
  *          CANN backend.
@@ -275,6 +250,20 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  */
 void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+/**
+ * @brief   Computes the sum of elements in a ggml tensor.
+ *
+ * @details This function performs a reduction sum operation along the last
+ *          dimension of the input tensor `src`. The result of the sum is stored
+ *          in the destination tensor `dst`.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the reduced values will be stored.
+ *
+ */
+
+void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
  * @brief   Upsamples a ggml tensor using nearest neighbor interpolation using
  *          the CANN backend.
@@ -500,128 +489,247 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
-/**
- * @brief   Computes the cosine of each element in a ggml tensor using the CANN backend.
- *
- * @details This function applies the cosine function element-wise to the input tensor.
- *          The computed cosine values are stored in the destination tensor `dst`.
- *          The operation is optimized using the CANN backend for improved performance.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the cosine values will be stored.
- *            dst->op is `GGML_OP_COS`.
- */
-void ggml_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-/**
- * @brief   Computes the sine of each element in a ggml tensor using the CANN backend.
- *
- * @details This function applies the sine function element-wise to the input tensor.
- *          The computed sine values are stored in the destination tensor `dst`.
- *          The operation is optimized using the CANN backend for improved performance.
- *
- * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the sine values will be stored.
- *            dst->op is `GGML_OP_SIN`.
- */
-void ggml_cann_sin(ggml_backend_cann_context& ctx, ggml_tensor* dst);
-
-template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
-                                       aclTensor*, uint64_t*, aclOpExecutor**),
-          aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
-void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+/**
+ * @brief Adds two tensors element-wise and stores the result in a destination
+ * tensor.
+ *
+ * This function performs the operation:
+ * \f[
+ *    dst = acl\_src0 + \alpha \times acl\_src1
+ * \f]
+ * where alpha is a scalar value and defaults to 1.0f.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src0 The first source tensor.
+ * @param acl_src1 The second source tensor.
+ * @param acl_dst The destination tensor where the result will be stored.
+ */
+void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
+    aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
+
+/**
+ * @brief Subtracts two tensors element-wise and stores the result in a
+ * destination tensor.
+ *
+ * This function performs the operation:
+ * \f[
+ *    dst = acl\_src0 - \alpha \times acl\_src1
+ * \f]
+ * where alpha is a scalar value and defaults to 1.0f.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src0 The first source tensor.
+ * @param acl_src1 The second source tensor.
+ * @param acl_dst The destination tensor where the result will be stored.
+ */
+void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
+    aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
+
+/**
+ * @brief Performs element-wise multiplication of two tensors and stores the
+ * result in a destination tensor.
+ *
+ * This function performs element-wise multiplication of the tensors `acl_src`
+ * and `acl_other` and stores the result in the destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl\_dst}_i = \text{acl\_src}_i \times \text{acl\_other}_i
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The first tensor for element-wise multiplication.
+ * @param acl_other The second tensor for element-wise multiplication.
+ * @param acl_dst The destination tensor where the result will be stored.
+ */
+void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+    aclTensor* acl_other, aclTensor* acl_dst = nullptr);
+
+/**
+ * @brief Element-wise division, optionally in-place.
+ *
+ * This function divides each element of the source tensor `acl_src` by the
+ * corresponding element of `acl_other` and stores the result in the
+ * destination tensor `acl_dst`. If `acl_dst` is null, the result is written
+ * in place to `acl_src`. The operation is defined as:
+ * \f[
+ *     \text{dst}_i = \frac{\text{acl\_src}_i}{\text{acl\_other}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src Numerator tensor.
+ * @param acl_other Denominator tensor.
+ * @param acl_dst The destination tensor where the result will be stored, or
+ * null to operate in place on `acl_src`.
+ */
+void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+    aclTensor* acl_other, aclTensor* acl_dst = nullptr);
+
+/**
+ * @brief Applies the cosine function element-wise to the elements of a tensor.
+ *
+ * This function computes the cosine of each element in the source tensor
+ * `acl_src` and stores the result in the destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl\_dst}_i = \cos(\text{acl\_src}_i)
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor on which the cosine function will be
+ * applied.
+ * @param acl_dst The destination tensor where the cosine results will be
+ * stored.
+ */
+void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+    aclTensor* acl_dst);
+
+/**
+ * @brief Applies the sine function element-wise to the elements of a tensor.
+ *
+ * This function computes the sine of each element in the source tensor
+ * `acl_src` and stores the result in the destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl\_dst}_i = \sin(\text{acl\_src}_i)
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor on which the sine function will be applied.
+ * @param acl_dst The destination tensor where the sine results will be stored.
+ */
+void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+    aclTensor* acl_dst);
+
+/**
+ * @brief Launches an asynchronous task using the memory allocator.
+ *
+ * This macro submits an asynchronous task on the specified stream.
+ * The task uses memory allocated by the allocator. It is guaranteed
+ * that the memory will not be accessed by other tasks until this task
+ * completes, due to the sequential execution order within the same stream.
+ *
+ * @param OP_NAME aclnn operator name.
+ * @param args Additional arguments required by the task.
+ *
+ * @note
+ * Memory from the allocator will be "freed" immediately and can be
+ * reallocated to other pointers. However, it won't be accessed by any
+ * other task before this asynchronous task ends, because all tasks in the
+ * same stream are executed in queue order.
+ */
+#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...)                                                \
+    do {                                                                                     \
+        uint64_t        workspaceSize = 0;                                                   \
+        aclOpExecutor * executor;                                                            \
+        void *          workspaceAddr = nullptr;                                             \
+                                                                                             \
+        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
+                                                                                             \
+        if (workspaceSize > 0) {                                                             \
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);             \
+            workspaceAddr = workspace_allocator.get();                                       \
+        }                                                                                    \
+        ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream()));     \
+    } while (0)
+
+/**
+ * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one output tensor.
+ *
+ * This function checks whether broadcasting is needed between `src0` and `src1`.
+ * If broadcasting is required, it calculates the proper shapes and creates
+ * ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
+ * based on the original tensor shapes.
+ *
+ * @param src0     The first input tensor (reference shape).
+ * @param src1     The second input tensor (possibly broadcasted).
+ * @param dst      The destination/output tensor.
+ * @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
+ * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
+ * @param acl_dst  Output pointer to the created ACL tensor corresponding to dst.
+ */
+void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
+                 aclTensor ** acl_src1, aclTensor ** acl_dst);
+
+/**
+ * @brief Applies an element-wise operation to two input tensors using the
+ * CANN backend.
+ *
+ * This templated function takes a binary operator and applies it to two source tensors
+ * associated with the destination tensor. The function handles broadcasting as needed.
+ *
+ * @tparam binary_op A callable object (e.g., lambda or function pointer) representing
+ *         the binary operation to be performed. It must take three arguments:
+ *         (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*).
+ *
+ * @param ctx The CANN backend context used to manage execution and resources.
+ * @param dst The destination tensor.
+ */
+template <auto binary_op>
+void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];
     ggml_tensor* src1 = dst->src[1];
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     aclTensor* acl_src0;
     aclTensor* acl_src1;
     aclTensor* acl_dst;
 
     // Need bcast
-    if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
-        BCAST_SHAPE(src0, src1)
-        acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
-        acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
-        acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
-    } else {
-        acl_src0 = ggml_cann_create_tensor(src0);
-        acl_src1 = ggml_cann_create_tensor(src1);
-        acl_dst = ggml_cann_create_tensor(dst);
-    }
-
-    uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
-    void* workspaceAddr = nullptr;
-
-    ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize,
-                               &executor));
-    if (workspaceSize > 0) {
-        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
-        workspaceAddr = workspace_allocator.get();
-    }
-
-    aclrtStream main_stream = ctx.stream();
-    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
+    bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
+    binary_op(ctx, acl_src0, acl_src1, acl_dst);
 
     ACL_CHECK(aclDestroyTensor(acl_src0));
     ACL_CHECK(aclDestroyTensor(acl_src1));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 
-// Activation functions template.
-template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
-                                       aclOpExecutor**),
-          aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
-                              const aclrtStream)>
-void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+/**
+ * @brief Applies a unary operation to an input tensor using the CANN backend.
+ *
+ * This templated function applies a unary operator to the source tensor of
+ * `dst` and stores the result in the destination tensor.
+ *
+ * @tparam unary_op A callable with the signature:
+ *         void(ggml_backend_cann_context&, aclTensor*, aclTensor*)
+ *         where the first aclTensor is the source and the second is the destination.
+ *
+ * @param ctx The CANN backend context for managing resources and execution.
+ * @param dst The destination tensor. Its src[0] is treated as the input tensor.
+ */
+template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
+void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
 
     aclTensor* acl_src = ggml_cann_create_tensor(src);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
-    uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
-    void* workspaceAddr = nullptr;
-
-    ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
-    if (workspaceSize > 0) {
-        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
-        workspaceAddr = workspace_allocator.get();
-    }
-
-    aclrtStream main_stream = ctx.stream();
-    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
+    unary_op(ctx, acl_src, acl_dst);
 
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 
-// Activation functions template for const aclTensors.
-template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
-                                       uint64_t*, aclOpExecutor**),
-          aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
-                              const aclrtStream)>
-void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
-
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-
-    uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
-    void* workspaceAddr = nullptr;
-
-    ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
-    if (workspaceSize > 0) {
-        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
-        workspaceAddr = workspace_allocator.get();
-    }
-
-    aclrtStream main_stream = ctx.stream();
-    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
-
-    ACL_CHECK(aclDestroyTensor(acl_src));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
-}
+/**
+ * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
+ *
+ * This macro defines an inline lambda wrapping a specific ACL operation name,
+ * and passes it to the templated ggml_cann_unary_op function. It simplifies
+ * calling unary ops by hiding the lambda boilerplate.
+ *
+ * Internally, the lambda will call:
+ * @code
+ * GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);
+ * @endcode
+ *
+ * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
+ *
+ * @see ggml_cann_unary_op
+ * @see GGML_CANN_CALL_ACLNN_OP
+ */
+#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                         \
+    do {                                                         \
+        auto lambda = [](auto ctx, auto acl_src, auto acl_dst) { \
+            GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);  \
+        };                                                       \
+        ggml_cann_unary_op<lambda>(ctx, dst);                    \
+    } while (0)
 
 #endif  // CANN_ACLNN_OPS
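The heart of the refactor is folding the two-phase aclnn calling convention (query the workspace size, then launch) into GGML_CANN_CALL_ACLNN_OP. Roughly, GGML_CANN_CALL_ACLNN_OP(Sin, acl_src, acl_dst) expands to the following, assuming the usual aclnnSinGetWorkspaceSize/aclnnSin pair from the CANN SDK (an expansion sketch, not text from the diff):

    do {
        uint64_t        workspaceSize = 0;
        aclOpExecutor * executor;
        void *          workspaceAddr = nullptr;

        // Phase 1: size the scratch workspace and build the executor.
        ACL_CHECK(aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));

        if (workspaceSize > 0) {
            // The allocation is returned to the pool at the end of this scope,
            // but stream ordering guarantees the kernel consumes it first.
            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
            workspaceAddr = workspace_allocator.get();
        }
        // Phase 2: launch the kernel asynchronously on the context's stream.
        ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
    } while (0);

This is exactly the boilerplate the old getWorkspaceSize/execute template parameters used to carry, which is why each operator wrapper now shrinks to a single line. The remaining hunks below are in the backend dispatcher.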
@@ -1300,47 +1300,59 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_dup(ctx, dst);
             break;
         case GGML_OP_ADD:
-            ggml_cann_add(ctx, dst);
+        case GGML_OP_ADD1:
+            ggml_cann_binary_op<aclnn_add>(ctx, dst);
+            break;
+        case GGML_OP_SUB:
+            ggml_cann_binary_op<aclnn_sub>(ctx, dst);
             break;
         case GGML_OP_ACC:
             ggml_cann_acc(ctx, dst);
             break;
         case GGML_OP_MUL:
-            ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
+            ggml_cann_binary_op<aclnn_mul>(ctx, dst);
             break;
         case GGML_OP_DIV:
-            ggml_cann_mul_div<aclnnDivGetWorkspaceSize, aclnnDiv>(ctx, dst);
+            ggml_cann_binary_op<aclnn_div>(ctx, dst);
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(dst)) {
+                case GGML_UNARY_OP_ABS:
+                    GGML_CANN_CALL_UNARY_OP(Abs);
+                    break;
+                case GGML_UNARY_OP_NEG:
+                    GGML_CANN_CALL_UNARY_OP(Neg);
+                    break;
                 case GGML_UNARY_OP_GELU:
-                    ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(
-                        ctx, dst);
+                    GGML_CANN_CALL_UNARY_OP(Gelu);
                     break;
                 case GGML_UNARY_OP_SILU:
-                    ggml_cann_activation<aclnnSiluGetWorkspaceSize, aclnnSilu>(
-                        ctx, dst);
+                    GGML_CANN_CALL_UNARY_OP(Silu);
                     break;
-                // TODO: Use faster gelu??
-                case GGML_UNARY_OP_GELU_QUICK:
-                    ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(
-                        ctx, dst);
+                case GGML_UNARY_OP_GELU_QUICK: {
+                        auto lambda = [](auto ctx, auto acl_src, auto acl_dst) {
+                            GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
+                        };
+                        ggml_cann_unary_op<lambda>(ctx, dst);
+                    }
                     break;
                 case GGML_UNARY_OP_TANH:
-                    ggml_cann_activation<aclnnTanhGetWorkspaceSize, aclnnTanh>(
-                        ctx, dst);
+                    GGML_CANN_CALL_UNARY_OP(Tanh);
                     break;
                 case GGML_UNARY_OP_RELU:
-                    ggml_cann_activation<aclnnReluGetWorkspaceSize, aclnnRelu>(
-                        ctx, dst);
+                    GGML_CANN_CALL_UNARY_OP(Relu);
+                    break;
+                case GGML_UNARY_OP_SIGMOID:
+                    GGML_CANN_CALL_UNARY_OP(Sigmoid);
                     break;
                 case GGML_UNARY_OP_HARDSIGMOID:
-                    ggml_cann_activation<aclnnHardsigmoidGetWorkspaceSize,
-                                         aclnnHardsigmoid>(ctx, dst);
+                    GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
                     break;
                 case GGML_UNARY_OP_HARDSWISH:
-                    ggml_cann_activation<aclnnHardswishGetWorkspaceSize,
-                                         aclnnHardswish>(ctx, dst);
+                    GGML_CANN_CALL_UNARY_OP(Hardswish);
+                    break;
+                case GGML_UNARY_OP_EXP:
+                    GGML_CANN_CALL_UNARY_OP(Exp);
                     break;
                 default:
                     return false;
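With both macros in place, each unary case collapses to one line. Expanding GGML_CANN_CALL_UNARY_OP(Abs) by hand shows the two layers cooperating: the capture-less lambda becomes the template argument of ggml_cann_unary_op, which owns ACL tensor creation and destruction (expansion sketch):

    do {
        auto lambda = [](auto ctx, auto acl_src, auto acl_dst) {
            // Token-pastes into aclnnAbsGetWorkspaceSize / aclnnAbs.
            GGML_CANN_CALL_ACLNN_OP(Abs, acl_src, acl_dst);
        };
        // Wraps dst->src[0] and dst as aclTensors, invokes the lambda on the
        // context's stream, then destroys both aclTensors.
        ggml_cann_unary_op<lambda>(ctx, dst);
    } while (0);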
@@ -1382,7 +1394,12 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_scale(ctx, dst);
             break;
         case GGML_OP_SQR:
-            ggml_cann_sqr(ctx, dst);
+            GGML_ASSERT(dst->src[1] == nullptr);
+            dst->src[1] = dst->src[0];
+            ggml_cann_binary_op<aclnn_mul>(ctx, dst);
+            break;
+        case GGML_OP_SQRT:
+            GGML_CANN_CALL_UNARY_OP(Sqrt);
             break;
         case GGML_OP_CLAMP:
             ggml_cann_clamp(ctx, dst);
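A note on the GGML_OP_SQR case above: rather than binding a dedicated square kernel, the single input is aliased as both operands of the existing multiply path, since

    \f[
        \text{dst}_i = \text{src0}_i \times \text{src0}_i
    \f]

The GGML_ASSERT makes the aliasing safe: GGML_OP_SQR carries only one source, so src[1] must be empty before it is borrowed as the second operand.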
@@ -1414,6 +1431,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_POOL_2D:
             ggml_cann_pool2d(ctx, dst);
             break;
+        case GGML_OP_SUM:
+            ggml_cann_sum(ctx, dst);
+            break;
         case GGML_OP_SUM_ROWS:
             ggml_cann_sum_rows(ctx, dst);
             break;
@@ -1424,11 +1444,11 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_argmax(ctx, dst);
             break;
         case GGML_OP_COS:
-            ggml_cann_cos(ctx, dst);
+            ggml_cann_unary_op<aclnn_cos>(ctx, dst);
             break;
         case GGML_OP_SIN:
-            ggml_cann_sin(ctx, dst);
+            ggml_cann_unary_op<aclnn_sin>(ctx, dst);
             break;
         default:
             return false;
     }
@@ -1679,13 +1699,17 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_ABS:
+                case GGML_UNARY_OP_NEG:
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_TANH:
+                case GGML_UNARY_OP_EXP:
                     return true;
                 default:
                     return false;
@@ -1784,6 +1808,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             // value of paddingW should be at most half of kernelW
             return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
         }
+        case GGML_OP_SUM:
         case GGML_OP_DUP:
         case GGML_OP_IM2COL:
         case GGML_OP_CONCAT:
@@ -1795,11 +1820,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
         case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
+        case GGML_OP_SQRT:
         case GGML_OP_CLAMP:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
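The supports_op additions are what let the graph scheduler route the new ops to the NPU at all; the same hook can be probed through the public backend API (sketch; backend and node are hypothetical handles set up elsewhere):

    // True if the CANN backend accepts this node, e.g. a GGML_OP_SUM created
    // with ggml_sum(); otherwise the scheduler falls back to another backend.
    bool on_npu = ggml_backend_supports_op(backend, node);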
Author: hipudding