ggml : add ggml_scale_bias (#14417)

* ggml : add ggml_scale_bias * ggml_vec_mad1_f32 * add more simd * add CUDA * sycl * vulkan * cann (placeholder) * opencl * will this fix cpu? * fix cuda * suggestions from coderabbit * fix cann compile error * vDSP_vsmsa * rm __ARM_FEATURE_SVE * use memcpy for op params * make code looks more consistent * use scalar for __ARM_FEATURE_SVE * add x param to ggml_vec_mad1_f32
2025-11-20 12:07:33 +00:00 · 2025-07-09 18:16:12 +02:00
parent 26a48ad699
commit 98bab638fb
14 changed files with 139 additions and 38 deletions
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -2256,7 +2256,9 @@ static bool ggml_metal_encode_node(
                GGML_ASSERT(ggml_is_contiguous(src0));

                float scale;
-                memcpy(&scale, dst->op_params, sizeof(scale));
+                float bias;
+                memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(float));
+                memcpy(&bias,  ((const int32_t *) dst->op_params) + 1, sizeof(float));

                int64_t n = ggml_nelements(dst);

@@ -2273,6 +2275,7 @@ static bool ggml_metal_encode_node(
                [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
                [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
+                [encoder setBytes:&bias  length:sizeof(bias)  atIndex:3];

                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -1014,16 +1014,18 @@ kernel void kernel_scale(
        device const float * src0,
        device       float * dst,
        constant     float & scale,
+        constant     float & bias,
        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * scale;
+    dst[tpig] = src0[tpig] * scale + bias;
 }

 kernel void kernel_scale_4(
        device const float4 * src0,
        device       float4 * dst,
        constant     float  & scale,
+        constant     float  & bias,
        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * scale;
+    dst[tpig] = src0[tpig] * scale + bias;
 }

 kernel void kernel_clamp(