feat: Use a secondary simd_sum instead of a for loop

Branch: GraniteFourPerf
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2025-11-05 09:36:52 +00:00 · 2025-07-22 09:13:45 -06:00
parent 3866f766fe
commit 641276a816
1 changed files with 10 additions and 7 deletions
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -1840,15 +1840,18 @@ kernel void kernel_ssm_scan_f32_group(
        // sum of the individual simd groups.
        threadgroup_barrier(mem_flags::mem_threadgroup);
-        // Sum the simd buckets => threadgroup sum
+        // For simd group 0 at indices < num simd groups, extract the shared
        // simd sum
        sumf = 0.0f;
-        for (int64_t i0 = 0; i0 < sgptg; ++i0) {
+        if (sgitg == 0) {
-            sumf += shared[i0];
+            if (tiisg < sgptg) {
                sumf = shared[tiisg];
            }
-
+            sumf = simd_sum(sumf);
-        threadgroup_barrier(mem_flags::mem_threadgroup);
+            if (tiisg == 0) {
                y[0] = sumf;
            }
        }
        // recurse
        s0 = s;