From e427af75fb27682167218d8a1d2fea13e8fe0e22 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Tue, 8 Jul 2025 23:19:16 +0200
Subject: [PATCH] add more simd

---
 ggml/src/ggml-cpu/vec.h | 66 +++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
index e0109be51d..78c7ed2d15 100644
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -352,27 +352,61 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
 }
 
 inline static void ggml_vec_mad1_f32(const int n, float * y, const float s, const float b) {
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmul(y, 1, &s, y, 1, n);
+    vDSP_vsadd(y, 1, &b, y, 1, n);
+#elif defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 2 * ggml_f32_epr;
 
-    GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
-    GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
+        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
+        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
 
-    GGML_F32_VEC ay[GGML_F32_ARR];
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t ay1;
+        svfloat32_t ay2;
+        for (int i = 0; i < np; i += ggml_f32_step) {
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_FMA(ay1, vs, vb);
+            GGML_F32_VEC_STORE(y + i, ay1);
 
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_FMA(ay2, vs, vb);
+            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
         }
-    }
+        // leftovers
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np < n) {
+            svbool_t pg = svwhilelt_b32(np, n);
+            ay1 = svld1_f32(pg, y + np);
+            ay1 = svmul_f32_m(pg, ay1, vs);
+            ay1 = svadd_f32_m(pg, ay1, vb);
+            svst1_f32(pg, y + np, ay1);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
 
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i]  = y[i]*s + b;
-    }
+        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
+        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
+
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i]  = y[i]*s + b;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {