diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index a427de404e..7c50271732 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -4671,10 +4671,7 @@ static void ggml_compute_forward_scale_f32(
             // src0 is same shape as dst => same indices
             memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
         }
-        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), s);
-        if (b != 0.0f) {
-            ggml_vec_acc1_f32(nc, (float *) ((char *) dst->data + i1*nb1), b);
-        }
+        ggml_vec_mad1_f32(nc, (float *) ((char *) dst->data + i1*nb1), s, b);
     }
 }
 
diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
index 1f5857a23e..e0109be51d 100644
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -351,6 +351,36 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
 #endif
 }
 
+// In-place scale-and-bias: y[i] = y[i]*s + b for every i in [0, n).
+// All code paths (SIMD body, SIMD leftovers, scalar fallback) must produce this same result.
+inline static void ggml_vec_mad1_f32(const int n, float * y, const float s, const float b) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+    GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
+    GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            // FMA(a, b, c) evaluates a + b*c, so the bias is the addend and y*s the product
+            ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers: elements past the last full SIMD step
+    for (int i = np; i < n; ++i) {
+        y[i] = y[i]*s + b;
+    }
+#else
+    // scalar fallback: plain assignment (not *=), matching the leftovers loop above
+    for (int i = 0; i < n; ++i) {
+        y[i] = y[i]*s + b;
+    }
+#endif
+}
+
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
 #if defined(GGML_USE_ACCELERATE)