From e427af75fb27682167218d8a1d2fea13e8fe0e22 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 8 Jul 2025 23:19:16 +0200 Subject: [PATCH] add more simd --- ggml/src/ggml-cpu/vec.h | 66 +++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index e0109be51d..78c7ed2d15 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -352,27 +352,61 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int } inline static void ggml_vec_mad1_f32(const int n, float * y, const float s, const float b) { -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); +#if defined(GGML_USE_ACCELERATE) + vDSP_vsmul(y, 1, &s, y, 1, n); + vDSP_vsadd(y, 1, &b, y, 1, n); +#elif defined(GGML_SIMD) + #if defined(__ARM_FEATURE_SVE) + const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; + const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16 + const int ggml_f32_step = 2 * ggml_f32_epr; - GGML_F32_VEC vs = GGML_F32_VEC_SET1(s); - GGML_F32_VEC vb = GGML_F32_VEC_SET1(b); + GGML_F32_VEC vs = GGML_F32_VEC_SET1(s); + GGML_F32_VEC vb = GGML_F32_VEC_SET1(b); - GGML_F32_VEC ay[GGML_F32_ARR]; + const int np = (n & ~(ggml_f32_step - 1)); + svfloat32_t ay1; + svfloat32_t ay2; + for (int i = 0; i < np; i += ggml_f32_step) { + ay1 = GGML_F32_VEC_LOAD(y + i); + ay1 = GGML_F32_VEC_FMA(ay1, vs, vb); + GGML_F32_VEC_STORE(y + i, ay1); - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb); - - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); + ay2 = GGML_F32_VEC_FMA(ay2, vs, vb); + GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2); } - } + // leftovers + // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only + if (np < n) { + svbool_t pg = svwhilelt_b32(np, n); + ay1 = svld1_f32(pg, y + np); + ay1 = svmul_f32_m(pg, ay1, vs); + ay1 = svadd_f32_m(pg, ay1, vb); + svst1_f32(pg, y + np, ay1); + } + #else + const int np = (n & ~(GGML_F32_STEP - 1)); - // leftovers - for (int i = np; i < n; ++i) { - y[i] = y[i]*s + b; - } + GGML_F32_VEC vs = GGML_F32_VEC_SET1(s); + GGML_F32_VEC vb = GGML_F32_VEC_SET1(b); + + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb); + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] = y[i]*s + b; + } + #endif #else // scalar for (int i = 0; i < n; ++i) {