mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	ggml: SVE support for exponential functions (#15145)
* SVE support for exponential functions Add const notation to variable pg * Update ggml/src/ggml-cpu/vec.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Add const --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
		| @@ -321,6 +321,12 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) { | ||||
|     for (; i + 3 < n; i += 4) { | ||||
|         _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i))); | ||||
|     } | ||||
| #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__) | ||||
|     const int vlen = svcntw(); | ||||
|     for (; i < n; i += vlen) { | ||||
|         const svbool_t pg = svwhilelt_b32_s32(i, n); | ||||
|         svst1_f32(pg, y + i, ggml_v_silu(pg, svld1_f32(pg, x + i))); | ||||
|     } | ||||
| #elif defined(__ARM_NEON) && defined(__aarch64__) | ||||
|     for (; i + 3 < n; i += 4) { | ||||
|         vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i))); | ||||
| @@ -345,6 +351,12 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * | ||||
|     for (; i + 3 < n; i += 4) { | ||||
|         _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i))); | ||||
|     } | ||||
| #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__) | ||||
|     const int vlen = svcntw(); | ||||
|     for (; i < n; i += vlen) { | ||||
|         const svbool_t pg = svwhilelt_b32_s32(i, n); | ||||
|         svst1_f32(pg, y + i, svmul_f32_x(pg, ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i))); | ||||
|     } | ||||
| #elif defined(__ARM_NEON) && defined(__aarch64__) | ||||
|     for (; i + 3 < n; i += 4) { | ||||
|         vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i))); | ||||
| @@ -392,6 +404,15 @@ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float | ||||
| #endif | ||||
|         sum += (ggml_float)_mm_cvtss_f32(val); | ||||
|     } | ||||
| #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__) | ||||
|     const int vlen = svcntw(); | ||||
|     for (; i < n; i += vlen) { | ||||
|         const svbool_t pg = svwhilelt_b32_s32(i, n); | ||||
|         svfloat32_t val = ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i), | ||||
|                                                 svdup_n_f32_x(pg, max))); | ||||
|         svst1_f32(pg, y + i, val); | ||||
|         sum += (ggml_float)svaddv_f32(pg, val); | ||||
|     } | ||||
| #elif defined(__ARM_NEON) && defined(__aarch64__) | ||||
|     for (; i + 3 < n; i += 4) { | ||||
|         float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i), | ||||
|   | ||||
| @@ -1002,7 +1002,39 @@ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/sr | ||||
|     } | ||||
| #endif | ||||
|  | ||||
| #if defined(__ARM_NEON) && defined(__aarch64__) | ||||
| #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__) | ||||
|  | ||||
| inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) { | ||||
|     const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f); | ||||
|     const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f); | ||||
|     const svfloat32_t n = svsub_f32_x(pg, z, r); | ||||
|     const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f); | ||||
|     const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23); | ||||
|     const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1)))); | ||||
|     const svbool_t c = svacgt_n_f32(pg, n, 126); | ||||
|     const svfloat32_t u = svmul_f32_x(pg, b, b); | ||||
|     const svfloat32_t j = svmla_f32_x(pg, | ||||
|         svmul_n_f32_x(pg, b, 0x1.ffffecp-1f), | ||||
|         svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b), | ||||
|                         svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u); | ||||
|     const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000); | ||||
|     const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000)); | ||||
|     const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d)); | ||||
|     return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1), | ||||
|                      svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j))); | ||||
| } | ||||
|  | ||||
| // computes silu x/(1+exp(-x)) in single precision vector | ||||
| inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) { | ||||
|     const svfloat32_t one = svdup_n_f32_x(pg, 1.0f); | ||||
|     const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f); | ||||
|     const svfloat32_t neg_x = svsub_f32_x(pg, zero, x); | ||||
|     const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x); | ||||
|     const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x); | ||||
|     return svdiv_f32_x(pg, x, one_plus_exp_neg_x); | ||||
| } | ||||
|  | ||||
| #elif defined(__ARM_NEON) && defined(__aarch64__) | ||||
|  | ||||
| // adapted from arm limited optimized routine | ||||
| // the maximum error is 1.45358 plus 0.5 ulps | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 s-goto-11
					s-goto-11