mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	ggml : simplify Arm fp16 CPU logic (ggml/1177)
* ggml : simlpify Arm fp16 CPU logic ggml-ci * cont : bring back CUDA/MUSA checks ggml-ci
This commit is contained in:
		| @@ -4,13 +4,13 @@ | ||||
|  | ||||
| #include "ggml.h" | ||||
| #include "ggml-impl.h" | ||||
|  | ||||
| #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ | ||||
| //#include <stddef.h> | ||||
| #include <stdbool.h> | ||||
| #include <string.h> // memcpy | ||||
| #include <math.h>   // fabsf | ||||
|  | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| extern "C" { | ||||
| #endif | ||||
| @@ -69,33 +69,16 @@ struct ggml_compute_params { | ||||
| #endif | ||||
|  | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
| #include <arm_sve.h> | ||||
| #include <sys/prctl.h> | ||||
| #endif | ||||
|  | ||||
| // 16-bit float | ||||
| // on Arm, we use __fp16 | ||||
| // on x86, we use uint16_t | ||||
| #if defined(__ARM_NEON) | ||||
|  | ||||
| // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: | ||||
| // | ||||
| //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ | ||||
| // | ||||
| #include <arm_neon.h> | ||||
|  | ||||
| // ref: https://github.com/ggml-org/llama.cpp/pull/5404 | ||||
| #ifdef _MSC_VER | ||||
|  | ||||
| typedef uint16_t ggml_fp16_internal_t; | ||||
|  | ||||
| #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } | ||||
|  | ||||
| #else | ||||
|  | ||||
| typedef __fp16 ggml_fp16_internal_t; | ||||
|  | ||||
| #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } | ||||
|  | ||||
| #endif // _MSC_VER | ||||
|  | ||||
| #if !defined(__aarch64__) | ||||
|   | ||||
| @@ -71,7 +71,7 @@ | ||||
|     #define GGML_F16x8              float16x8_t | ||||
|     #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f) | ||||
|     #define GGML_F16x8_SET1(x)      vdupq_n_f16(x) | ||||
|     #define GGML_F16x8_LOAD(x)      vld1q_f16((const ggml_fp16_internal_t *)(x)) | ||||
|     #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x)) | ||||
|     #define GGML_F16x8_STORE        vst1q_f16 | ||||
|     #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) | ||||
|     #define GGML_F16x8_ADD          vaddq_f16 | ||||
| @@ -99,7 +99,7 @@ | ||||
|     #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO | ||||
|     #define GGML_F16_VEC_SET1           GGML_F16x8_SET1 | ||||
|     #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p) | ||||
|     #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i]) | ||||
|     #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i]) | ||||
|     #define GGML_F16_VEC_FMA            GGML_F16x8_FMA | ||||
|     #define GGML_F16_VEC_ADD            GGML_F16x8_ADD | ||||
|     #define GGML_F16_VEC_MUL            GGML_F16x8_MUL | ||||
| @@ -114,7 +114,7 @@ | ||||
|     #define GGML_F32Cx4              float32x4_t | ||||
|     #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f) | ||||
|     #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x) | ||||
|     #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x))) | ||||
|     #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) | ||||
|     #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y)) | ||||
|     #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) | ||||
|     #define GGML_F32Cx4_ADD          vaddq_f32 | ||||
| @@ -125,7 +125,7 @@ | ||||
|     #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO | ||||
|     #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1 | ||||
|     #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p) | ||||
|     #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i]) | ||||
|     #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i]) | ||||
|     #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA | ||||
|     #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD | ||||
|     #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL | ||||
|   | ||||
| @@ -16,14 +16,6 @@ | ||||
| #include <arm_sve.h> | ||||
| #endif // __ARM_FEATURE_SVE | ||||
|  | ||||
| #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) | ||||
| // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: | ||||
| // | ||||
| //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ | ||||
| // | ||||
| #include <arm_neon.h> | ||||
| #endif | ||||
|  | ||||
| #if defined(__F16C__) | ||||
| #include <immintrin.h> | ||||
| #endif | ||||
| @@ -311,29 +303,35 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); | ||||
|  | ||||
| // FP16 to FP32 conversion | ||||
|  | ||||
| #if defined(__ARM_NEON) | ||||
|     #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) | ||||
|         typedef uint16_t ggml_fp16_internal_t; | ||||
|     #else | ||||
|         typedef __fp16 ggml_fp16_internal_t; | ||||
|     #endif | ||||
| #endif | ||||
| // 16-bit float | ||||
| // on Arm, we use __fp16 | ||||
| // on x86, we use uint16_t | ||||
| // | ||||
| // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 | ||||
| // for     MUSA compilers        , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 | ||||
| // | ||||
| #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) | ||||
|  | ||||
|     // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: | ||||
|     // | ||||
|     //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ | ||||
|     // | ||||
|     #include <arm_neon.h> | ||||
|  | ||||
| #if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) | ||||
|     #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) | ||||
|     #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) | ||||
|  | ||||
|     #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) | ||||
|  | ||||
|     static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { | ||||
|         ggml_fp16_internal_t tmp; | ||||
|         __fp16 tmp; | ||||
|         memcpy(&tmp, &h, sizeof(ggml_fp16_t)); | ||||
|         return (float)tmp; | ||||
|     } | ||||
|  | ||||
|     static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { | ||||
|         ggml_fp16_t res; | ||||
|         ggml_fp16_internal_t tmp = f; | ||||
|         __fp16 tmp = f; | ||||
|         memcpy(&res, &tmp, sizeof(ggml_fp16_t)); | ||||
|         return res; | ||||
|     } | ||||
| @@ -485,7 +483,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); | ||||
|     #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) | ||||
|     #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) | ||||
|  | ||||
| #endif // defined(__ARM_NEON) && (!defined(__MSC_VER) | ||||
| #endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) | ||||
|  | ||||
| // precomputed f32 table for f16 (256 KB) | ||||
| // defined in ggml.c, initialized in ggml_init() | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov