Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-04 09:32:00 +00:00).

Commit: ggml : LoongArch fixes (#16958)
* Fix test-quantize-fns f16 and q4_0 failures when using LSX
* Fix the LoongArch float-broadcast intrinsics when using LSX/LASX

This commit is contained in:
		@@ -700,7 +700,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 | 
			
		||||
    for (; ib + 1 < nb; ib += 2) {
 | 
			
		||||
 | 
			
		||||
        // Compute combined scale for the block 0 and 1
 | 
			
		||||
        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
 | 
			
		||||
        const float ft0 = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
 | 
			
		||||
        const __m128 d_0_1 = (__m128)(v4f32){ft0, ft0, ft0, ft0};
 | 
			
		||||
 | 
			
		||||
        const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
 | 
			
		||||
 | 
			
		||||
@@ -714,11 +715,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 | 
			
		||||
        bx_1 = __lsx_vsub_b(bx_1, off);
 | 
			
		||||
        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
 | 
			
		||||
 | 
			
		||||
        //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
 | 
			
		||||
        //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
 | 
			
		||||
 | 
			
		||||
        // Compute combined scale for the block 2 and 3
 | 
			
		||||
        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
 | 
			
		||||
        const float ft1 = GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d);
 | 
			
		||||
        const __m128 d_2_3 = (__m128)(v4f32){ft1, ft1, ft1, ft1};
 | 
			
		||||
 | 
			
		||||
        const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -500,13 +500,15 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if defined(__loongarch_asx)
 | 
			
		||||
#if defined(__loongarch_sx)
 | 
			
		||||
/* float type data load instructions */
 | 
			
		||||
static __m128 __lsx_vreplfr2vr_s(const float val) {
 | 
			
		||||
    v4f32 res = {val, val, val, val};
 | 
			
		||||
    return (__m128)res;
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if defined(__loongarch_asx)
 | 
			
		||||
static __m256 __lasx_xvreplfr2vr_s(const float val) {
 | 
			
		||||
    v8f32 res = {val, val, val, val, val, val, val, val};
 | 
			
		||||
    return (__m256)res;
 | 
			
		||||
 
 | 
			
		||||
@@ -956,7 +956,7 @@ do {                                                              \
 | 
			
		||||
 | 
			
		||||
#define GGML_F32Cx8          __m256
 | 
			
		||||
#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
 | 
			
		||||
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 | 
			
		||||
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
 | 
			
		||||
 | 
			
		||||
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
 | 
			
		||||
    __m256i a;
 | 
			
		||||
@@ -999,34 +999,34 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 | 
			
		||||
 | 
			
		||||
#define GGML_F32x4         __m128
 | 
			
		||||
#define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
 | 
			
		||||
#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
 | 
			
		||||
#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
 | 
			
		||||
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
 | 
			
		||||
#define GGML_F32x4_STORE(x, y)   __lsx_vst(y, x, 0)
 | 
			
		||||
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 | 
			
		||||
#define GGML_F32x4_ADD     __lsx_vfadd_s
 | 
			
		||||
#define GGML_F32x4_MUL     __lsx_vfmul_s
 | 
			
		||||
#define GGML_F32x4_REDUCE(res, x)                                                     \
 | 
			
		||||
{                                                                                     \
 | 
			
		||||
    int offset = GGML_F32_ARR >> 1;                                                   \
 | 
			
		||||
    for (int i = 0; i < offset; ++i) {                                                \
 | 
			
		||||
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
 | 
			
		||||
    }                                                                                 \
 | 
			
		||||
    offset >>= 1;                                                                     \
 | 
			
		||||
    for (int i = 0; i < offset; ++i) {                                                \
 | 
			
		||||
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
 | 
			
		||||
    }                                                                                 \
 | 
			
		||||
    offset >>= 1;                                                                     \
 | 
			
		||||
    for (int i = 0; i < offset; ++i) {                                                \
 | 
			
		||||
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
 | 
			
		||||
    }                                                                                 \
 | 
			
		||||
    __m128i tmp     = __lsx_vsrli_d((__m128i) x[0], 32);                              \
 | 
			
		||||
    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]);                    \
 | 
			
		||||
    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
 | 
			
		||||
    const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88);                                     \
 | 
			
		||||
    tmp             = __lsx_vsrli_d((__m128i) t0, 32);                                \
 | 
			
		||||
    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, t0);                      \
 | 
			
		||||
    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
 | 
			
		||||
    res             = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
 | 
			
		||||
 | 
			
		||||
/*
 * Sum an array x of GGML_F32_ARR LSX float vectors into the scalar res.
 * The array is first folded pairwise down to x[0] (three halving passes),
 * then x[0]'s four lanes are reduced with two even/odd pick-and-add steps.
 */
#define GGML_F32x4_REDUCE(res, x)                                   \
{                                                                   \
    int half = GGML_F32_ARR >> 1;                                   \
    for (int k = 0; k < half; ++k) {                                \
        x[k] = __lsx_vfadd_s(x[k], x[half + k]);                    \
    }                                                               \
    half >>= 1;                                                     \
    for (int k = 0; k < half; ++k) {                                \
        x[k] = __lsx_vfadd_s(x[k], x[half + k]);                    \
    }                                                               \
    half >>= 1;                                                     \
    for (int k = 0; k < half; ++k) {                                \
        x[k] = __lsx_vfadd_s(x[k], x[half + k]);                    \
    }                                                               \
    /* even lanes {0,2} + odd lanes {1,3} -> two partial sums */    \
    __m128i ev0  = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]);   \
    __m128i od0  = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]);   \
    __m128  pair = __lsx_vfadd_s((__m128)ev0, (__m128)od0);         \
    /* fold the two partial sums into lane 0 */                     \
    __m128i ev1  = __lsx_vpickev_w((__m128i)pair, (__m128i)pair);   \
    __m128i od1  = __lsx_vpickod_w((__m128i)pair, (__m128i)pair);   \
    __m128  tot  = __lsx_vfadd_s((__m128)ev1, (__m128)od1);         \
    res = (ggml_float) ((v4f32)tot)[0];                             \
}
 | 
			
		||||
 | 
			
		||||
#define GGML_F32_VEC        GGML_F32x4
 | 
			
		||||
@@ -1068,7 +1068,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 | 
			
		||||
 | 
			
		||||
#define GGML_F32Cx4             __m128
 | 
			
		||||
#define GGML_F32Cx4_ZERO        (__m128)__lsx_vldi(0)
 | 
			
		||||
#define GGML_F32Cx4_SET1(x)     (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
 | 
			
		||||
#define GGML_F32Cx4_SET1(x)     (__m128)__lsx_vreplfr2vr_s((x))
 | 
			
		||||
#define GGML_F32Cx4_LOAD(x)     (__m128)__lsx_f16x4_load(x)
 | 
			
		||||
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 | 
			
		||||
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user