ggml : fix loongarch lsx compilation error (#15864)
@@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
     return ((v4f32)res)[0];
 }
 
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors
+    const __m128i ax = __lsx_vsigncov_b(x, x);
+    // Sign the values of the y vectors
+    const __m128i sy = __lsx_vsigncov_b(x, y);
+    // Perform multiplication and create 16-bit values
+    const __m128i dot = lsx_maddubs_h(ax, sy);
+    const __m128i ones = __lsx_vreplgr2vr_h(1);
+    return lsx_madd_h(ones, dot);
+}
+
 #endif
 
 #if defined(__loongarch_asx)
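For readers unfamiliar with the sign/madd idiom, here is a scalar sketch of what mul_sum_i8_pairs computes; the helper name and loop shape are illustrative, not part of the patch. Because |x[i]| * (sign(x[i]) * y[i]) == x[i] * y[i], the function reduces each group of four int8 products into one int32 lane.

    #include <stdint.h>

    // Scalar reference for mul_sum_i8_pairs (illustrative only): each of the
    // four int32 output lanes holds the sum of four adjacent int8 products,
    // which is what lsx_maddubs_h followed by lsx_madd_h with a vector of
    // ones produces.
    static void mul_sum_i8_pairs_ref(const int8_t x[16], const int8_t y[16],
                                     int32_t out[4]) {
        for (int g = 0; g < 4; g++) {
            int32_t acc = 0;
            for (int k = 0; k < 4; k++) {
                acc += (int32_t) x[4*g + k] * (int32_t) y[4*g + k];
            }
            out[g] = acc;
        }
    }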
@@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
     }
 }
 
-// multiply int8_t, add results pairwise twice
-static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
-    // Get absolute values of x vectors
-    const __m128i ax = __lsx_vsigncov_b(x, x);
-    // Sign the values of the y vectors
-    const __m128i sy = __lsx_vsigncov_b(x, y);
-    // Perform multiplication and create 16-bit values
-    const __m128i dot = lsx_maddubs_h(ax, sy);
-    const __m128i ones = __lsx_vreplgr2vr_h(1);
-    return lsx_madd_h(ones, dot);
-}
-
 // horizontally add 8 floats
 static inline float hsum_float_8(const __m256 x) {
     __m128 res = lasx_extractf128(x, 1);
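Taken together with the previous hunk, this removal moves mul_sum_i8_pairs out of the 256-bit LASX section and into the 128-bit LSX section, so the helper is defined whenever the LSX-only path compiles; that missing definition is presumably the compilation error the commit title refers to. A schematic of the guard layout after the patch (surrounding code elided; the LSX guard name is an assumption based on the compiler's feature macros):

    #if defined(__loongarch_sx)   // LSX feature macro, predefined by GCC/Clang
    // ... 128-bit helpers, now including mul_sum_i8_pairs ...
    #endif

    #if defined(__loongarch_asx)
    // ... 256-bit helpers; mul_sum_i8_pairs is no longer duplicated here ...
    #endif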
@@ -998,9 +998,9 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 #define GGML_F32_EPR 4
 
 #define GGML_F32x4 __m128
-#define GGML_F32x4_ZERO __lsx_vldi(0)
-#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
+#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
 #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD __lsx_vfadd_s
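The pattern in this hunk and the remaining ones is the same: LSX integer intrinsics such as __lsx_vldi, __lsx_vinsgr2vr_w, and __lsx_vld return the integer vector type __m128i, while GGML_F32x4 is the float vector type __m128, and without lax vector conversions the implicit conversion is rejected. A minimal sketch of the failure and the fix (not the patch itself):

    #include <lsxintrin.h>   // LSX intrinsics on LoongArch (GCC/Clang)

    static __m128 f32x4_zero(void) {
        // __m128 bad = __lsx_vldi(0);   // error: __m128i does not convert to __m128
        return (__m128)__lsx_vldi(0);    // explicit bit-cast, as the patch does
    }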
@@ -1022,7 +1022,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
     __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
     tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
     tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
+    const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
     tmp = __lsx_vsrli_d((__m128i) t0, 32); \
     tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
     tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
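These backslash-continued lines sit inside the F32x4 reduce macro: the 64-bit right shift brings the odd float lanes on top of the even ones, the add folds them, and the shuffle/pick steps repeat the fold, yielding the horizontal sum of the four lanes. A scalar sketch of the same result (hypothetical helper, shown only to document the computation):

    // Horizontal sum of a 4-float vector, the value the macro produces:
    // first fold gives (v0+v1, v2+v3), the second fold adds those.
    static float hsum_f32x4_ref(const float v[4]) {
        return (v[0] + v[1]) + (v[2] + v[3]);
    }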
@@ -1052,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
     tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
     tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
 
-    return __lsx_vld(tmp, 0);
+    return (__m128)__lsx_vld(tmp, 0);
 }
 
 static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
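The same __m128i-vs-__m128 mismatch applies here: __lsx_vld returns __m128i, so the fp16 loader's return value needs the cast. For context, the full helper presumably reads as follows after the patch (the tmp declaration and first two assignments are outside the hunk and inferred from the visible lines):

    static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
        float tmp[4];
        // widen four fp16 values to fp32 on the stack ...
        tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
        tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
        tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
        tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
        // ... then load them as one float vector; the cast fixes the
        // __m128i return type of __lsx_vld
        return (__m128)__lsx_vld(tmp, 0);
    }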
@@ -1067,9 +1067,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 }
 
 #define GGML_F32Cx4 __m128
-#define GGML_F32Cx4_ZERO __lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
+#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
+#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
 #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 #define GGML_F32Cx4_FMA GGML_F32x4_FMA
 #define GGML_F32Cx4_ADD __lsx_vfadd_s
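The GGML_F32Cx4 family mirrors GGML_F32x4 but loads from and stores to fp16 memory while computing in fp32. A hedged sketch of how such macros are typically used; the loop and the hsum_float_4 helper are illustrative, not code from ggml (note that GGML_F32x4_FMA(a, b, c) expands to __lsx_vfmadd_s(b, c, a), i.e. b*c + a):

    // Illustrative dot product over fp16 arrays using the F32Cx4 macros.
    static float dot_f16_ref(int n, const ggml_fp16_t * x, const ggml_fp16_t * y) {
        __m128 acc = GGML_F32Cx4_ZERO;
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            __m128 vx = GGML_F32Cx4_LOAD(x + i);   // fp16 -> fp32 vector load
            __m128 vy = GGML_F32Cx4_LOAD(y + i);
            acc = GGML_F32Cx4_FMA(acc, vx, vy);    // acc += vx * vy
        }
        float sum = hsum_float_4(acc);             // hypothetical horizontal-add helper
        for (; i < n; i++) {                       // scalar tail
            sum += GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]);
        }
        return sum;
    }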