CUDA: replace GGML_CUDA_F16 with CUDA arch checks (#15433)
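The change swaps a build-time opt-in (the GGML_CUDA_F16 compile definition) for a macro derived from the compute capability being compiled for. A minimal sketch of how such an arch-gated macro can be defined, assuming fast half2 math needs compute capability 6.0+ with 6.1 excluded for its poor FP16 throughput; the exact condition in ggml's CUDA common header may differ:

// Sketch, not the actual ggml definition: gate fast-FP16 code paths on
// the architecture currently being compiled (__CUDA_ARCH__ is only
// defined during device compilation passes).
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // FAST_FP16_AVAILABLE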
@@ -87,7 +87,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
         sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
     }
 
-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
     const float2 tmp = __half22float2(__hmul2(dm4, ds8));
     const float d4d8 = tmp.x;
     const float m4s8 = tmp.y;
@@ -96,7 +96,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     const float2 ds8f = __half22float2(ds8);
     const float d4d8 = dm4f.x * ds8f.x;
     const float m4s8 = dm4f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE
 
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
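Both branches above compute the same pair of products, d4*d8 and m4*s8: the FAST_FP16_AVAILABLE path multiplies the packed half2 scales before widening, while the fallback widens first and multiplies in float. A self-contained sketch of the two paths (the helper name scale_pair is hypothetical; the dm4/ds8 names and intrinsics are taken from the hunks):

#include <cuda_fp16.h>

// Compute (d4*d8, m4*s8) from two packed half2 scale pairs, mirroring
// the two branches of the hunks above.
static __device__ __forceinline__ float2 scale_pair(const half2 dm4, const half2 ds8) {
#ifdef FAST_FP16_AVAILABLE
    return __half22float2(__hmul2(dm4, ds8)); // multiply in half, widen once
#else
    const float2 dm4f = __half22float2(dm4);  // widen first,
    const float2 ds8f = __half22float2(ds8);  // then multiply in float
    return make_float2(dm4f.x * ds8f.x, dm4f.y * ds8f.y);
#endif // FAST_FP16_AVAILABLE
}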
@@ -158,7 +158,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
         sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
     }
 
-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
     const float2 tmp = __half22float2(__hmul2(dm5, ds8));
     const float d5d8 = tmp.x;
     const float m5s8 = tmp.y;
@@ -167,7 +167,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     const float2 ds8f = __half22float2(ds8);
     const float d5d8 = dm5f.x * ds8f.x;
     const float m5s8 = dm5f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE
 
     // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
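Each inner loop accumulates with ggml_cuda_dp4a: a 4-way dot product of packed signed 8-bit values with a 32-bit accumulator. A reference sketch of what that operation computes (this portable body is an assumption; on supporting architectures the wrapper presumably maps to the __dp4a intrinsic):

#include <cstdint>

// Sketch: a and b each pack four signed 8-bit values; their pairwise
// products are added to c.
static __device__ __forceinline__ int dp4a_ref(const int a, const int b, int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        c += a8[i] * b8[i];
    }
    return c;
}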
@@ -201,7 +201,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
         sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
     }
 
-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
     const float2 tmp = __half22float2(__hmul2(dm8, ds8));
     const float d8d8 = tmp.x;
     const float m8s8 = tmp.y;
@@ -210,7 +210,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     const float2 ds8f = __half22float2(ds8);
     const float d8d8 = dm8f.x * ds8f.x;
     const float m8s8 = dm8f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE
 
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
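In all three return statements the offset term is divided by the number of threads that cooperate on one block, as the source comments note. Each thread holds a disjoint slice of the integer dot product, so the sumi*d·d parts sum to the right value on their own, but every thread would otherwise add the full m·s offset. Dividing by QI8_1/vdr (or QI5_1/vdr, or QI8_1/(vdr*QR4_1)) means that when the per-thread partials are reduced, the offset is counted exactly once; for example, if QI8_1/vdr evaluates to 4, four threads each contribute m8s8/4.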