CUDA: replace GGML_CUDA_F16 with CUDA arch checks (#15433)

Johannes Gäßler
2025-08-20 16:58:49 +02:00
committed by GitHub
parent fec9519802
commit 7a6e91ad26
12 changed files with 32 additions and 86 deletions
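
The motivation: GGML_CUDA_F16 was an opt-in build flag, so the half2 fast path was only compiled when the user enabled it. FAST_FP16_AVAILABLE is instead driven by a CUDA architecture check, so the fast path is selected automatically when compiling for hardware with fast FP16 arithmetic. A minimal sketch of what such an arch-gated macro can look like (the cutoff below is an assumption for illustration, not the actual definition in the ggml CUDA headers):

// Hypothetical sketch, not ggml's actual definition: gate the fast FP16 path on
// the architecture being compiled for instead of on a user-set compile flag.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 // assumed cutoff; the real check may differ
#define FAST_FP16_AVAILABLE
#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700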

@@ -87,7 +87,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
         sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
     }
 
-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
     const float2 tmp = __half22float2(__hmul2(dm4, ds8));
     const float d4d8 = tmp.x;
     const float m4s8 = tmp.y;
@@ -96,7 +96,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     const float2 ds8f = __half22float2(ds8);
     const float d4d8 = dm4f.x * ds8f.x;
     const float m4s8 = dm4f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE
 
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
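
Aside from the macro rename, the return statement above relies on a small arithmetic trick: several threads cooperate on one quantized block and each executes this return, so the constant offset m4s8 is pre-divided by the thread count QI8_1/(vdr * QR4_1) and is therefore counted exactly once when the per-thread results are summed. A hedged host-side illustration (the constants are arbitrary stand-ins, not ggml's values):

#include <cstdio>

// Illustration of the "compensate for multiple threads adding it" trick:
// N partial results each carry m4s8/N, so their sum contains m4s8 exactly once.
// N stands in for QI8_1/(vdr * QR4_1); all numbers here are arbitrary examples.
int main() {
    const int   N    = 4;
    const float d4d8 = 0.5f, m4s8 = 2.0f;
    const int   partial_sumi[4] = {3, 1, 4, 1};

    float total = 0.0f;
    int   sumi  = 0;
    for (int i = 0; i < N; ++i) {
        total += partial_sumi[i]*d4d8 + m4s8/N; // what each cooperating thread returns
        sumi  += partial_sumi[i];
    }
    std::printf("%f == %f\n", total, sumi*d4d8 + m4s8); // the offset appears once overall
    return 0;
}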
@@ -158,7 +158,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
         sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
     }
 
-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
     const float2 tmp = __half22float2(__hmul2(dm5, ds8));
     const float d5d8 = tmp.x;
     const float m5s8 = tmp.y;
@@ -167,7 +167,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     const float2 ds8f = __half22float2(ds8);
     const float d5d8 = dm5f.x * ds8f.x;
     const float m5s8 = dm5f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE
 
     // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
@@ -201,7 +201,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
         sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
     }
 
-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
     const float2 tmp = __half22float2(__hmul2(dm8, ds8));
     const float d8d8 = tmp.x;
     const float m8s8 = tmp.y;
@@ -210,7 +210,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     const float2 ds8f = __half22float2(ds8);
     const float d8d8 = dm8f.x * ds8f.x;
     const float m8s8 = dm8f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE
 
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
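
For reference, a hedged side-by-side of the two code paths the FAST_FP16_AVAILABLE guard selects between, reduced to the part that differs. The helper names and standalone form are illustrative only; in the kernels the computation is inlined as in the hunks above. Both branches produce the same pair of scale products; the FP16 branch fuses the two multiplies into one packed half2 multiply at reduced precision.

#include <cuda_fp16.h>

// Illustrative helpers (not part of the patch): dm and ds8 each pack two per-block
// scaling factors as half2; both branches return (dm.x*ds8.x, dm.y*ds8.y) as float2.
static __device__ __forceinline__ float2 scale_pair_fp32(const half2 dm, const half2 ds8) {
    const float2 dmf  = __half22float2(dm);
    const float2 ds8f = __half22float2(ds8);
    return make_float2(dmf.x*ds8f.x, dmf.y*ds8f.y); // two separate float multiplies
}

static __device__ __forceinline__ float2 scale_pair_fp16(const half2 dm, const half2 ds8) {
    return __half22float2(__hmul2(dm, ds8)); // one packed half2 multiply, then convert
}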