mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-16 11:27:03 +00:00
CUDA: larger SRAM reads for tile FA, AMD FP16 dot (#15927)
* CUDA: larger SRAM reads for tile FA, AMD FP16 dot * fix logic for availability of v_dot2_f32_f16
This commit is contained in:
@@ -555,7 +555,7 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
|
||||
#if defined(GGML_USE_HIP) && defined(GCN)
|
||||
#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
|
||||
asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u));
|
||||
#else
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
@@ -567,7 +567,21 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
|
||||
acc += tmpv.x * tmpu.x;
|
||||
acc += tmpv.y * tmpu.y;
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
#endif // defined(GGML_USE_HIP) && defined(GCN)
|
||||
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
|
||||
}
|
||||
|
||||
// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
|
||||
template <int nbytes>
|
||||
static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
|
||||
if constexpr (nbytes == 4) {
|
||||
*(int *) dst = *(const int *) src;
|
||||
} else if constexpr (nbytes == 8) {
|
||||
*(int2 *) dst = *(const int2 *) src;
|
||||
} else if constexpr (nbytes == 16) {
|
||||
*(int4 *) dst = *(const int4 *) src;
|
||||
} else {
|
||||
static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
|
||||
|
||||
Reference in New Issue
Block a user