mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	HIP: Prepare reduction operators for wave 64
This commit is contained in:
		@@ -190,53 +190,46 @@ static __device__ void no_device_code(
 | 
			
		||||
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 | 
			
		||||
#endif // __CUDA_ARCH__
 | 
			
		||||
 | 
			
		||||
// Sum-reduce an int across the lanes of a (sub-)warp via XOR butterfly shuffles.
// width: number of participating lanes; must be a power of two <= WARP_SIZE.
// Defaults to WARP_SIZE so existing callers keep full-warp behavior; smaller
// widths reduce independent sub-groups (preparation for 64-wide HIP waves).
// On NVIDIA Ampere+ the single-instruction __reduce_add_sync is used instead
// (requires SM80+; integer only).
template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}
 | 
			
		||||
 | 
			
		||||
// Sum-reduce a float across the lanes of a (sub-)warp via XOR butterfly shuffles.
// width: number of participating lanes; must be a power of two <= WARP_SIZE
// (defaults to the full warp). After the loop every participating lane holds
// the sum of all lanes in its group.
template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
}
 | 
			
		||||
 | 
			
		||||
// Component-wise sum-reduce a float2 across the lanes of a (sub-)warp.
// width: number of participating lanes; must be a power of two <= WARP_SIZE
// (defaults to the full warp). The .x and .y components are reduced
// independently with the same XOR butterfly pattern as the scalar overloads.
template<int width = WARP_SIZE>
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
    }
    return a;
}
 | 
			
		||||
 | 
			
		||||
// Component-wise sum-reduce a half2 across the lanes of a (sub-)warp.
// width: number of participating lanes; must be a power of two <= WARP_SIZE
// (defaults to the full warp). The former HIP-specific branch that unpacked
// the halves with reinterpret_cast is gone: __hadd2 on the shuffled value is
// used on both CUDA and HIP, parameterized on width for wave-64 readiness.
// Without FP16 support this is host-unreachable device code (NO_DEVICE_CODE).
template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
    }
    return a;

#else
    NO_DEVICE_CODE;
    return a;
#endif // FP16_AVAILABLE
}
 | 
			
		||||
 | 
			
		||||
// Max-reduce a float across the lanes of a (sub-)warp via XOR butterfly
// shuffles. width: number of participating lanes; must be a power of two
// <= WARP_SIZE (defaults to the full warp). Every participating lane ends
// up with the maximum of its group.
template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
    return x;
}
 | 
			
		||||
@@ -269,35 +263,34 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Component-wise max of two half2 values, selecting the best available
// implementation per platform:
//   - HIP >= 5.7.0: scalar __hmax per component (HIP lacks __hmax2 there).
//   - CUDA with CUDART >= CUDART_HMAX: native __hmax2 instruction.
//   - older CUDA: fall back to fmaxf in float and repack to half.
//   - anything else: unreachable device code (NO_DEVICE_CODE).
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#elif !defined(GGML_USE_HIP)
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
#else
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
#endif
}
 | 
			
		||||
 | 
			
		||||
// Component-wise max-reduce a half2 across the lanes of a (sub-)warp.
// width: number of participating lanes; must be a power of two <= WARP_SIZE
// (defaults to the full warp). Enabled on NVIDIA Pascal+ (half2 max support)
// and now also on HIP >= 5.7.0, where ggml_cuda_hmax2 has a valid
// implementation; elsewhere it is unreachable device code.
template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
    return x;
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
}
 | 
			
		||||
 | 
			
		||||
#if CUDART_VERSION < CUDART_HMASK
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user