	CUDA: fix softmax compile for old CUDA versions (#4862)
Author: Johannes Gäßler

 ggml-cuda.cu | 34 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 16 deletions(-)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -116,6 +116,8 @@
 #include "ggml.h"
 #include "ggml-backend-impl.h"
 
+#define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
+
 #define CC_PASCAL     600
 #define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA      700
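For context, not part of the diff: the CUDA runtime headers define CUDART_VERSION as 1000*major + 10*minor, so CUDA 11.7 encodes as 11070, which is where the value of CUDART_HMAX comes from. A minimal sketch of how such a toolkit gate reads:

// Illustrative sketch only: CUDART_VERSION comes from cuda_runtime_api.h
// and encodes the toolkit version as 1000*major + 10*minor.
#include <cuda_runtime_api.h>

#if CUDART_VERSION >= 11070 // CUDA 11.7 or newer
    // safe to reference the __hmax/__hmax2 half-precision intrinsics here
#endif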
@@ -605,16 +607,16 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 }
 
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-    (void) a;
-    bad_arch();
-#else
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
     }
     return a;
-#endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#else
+    (void) a;
+    bad_arch();
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }
 
 static __device__ __forceinline__ float warp_reduce_max(float x) {
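The loop being guarded above is the standard XOR-butterfly warp reduction: each halving of mask makes __shfl_xor_sync exchange values between lanes whose IDs differ in exactly one bit, so after log2(32) = 5 steps every lane holds the sum over all 32 lanes. A plain-float sketch of the same pattern (warp_reduce_sum_f32 is an illustrative name, not the file's own overload):

// Butterfly warp reduction for float: after the 5 shuffle steps
// (mask = 16, 8, 4, 2, 1) all 32 lanes hold the full warp sum.
static __device__ __forceinline__ float warp_reduce_sum_f32(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}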
@@ -626,16 +628,16 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 }
 
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-    (void) x;
-    bad_arch();
-#else
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
     }
     return x;
-#endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#else
+    (void) x;
+    bad_arch();
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }
 
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
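Not part of this commit, but it shows what the new CUDART_VERSION guard is working around: on toolkits older than CUDA 11.7 a per-component half2 maximum can still be emulated by widening to float and converting back. A hypothetical compat helper (hmax2_compat is an invented name):

#include <cuda_fp16.h>

// Hypothetical fallback for toolkits without __hmax2: widen each half
// to float, take the component-wise maximum, and narrow again.
static __device__ __forceinline__ half2 hmax2_compat(const half2 a, const half2 b) {
    const float2 fa = __half22float2(a);
    const float2 fb = __half22float2(b);
    return __floats2half2_rn(fmaxf(fa.x, fb.x), fmaxf(fa.y, fb.y));
}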
@@ -5613,7 +5615,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
 
 template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
 static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
     const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
     const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
 
@@ -5738,7 +5740,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
 #else
     (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
     bad_arch();
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }
 
 template <bool vals_smem, int ncols_template, int block_size_template>
@@ -8574,15 +8576,15 @@ static void ggml_cuda_op_soft_max(
     float scale = 1.0f;
     memcpy(&scale, dst->op_params, sizeof(float));
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    const bool use_f16_soft_max = false;
-#else
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
 #ifdef GGML_CUDA_F16
     const bool use_f16_soft_max = true;
 #else
     const bool use_f16_soft_max = false;
 #endif // GGML_CUDA_F16
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#else
+    const bool use_f16_soft_max = false;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
 
     if (use_f16_soft_max) {
         soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
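Paraphrasing the host-side selection after this change: soft_max_f16_cuda is only dispatched on non-HIP builds whose toolkit is new enough for __hmax2, and then only when GGML_CUDA_F16 is enabled; every other configuration takes the f32 path. The nested conditionals collapse to a single guard (an illustrative rewrite, not the commit's code):

// Equivalent single-condition form of the nested #if blocks above.
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) \
        && CUDART_VERSION >= CUDART_HMAX && defined(GGML_CUDA_F16)
    const bool use_f16_soft_max = true;  // f16 softmax kernel is compiled and used
#else
    const bool use_f16_soft_max = false; // fall back to the f32 softmax kernel
#endif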