Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
			
		
		
		
	CUDA: Fix clang warnings (#12540)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
This commit is contained in:
		| @@ -243,14 +243,14 @@ static bool fp16_mma_available(const int cc) { | |||||||
| #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) | #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) | ||||||
|     return false; |     return false; | ||||||
| #else | #else | ||||||
|     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA || |     return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || | ||||||
|         GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); |         GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); | ||||||
| #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) | #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) | ||||||
| } | } | ||||||
|  |  | ||||||
| // To be used for feature selection of external libraries, e.g. cuBLAS. | // To be used for feature selection of external libraries, e.g. cuBLAS. | ||||||
| static bool fp16_mma_hardware_available(const int cc) { | static bool fp16_mma_hardware_available(const int cc) { | ||||||
|     return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA || |     return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || | ||||||
|         GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); |         GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1192,7 +1192,7 @@ static void ggml_cuda_op_mul_mat_cublas( | |||||||
|  |  | ||||||
|     const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT; |     const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT; | ||||||
|  |  | ||||||
|     if (((cc >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) { |     if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) { | ||||||
|         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 |         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 | ||||||
|         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id)); |         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id)); | ||||||
|         if (src0->type != GGML_TYPE_F16) { |         if (src0->type != GGML_TYPE_F16) { | ||||||
|   | |||||||
| @@ -27,8 +27,8 @@ void ggml_cuda_op_mul_mat_q( | |||||||
|     // The stream-k decomposition is only faster for recent NVIDIA GPUs. |     // The stream-k decomposition is only faster for recent NVIDIA GPUs. | ||||||
|     // Also its fixup needs to allocate a temporary buffer in the memory pool. |     // Also its fixup needs to allocate a temporary buffer in the memory pool. | ||||||
|     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. |     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. | ||||||
|     const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && |     const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && | ||||||
|         GGML_CUDA_CC_IS_NVIDIA(cc) && src1_ncols == ne11; |         ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; | ||||||
|     const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; |     const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; | ||||||
|  |  | ||||||
|     switch (src0->type) { |     switch (src0->type) { | ||||||
|   | |||||||
| @@ -90,7 +90,7 @@ struct tile_x_sizes { | |||||||
|  |  | ||||||
| static int get_mmq_x_max_host(const int cc) { | static int get_mmq_x_max_host(const int cc) { | ||||||
|     return new_mma_available(cc) ? 128 : |     return new_mma_available(cc) ? 128 : | ||||||
|         ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc) ? |         GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ? | ||||||
| #ifdef GGML_CUDA_FORCE_MMQ | #ifdef GGML_CUDA_FORCE_MMQ | ||||||
|             128                     : 64; |             128                     : 64; | ||||||
| #else | #else | ||||||
| @@ -124,7 +124,7 @@ static constexpr __device__ int get_mmq_x_max_device() { | |||||||
|  |  | ||||||
| static int get_mmq_y_host(const int cc) { | static int get_mmq_y_host(const int cc) { | ||||||
|     return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) : |     return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) : | ||||||
|         ((ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) ? 128 : 64); |         ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64); | ||||||
| } | } | ||||||
|  |  | ||||||
| static constexpr __device__ int get_mmq_y_device() { | static constexpr __device__ int get_mmq_y_device() { | ||||||
| @@ -2832,7 +2832,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda | |||||||
|     const int mmq_x_max = get_mmq_x_max_host(cc); |     const int mmq_x_max = get_mmq_x_max_host(cc); | ||||||
|     const int mmq_y = get_mmq_y_host(cc); |     const int mmq_y = get_mmq_y_host(cc); | ||||||
|     const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; |     const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; | ||||||
|     const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc); |     const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; | ||||||
|  |  | ||||||
|     int mmq_x_best  = 0; |     int mmq_x_best  = 0; | ||||||
|     int nparts_best = INT_MAX; |     int nparts_best = INT_MAX; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 R0CKSTAR
					R0CKSTAR