Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-03 09:22:01 +00:00)

CUDA: use arch list for compatibility check (#11775)

* CUDA: use arch list for feature availability check

Co-authored-by: Diego Devesa <slarengh@gmail.com>
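Background (not part of the commit itself): recent nvcc versions define __CUDA_ARCH_LIST__, in both the host and device compilation passes, as a comma-separated list of the compute capabilities the binary is being built for. The sketch below shows how such a list can be consumed by a variadic constexpr check in the style of the helpers added by this commit; MY_ARCH_LIST is a hypothetical stand-in for __CUDA_ARCH_LIST__ so the example compiles as plain C++.

#include <cstdio>

// Hypothetical stand-in for what nvcc would define via __CUDA_ARCH_LIST__,
// e.g. when building with -gencode for compute_61, compute_70 and compute_86.
#define MY_ARCH_LIST 610, 700, 860

// Base case: list exhausted without a match.
constexpr bool has_arch_impl(int) {
    return false;
}

// Peel off the first architecture and recurse over the rest.
template<class ... Archs>
constexpr bool has_arch_impl(const int arch, const int first, Archs... rest) {
    return arch == first || has_arch_impl(arch, rest...);
}

constexpr bool has_arch(const int arch) {
    return has_arch_impl(arch, MY_ARCH_LIST); // expands to (arch, 610, 700, 860)
}

int main() {
    static_assert( has_arch(700), "cc 7.0 was compiled in");
    static_assert(!has_arch(750), "cc 7.5 was not compiled in");
    printf("860: %d, 890: %d\n", has_arch(860), has_arch(890)); // prints "860: 1, 890: 0"
    return 0;
}

In the real header the same pattern backs ggml_cuda_has_arch() and ggml_cuda_highest_compiled_arch(), shown in the first hunk below.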
@@ -71,6 +71,47 @@
 #define GGML_CUDA_CC_QY1        210
 #define GGML_CUDA_CC_QY2        220
 
+#ifdef __CUDA_ARCH_LIST__
+constexpr bool ggml_cuda_has_arch_impl(int) {
+    return false;
+}
+
+template<class ... Archs>
+constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
+    return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
+}
+
+constexpr bool ggml_cuda_has_arch(const int arch) {
+    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
+}
+
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur) {
+    if (cur == 0) {
+        GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
+    }
+    return cur;
+}
+
+template<class ... Archs>
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
+    if (first <= arch && first > cur) {
+        return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
+    } else {
+        return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
+    }
+}
+
+constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
+    return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
+}
+#else
+static int ggml_cuda_highest_compiled_arch(const int arch) {
+    return arch;
+}
+#endif // __CUDA_ARCH_LIST__
+
+// ---------------------------------------------------------------------------------------------------------
+
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #if defined(_MSC_VER)
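To see how the recursion above resolves, here is a minimal host-only sketch (illustrative, not from the commit) that hard-codes the arch list and evaluates the highest-compiled-arch selection at compile time; the GGML_ABORT branch is reduced to a comment.

#include <cstdio>

// Hypothetical stand-in for __CUDA_ARCH_LIST__: the binary contains code for
// cc 5.2, 6.1, 7.5 and 8.6.
#define ARCH_LIST 520, 610, 750, 860

// Base case: the list was scanned; the real helper aborts when cur == 0,
// i.e. when no compiled arch is <= the device arch.
constexpr int highest_impl(const int /*arch*/, const int cur) {
    return cur;
}

// Keep the largest list entry that does not exceed the device arch.
template<class ... Archs>
constexpr int highest_impl(const int arch, const int cur, const int first, Archs... rest) {
    return (first <= arch && first > cur) ? highest_impl(arch, first, rest...)
                                          : highest_impl(arch, cur,   rest...);
}

constexpr int highest_compiled_arch(const int arch) {
    return highest_impl(arch, 0, ARCH_LIST);
}

int main() {
    // A cc 8.9 device falls back to the cc 8.6 code path,
    // a cc 7.0 device falls back to the cc 6.1 code path.
    static_assert(highest_compiled_arch(890) == 860, "");
    static_assert(highest_compiled_arch(700) == 610, "");
    printf("890 -> %d, 700 -> %d\n", highest_compiled_arch(890), highest_compiled_arch(700));
    return 0;
}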
@@ -162,18 +203,32 @@ typedef float2 dfloat2;
 #define FLASH_ATTN_AVAILABLE
 #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 
-static constexpr bool fast_fp16_available(const int cc) {
+static bool fp16_available(const int cc) {
+    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
+}
+
+static bool fast_fp16_available(const int cc) {
+    return fp16_available(cc) && cc != 610;
+}
+
+// To be used for feature selection of external libraries, e.g. cuBLAS.
+static bool fast_fp16_hardware_available(const int cc) {
     return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
 }
 
-// Any FP16 tensor cores are available.
-static constexpr bool fp16_mma_available(const int cc) {
+// Any FP16 tensor core instructions are available for ggml code.
+static bool fp16_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
+}
+
+// To be used for feature selection of external libraries, e.g. cuBLAS.
+static bool fp16_mma_hardware_available(const int cc) {
     return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
-static constexpr bool new_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
+static bool new_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }
 
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
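The second hunk splits each feature check into two flavours: the *_available() helpers gate ggml's own kernels on the highest architecture that was actually compiled, while the *_hardware_available() helpers gate external libraries such as cuBLAS on the physical compute capability alone. The following minimal sketch (not from the commit) uses hard-coded stand-ins for the GGML_CUDA_CC_* constants and for ggml_cuda_highest_compiled_arch() to show why the two can disagree.

#include <cstdio>

// Hypothetical stand-ins for GGML_CUDA_CC_VOLTA, GGML_CUDA_CC_OFFSET_AMD and
// ggml_cuda_highest_compiled_arch(); here the binary was compiled for cc 6.1 only.
constexpr int CC_VOLTA      = 700;
constexpr int CC_OFFSET_AMD = 1000000;

constexpr int highest_compiled_arch(const int /*arch*/) {
    return 610;
}

// Gates ggml's own kernels: depends on which architectures were compiled.
static bool fp16_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && highest_compiled_arch(cc) >= CC_VOLTA;
}

// Gates external libraries such as cuBLAS: depends only on the physical GPU.
static bool fp16_mma_hardware_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}

int main() {
    const int cc = 860; // e.g. an Ampere consumer GPU
    // The hardware has FP16 tensor cores, but no tensor-core kernels were
    // compiled, so ggml must not pick its own MMA path; cuBLAS still can.
    printf("ggml kernels: %d, external libs: %d\n",
           fp16_mma_available(cc), fp16_mma_hardware_available(cc)); // 0, 1
    return 0;
}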