mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	HIP: Add support for RDNA4 targets (#12372)
This commit is contained in:
		@@ -52,7 +52,7 @@
 | 
			
		||||
// NVIDIA cards occupy the range below the first vendor offset (MTHREADS here) — TODO confirm offset ordering against the full header.
#define GGML_CUDA_CC_IS_NVIDIA(cc)   (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
 | 
			
		||||
 | 
			
		||||
// AMD
// GCN/CDNA, wave size is 64
#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 0x803)  // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 0x900)  // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 0x906)  // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910)  // MI210, minimum acc register renameing
#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x942)  // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

// Predicates classifying an offset compute capability by AMD architecture family.
// Each per-generation predicate is a half-open range so the families are disjoint;
// in particular IS_RDNA3 is bounded above by RDNA4 so RDNA4 cards are not
// misclassified as RDNA3.
#define GGML_CUDA_CC_IS_AMD(cc)   (cc >= GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
 | 
			
		||||
 | 
			
		||||
@@ -209,9 +211,9 @@ typedef float2 dfloat2;
 | 
			
		||||
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

// With rocWMMA flash attention enabled, fp16 MMA is also available on CDNA and
// on RDNA3/RDNA4 targets.
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
#define FP16_MMA_AVAILABLE
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))

#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define NEW_MMA_AVAILABLE
 | 
			
		||||
@@ -244,14 +246,14 @@ static bool fp16_mma_available(const int cc) {
 | 
			
		||||
    return false;
 | 
			
		||||
#else
 | 
			
		||||
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
 | 
			
		||||
        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
 | 
			
		||||
        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
 | 
			
		||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// To be used for feature selection of external libraries, e.g. cuBLAS.
 | 
			
		||||
static bool fp16_mma_hardware_available(const int cc) {
 | 
			
		||||
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
 | 
			
		||||
        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
 | 
			
		||||
        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 | 
			
		||||
@@ -409,7 +411,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 | 
			
		||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 | 
			
		||||
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
 | 
			
		||||
    c = __builtin_amdgcn_sdot4(a, b, c, false);
 | 
			
		||||
#elif defined(RDNA3)
 | 
			
		||||
#elif defined(RDNA3) || defined(RDNA4)
 | 
			
		||||
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
 | 
			
		||||
#elif defined(RDNA1) || defined(__gfx900__)
 | 
			
		||||
    int tmp1;
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user