CUDA/HIP: optimize mmv paths taken for HIP devices (#14324)

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-11-05 09:36:52 +00:00 · 2025-06-24 01:12:56 +02:00
parent ce82bd0117
commit 0142961a2e
2 changed files with 23 additions and 1 deletions
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -263,7 +263,11 @@ static bool fp16_mma_hardware_available(const int cc) {
 }

 static bool bf16_mma_hardware_available(const int cc) {
-    return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; // true for NVIDIA cc at/above Ampere, any CDNA cc, or any cc at/above RDNA3. NOTE(review): the RDNA3 comparison has no vendor guard - confirm no non-AMD cc value compares >= GGML_CUDA_CC_RDNA3
+}
+
+static bool fp32_mma_hardware_available(const int cc) { // predicate on the compute-capability code: true only when GGML_CUDA_CC_IS_CDNA(cc) holds
+    return GGML_CUDA_CC_IS_CDNA(cc);
 }

 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.