CUDA: mul_mat_v support for batch sizes > 1 (#14262)

* CUDA: mul_mat_v support for batch sizes > 1

* use 64 bit math for initial offset calculation
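The second bullet points at 32-bit integer overflow: with batched inputs, the starting offset of a batch/channel slice can exceed what a 32-bit int holds, so the offset has to be computed in 64-bit before the pointer is advanced. A minimal sketch of the idea, with a hypothetical kernel and variable names (this is not the code from the commit):

#include <cstdint>

// Hypothetical illustration, not the kernel from this commit: for large tensors
// the element offset of a batch/channel slice can exceed INT_MAX, so the offset
// is computed with 64-bit math before the pointer is advanced.
__global__ void mul_mat_vec_example(const float * x, float * dst, const int64_t ncols) {
    const int channel = blockIdx.z;

    // 32-bit version, overflows once channel * ncols exceeds 2^31 - 1:
    //     const int offset = channel * (int) ncols;

    // 64-bit version: promote to int64_t before multiplying, then index safely.
    const int64_t offset = (int64_t) channel * ncols;
    const float * x_channel = x + offset;

    if (threadIdx.x == 0 && blockIdx.x == 0) {
        dst[channel] = x_channel[0]; // placeholder for the real per-channel dot products
    }
}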
Author: Johannes Gäßler
Date: 2025-06-23 13:11:31 +02:00
Committed by: GitHub
Parent: 7b50d589a8
Commit: defe2158dd
4 changed files with 256 additions and 103 deletions


@@ -262,6 +262,10 @@ static bool fp16_mma_hardware_available(const int cc) {
         GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
 }
 
+static bool bf16_mma_hardware_available(const int cc) {
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;
+}
+
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
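For context, availability predicates like bf16_mma_hardware_available() above are the kind of check used to pick a kernel variant per device. The dispatch function below is a hedged sketch with a made-up name, not code from this diff:

// Illustrative only: select a matrix-vector kernel variant based on the device's
// compute capability (cc). The predicate names match the helpers shown above;
// the dispatch function itself is hypothetical.
static void launch_mul_mat_vec_example(const int cc) {
    if (bf16_mma_hardware_available(cc)) {
        // BF16 tensor-core path (NVIDIA Ampere or newer)
    } else if (fp16_mma_hardware_available(cc)) {
        // FP16 MMA path (NVIDIA, CDNA, or RDNA3/RDNA4 hardware per the check above)
    } else {
        // generic fallback without tensor-core instructions
    }
}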