CUDA: skip masked KV slices for all FA kernels (#14924)

2025-11-14 11:07:10 +00:00 · 2025-07-30 15:46:13 +02:00
parent 00131d6eaf
commit 92b8810ec7
9 changed files with 120 additions and 56 deletions
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -432,6 +432,20 @@ static __global__ void reduce_rows_f32(const float * x, float * dst, const int n
    dst[row] = norm ? sum / ncols : sum;
 }

+// Warp-level logical AND of x: the result is non-zero only if x is non-zero in every
+// participating lane.
+//   - non-HIP path: a single __all_sync vote over the full 0xffffffff mask; only
+//     width == WARP_SIZE is supported (enforced by the static_assert).
+//   - HIP path: XOR-butterfly reduction over groups of `width` lanes, combining the
+//     values with logical AND (so the result is 0 or 1 whenever the loop runs).
+template<int width = WARP_SIZE>
+static __device__ __forceinline__ int warp_reduce_all(int x) {
+#ifdef GGML_USE_HIP
+#pragma unroll
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        // The shuffle must be the LEFT operand of &&: written as `x && shfl(...)`, the
+        // short-circuit makes lanes whose x is already 0 skip the shuffle, so the
+        // cross-lane exchange would execute with only part of the warp participating.
+        x = __shfl_xor_sync(0xffffffff, x, offset, width) && x;
+    }
+    return x;
+#else
+    static_assert(width == WARP_SIZE, "width != WARP_SIZE not implemented");
+    return __all_sync(0xffffffff, x);
+#endif // GGML_USE_HIP
+}
+
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll