metal : add support for non-padded FA KV (#16148)

* metal : pad K, V and Mask when needed

* cont : simplify

* cuda : add TODO about KV padding requirement

* metal : add comments

* metal : remove mask padding requirement
Author:    Georgi Gerganov
Date:      2025-10-07 08:23:30 +03:00
Committed: GitHub
Parent:    1d6092fc72
Commit:    0a319bb75e
9 changed files with 460 additions and 72 deletions


@@ -208,6 +208,12 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     const int cc = ggml_cuda_info().devices[device].cc;
 
+    // TODO: temporary until support is extended
+    //       https://github.com/ggml-org/llama.cpp/pull/16148#issuecomment-3343525206
+    if (K->ne[1] % FATTN_KQ_STRIDE != 0) {
+        return BEST_FATTN_KERNEL_NONE;
+    }
+
     switch (K->ne[0]) {
         case 64:
         case 128:
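
The check above makes the CUDA backend fall back to a non-FlashAttention path whenever the KV length (K->ne[1]) is not a multiple of FATTN_KQ_STRIDE, because the CUDA FA kernels consume K/V in fixed-size blocks of that stride. The Metal changes in this commit go the other way and pad K and V when needed, so non-padded KV shapes can still take the FA path. The sketch below is not code from this commit; it only illustrates the round-up arithmetic, with pad_to_multiple as a hypothetical helper and 256 assumed as the block size.

#include <stdio.h>

// Hypothetical helper: round x up to the next multiple of n (n > 0).
// ggml has a macro (GGML_PAD) that serves the same purpose.
static int pad_to_multiple(int x, int n) {
    return (x + n - 1) / n * n;
}

int main(void) {
    const int block = 256; // assumed FA block size, standing in for FATTN_KQ_STRIDE

    // Illustrative KV lengths, including ones that are not block-aligned.
    const int n_kv_values[] = { 1, 255, 256, 1000 };

    for (int i = 0; i < 4; ++i) {
        const int n_kv        = n_kv_values[i];
        const int n_kv_padded = pad_to_multiple(n_kv, block);

        printf("n_kv = %4d -> padded = %4d (%d blocks)\n",
               n_kv, n_kv_padded, n_kv_padded / block);
    }

    return 0;
}

The padded tail rows of K/V must not contribute to the attention result, so they have to be masked out (or otherwise neutralized) inside the kernel; per the commit message, the Metal implementation initially also padded the mask for this and later dropped that requirement.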