Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-19 11:57:07 +00:00)
CUDA: faster q2_K, q3_K MMQ + int8 tensor cores (#7921)
* CUDA: faster q2_K, q3_K MMQ + int8 tensor cores
* try CI fix
* try CI fix
* try CI fix
* fix data race
* revert q2_K precision related changes
@@ -130,6 +130,7 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
+    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
     if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
         switch (ncols_x) {
             case 32:
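For context, this hunk sits in the soft_max launcher: m0 and m1 are the ALiBi slope bases, 2^(-max_bias / n_head_log2) and 2^(-(max_bias / 2) / n_head_log2), and the `if (shmem < ... .smpb)` check gates the specialized kernels on the device's shared memory per block, which is the limit the added FIXME refers to. Below is a minimal, self-contained sketch of that dispatch pattern only; it is not the llama.cpp implementation, the names soft_max_block, soft_max_generic and soft_max_dispatch are hypothetical, and the per-block limit is queried via cudaGetDeviceProperties rather than ggml_cuda_info().

#include <cuda_runtime.h>

constexpr int WARP_SIZE = 32;

// Hypothetical specialized kernel: one template instantiation per column count,
// using dynamic shared memory (body omitted in this sketch).
template <int ncols>
__global__ void soft_max_block(const float * x, float * dst) {
}

// Hypothetical fallback kernel that does not need dynamic shared memory
// (body omitted in this sketch).
__global__ void soft_max_generic(const float * x, float * dst, const int ncols) {
}

static void soft_max_dispatch(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    int device = 0;
    cudaGetDevice(&device);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);

    const size_t shmem = (size_t) ncols * sizeof(float); // dynamic shared memory needed per block
    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums(nrows, 1, 1);

    // Use a specialized kernel only if the required shared memory fits within the
    // per-block limit of the current device (the role of .smpb in the hunk above).
    if (shmem < prop.sharedMemPerBlock) {
        switch (ncols) {
            case 32:
                soft_max_block<32><<<block_nums, block_dims, shmem, stream>>>(x, dst);
                break;
            case 64:
                soft_max_block<64><<<block_nums, block_dims, shmem, stream>>>(x, dst);
                break;
            default:
                soft_max_generic<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
                break;
        }
    } else {
        soft_max_generic<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
    }
}

The design point mirrored here is that the shared-memory path is opt-in: the launcher checks the hardware limit at runtime and silently falls back to a generic kernel, so raising the limit on newer architectures (as the FIXME suggests) only widens the fast path without breaking older GPUs.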