Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-11 10:36:54 +00:00)
CUDA: optimize and refactor MMQ (#8416)

* CUDA: optimize and refactor MMQ
* explicit q8_1 memory layouts, add documentation
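For context on the second bullet: q8_1 stores each group of 32 floats as 8-bit quants plus two half-precision sidecar values. Below is a minimal sketch of the generic q8_1 block, following the definitions in ggml's common headers; the MMQ-specific layout this commit adds is a separate, rearranged variant and is not reproduced here.

    // Sketch of the generic q8_1 block (after ggml-common.h): 32 values share
    // one scale d and one precomputed sum s, packed together as a half2.
    #include <cstdint>
    #include <cuda_fp16.h>

    #define QK8_1 32

    typedef struct {
        half2  ds;          // ds.x = d (scale, amax/127), ds.y = s (sum of the 32 source floats)
        int8_t qs[QK8_1];   // quantized values: qs[i] = roundf(x[i]/d)
    } block_q8_1;

    // 4 bytes of scales + 32 quants = 36 bytes per 32 values.
    static_assert(sizeof(block_q8_1) == sizeof(half2) + QK8_1, "wrong q8_1 block size/padding");

Keeping s alongside d lets dot-product kernels fold an offset term (e.g. the min of a q4_1 block) into the result without dequantizing the activations first.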
@@ -5,7 +5,11 @@
 
 #include <cstdint>
 
-#define CUDA_QUANTIZE_BLOCK_SIZE 256
+#define CUDA_QUANTIZE_BLOCK_SIZE     256
+#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
+
+static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
+static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
 
 typedef void (*quantize_cuda_t)(
     const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
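The two static_asserts encode the launch geometry: the stock quantize path uses one thread per value in 256-thread blocks, while the MMQ path (hence the 4* factor) processes four values per thread, so a 128-thread block covers 512 values; rows padded to a multiple of MATRIX_ROW_PADDING therefore always split into whole blocks. A hedged sketch of a launcher matching the quantize_cuda_t signature follows; the _sketch names, the stubbed kernel body, and the MATRIX_ROW_PADDING value of 512 are assumptions for illustration, not the commit's code.

    // Hypothetical launcher matching the quantize_cuda_t signature above;
    // names suffixed _sketch are illustrations, not the commit's symbols.
    #include <cstdint>
    #include <cuda_runtime.h>

    #define CUDA_QUANTIZE_BLOCK_SIZE     256
    #define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
    #define MATRIX_ROW_PADDING           512  // assumption: value taken from common.cuh

    // Same guarantees as in the diff: every padded row splits into whole
    // blocks, both for the 1-value-per-thread path (256 divides 512) and for
    // the MMQ path, where 4 values per thread make a block cover 4*128 = 512.
    static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
    static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");

    // Stub kernel: the real q8_1 kernel computes per-block amax/sum via warp
    // reductions and writes block_q8_1; only the indexing is shown here.
    static __global__ void quantize_q8_1_sketch(
            const float * __restrict__ x, void * __restrict__ vy,
            const int64_t kx0, const int64_t kx0_padded) {
        const int64_t ix0 = (int64_t) blockDim.x*blockIdx.x + threadIdx.x;
        if (ix0 >= kx0_padded) {
            return; // per the static_asserts this only trims the final row tile
        }
        // Values in the padding region (ix0 >= kx0) would quantize as 0.0f.
        (void) x; (void) vy; (void) kx0;
    }

    static void quantize_row_q8_1_sketch(
            const float * x, void * vy, const int64_t kx0, const int64_t kx1,
            const int64_t channels, const int64_t kx0_padded, cudaStream_t stream) {
        const int64_t blocks_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
        const dim3 grid((unsigned) blocks_x, (unsigned) (kx1*channels), 1); // x: padded row, y: rows * channels
        const dim3 block(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
        quantize_q8_1_sketch<<<grid, block, 0, stream>>>(x, vy, kx0, kx0_padded);
    }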