sycl: Add reorder to Q6_K mmvq implementation (#13885)

* Add Reorder to Q6_K mmvq implementation * Address PR comments: clean up comments * Remove unused parameter after refactoring q4_k * Adding inline to function and removing unnecessary reference to int --------- Signed-off-by: nscipione <nicolo.scipione@codeplay.com>
2025-11-13 10:57:15 +00:00 · 2025-06-09 11:47:07 +02:00
parent 91a8ee6a6f
commit b460d16ae8
6 changed files with 244 additions and 30 deletions
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@@ -14,12 +14,13 @@
 #ifndef GGML_SYCL_QUANTS_HPP
 #define GGML_SYCL_QUANTS_HPP

+#include <utility>
+
 #include "ggml-common.h"
 #include "ggml.h"

 namespace ggml_sycl_reordered {

-
 // The reordered block moves quants (qs) and  scales(d) to two
 // uniform regions of memory that is contiguous in the same tensor.
 // What this means is that instead of having:
@@ -32,7 +33,6 @@ namespace ggml_sycl_reordered {

 template <ggml_type type> struct block_q_t;

-
 // qk number of weights / quants in a block
 // qr number of weights in a byte (described as 'before dequantization')
 //    for quantization types that has low and high bits split, qr is calculated with
@@ -47,10 +47,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
        static constexpr uint32_t vdr_mmvq = 2;
    };

-    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
+        return { block_index * (traits::qk / traits::qr), 0 };
+    }

-    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
-        return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half);
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+        return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
    }

    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
@@ -64,20 +66,46 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
        static constexpr uint32_t vdr_mmvq = 2;
    };

-    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
+        return { block_index * (traits::qk / traits::qr), 0 };
+    }

-    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
        auto nblocks = (nrows * (ncols / traits::qk));
-        return (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2));
+        return { nblocks * (QK_K / 2),
+                 (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
    }

    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }

    constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
-
-    constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
 };

+template <> struct block_q_t<GGML_TYPE_Q6_K> {
+    struct traits {
+        static constexpr uint32_t qk       = QK_K;
+        static constexpr uint32_t qi       = QI6_K;
+        static constexpr uint32_t qr       = QR6_K;
+        static constexpr uint32_t vdr_mmvq = 1;
+    };
+
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
+        auto low_bits_index  = block_index * (traits::qk / traits::qr);
+        // the high bits are stored after all of the low bits
+        auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
+        return { low_bits_index, high_bits_index };
+    }
+
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+        auto nblocks        = (nrows * (ncols / traits::qk));
+        auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
+        auto block_scales   = total_qs_bytes + block_index * (QK_K / 16);
+        auto sb_scale       = total_qs_bytes + nblocks * (QK_K / 16);
+        return { block_scales, sb_scale };
+    }
+
+    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
 }  // namespace ggml_sycl_reordered

 #endif  // GGML_SYCL_QUANTS_HPP