Use full range for q4_0 quantization

By keeping the sign of the value with the highest magnitude in each block, we can make sure that this value maps to -8, a quantization level that is currently unused. This is a bit of a freebie, since it is fully backwards compatible with the current format.

quantize-stats output:
before (7B): q4_0 : mse 0.00000492, maxerr 0.14257812
after (7B): q4_0 : mse 0.00000386, maxerr 0.18200684

(Most layers have a reduced maxerr under this rule, but the overall maximum error is indeed slightly higher.)
2025-10-30 08:42:00 +00:00 · 2023-04-03 03:02:26 +02:00
parent 0e018fe008
commit 3698f79e6a
1 changed file with 8 additions and 4 deletions
--- a/ggml.c
+++ b/ggml.c
@@ -680,13 +680,17 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r

    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max
+        float max = 0.0f;

        for (int l = 0; l < QK4_0; l++) {
            const float v = x[i*QK4_0 + l];
-            amax = MAX(amax, fabsf(v));
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
        }

-        const float d = amax / ((1 << 3) - 1);
+        const float d = max / -8;
        const float id = d ? 1.0f/d : 0.0f;

        y[i].d = d;
@@ -695,8 +699,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
            const float v0 = x[i*QK4_0 + l + 0]*id;
            const float v1 = x[i*QK4_0 + l + 1]*id;

-            const uint8_t vi0 = (int8_t)roundf(v0) + 8;
-            const uint8_t vi1 = (int8_t)roundf(v1) + 8;
+            const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8);
+            const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8);

            assert(vi0 < 16);
            assert(vi1 < 16);