ggml : add Q8_0 quantization for intermediate results (#951)

* ggml : add Q8_0 quantization for intermediate results
* quantize-stats : fix test + add it to Makefile default
* Q8: use int8_t, AVX/AVX2 optimizations
* ggml : fix quantize_row_q8_0() ARM_NEON rounding
* minor : updates after rebase to latest master
* quantize-stats : delete obsolete strings
* ggml : fix q4_1 dot func

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
2025-11-01 09:01:57 +00:00 · 2023-04-15 17:53:22 +03:00
parent aa485cee33
commit e95b6554b4
3 changed files with 442 additions and 18 deletions
--- a/ggml.h
+++ b/ggml.h
@@ -204,6 +204,7 @@ enum ggml_type {
    GGML_TYPE_F16  = 1,
    GGML_TYPE_Q4_0 = 2,
    GGML_TYPE_Q4_1 = 3,
+    GGML_TYPE_Q8_0 = 4,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
@@ -836,6 +837,7 @@ typedef struct {
    dequantize_row_q_t dequantize_row_q;
    quantize_row_q_t   quantize_row_q;
    quantize_row_q_t   quantize_row_q_reference;
+    quantize_row_q_t   quantize_row_q_dot;
    vec_dot_q_t        vec_dot_q;
 } quantize_fns_t;