RMSE-optimized quants for all quantization types

RMSE optimization is enabled by default. It can be turned off by setting the LLAMA_NO_RMSE option (which itself defaults to OFF). With RMSE optimization enabled, Q4_3 quantization results in a perplexity of 6.0344, which is 0.0273 lower than with simple Q4_3 quantization.
2025-11-20 12:07:33 +00:00 · 2023-04-21 10:26:49 +02:00
parent 0e018fe008
commit e435bfd93c
3 changed files with 286 additions and 80 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,6 +68,9 @@ option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"
 option(LLAMA_OPENBLAS               "llama: use OpenBLAS"                                   OFF)
 option(LLAMA_CUBLAS                 "llama: use cuBLAS"                                     OFF)

+# RMSE minimization when quantizing: opt-out switch, OFF by default (set to ON to define GGML_NO_RMSE)
+option(LLAMA_NO_RMSE                "llama: disable RMSE minimization"                      OFF)
+
 option(LLAMA_BUILD_TESTS            "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})

@@ -99,6 +102,10 @@ if (NOT MSVC)
    endif()
 endif()

+if (LLAMA_NO_RMSE)
+    add_compile_definitions(GGML_NO_RMSE) # directory-scoped: defines GGML_NO_RMSE when compiling targets in this directory
+endif()
+
 if (APPLE AND LLAMA_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate)
    if (ACCELERATE_FRAMEWORK)