Merge branch 'master' into compilade/imatrix-neutral-prior

2025-11-10 10:27:03 +00:00 · 2025-08-05 13:34:40 -04:00
parent 46a8601140 be42642581
commit ea5e55d03e
34 changed files with 1204 additions and 478 deletions
--- a/tools/imatrix/README.md
+++ b/tools/imatrix/README.md
@@ -7,7 +7,7 @@ More information is available in <https://github.com/ggml-org/llama.cpp/pull/486

 ```
 ./llama-imatrix \
-    -m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \
+    -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \
    [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \
    [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \
    [--show-statistics] [...]
@@ -20,6 +20,7 @@ The parameters in square brackets are optional and have the following meaning:
 * `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
 * `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used.
 * `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--output-format` specifies the output format of the generated imatrix file. Either "gguf", or "dat" (the legacy format). Defaults to "gguf".
 * `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
 * `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 * `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets.
@@ -45,14 +46,19 @@ Recent versions of `llama-imatrix` store data in GGUF format by default. For the

 ```bash
 # generate and save the imatrix using legacy format
-./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -o imatrix-legcy-format.dat -ngl 99
+./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --output-format dat -o imatrix-legcy-format.dat -ngl 99
 ```

 ```bash
-# covert legacy (binary) imatrix format to new (GGUF) format
+# convert legacy (binary) imatrix format to new (GGUF) format
 ./llama-imatrix --in-file imatrix-legacy-format.dat -o imatrix-new-format.gguf
 ```

+```bash
+# convert new (GGUF) imatrix format to legacy (binary) format
+./llama-imatrix --in-file imatrix-new-format.gguf --output-format dat -o imatrix-legacy-format.dat
+```
+
 ```bash
 # combine existing imatrices
 ./llama-imatrix --in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf -o imatrix-combined.gguf
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -26,7 +26,7 @@
 static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n"
            "       [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
            "       [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
            "       [--show-statistics] [...]\n" , argv[0]);
@@ -506,13 +506,17 @@ void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {

 void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
    auto fname = m_params.out_file;
+    int8_t use_legacy_format = m_params.imat_dat;

-    // TODO: use the new format in more cases
-    if (!string_ends_with(fname, ".gguf")) {
-        LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__);
+    if (use_legacy_format > 0) {
        this->save_imatrix_legacy(n_chunk);
        return;
    }
+    // only warn when `--output-format gguf` is not specified
+    if (use_legacy_format == 0 && !string_ends_with(fname, ".gguf")) {
+        LOG_WRN("\n%s: saving imatrix using GGUF format with a different suffix than .gguf\n", __func__);
+        LOG_WRN("%s: if you want the previous imatrix format, use --output-format dat\n", __func__);
+    }

    if (n_chunk > 0) {
        fname += ".at_";
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -641,7 +641,7 @@ int main(int argc, char ** argv) {
            return 1;
        }
        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
-            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
+            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[arg_idx]);
            return 1;
        }
        if (ftype_str == "COPY") {
--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/webui/src/index.scss
+++ b/tools/server/webui/src/index.scss
@@ -31,7 +31,24 @@ html {
  hr {
    @apply my-4 border-base-content/20 border-1;
  }
-  /* TODO: fix markdown table */
+  table {
+    @apply w-full border-collapse text-sm font-sans my-4 text-base-content;
+  }
+  thead {
+    @apply bg-base-200 text-base-content;
+  }
+  th {
+    @apply border border-base-300 px-4 py-2 text-left font-semibold;
+  }
+  td {
+    @apply border border-base-300 px-4 py-2 align-top;
+  }
+  tbody tr:nth-child(even) {
+    @apply bg-base-100;
+  }
+  tbody tr:hover {
+    @apply bg-base-200;
+  }
 }

 .btn-mini {