imatrix : use GGUF to store imatrix data

2025-11-03 09:22:01 +00:00 · 2024-09-06 17:17:25 -04:00
parent 347247a24e
commit 3de9300c37
4 changed files with 352 additions and 149 deletions
--- a/convert_legacy_imatrix_to_gguf.py
+++ b/convert_legacy_imatrix_to_gguf.py
@@ -0,0 +1,118 @@
 #!/usr/bin/env python3
 from __future__ import annotations
 import os
 import sys
 import logging
 import argparse
 from typing import Any
 from pathlib import Path
 from dataclasses import dataclass
 import numpy as np
 import numpy.typing as npt
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 logger = logging.getLogger("imatrix-to-gguf")
 class IMatrixWriter(gguf.GGUFWriter):
    def add_architecture(self) -> None:
        # no arch is stored in imatrix files
        pass
@dataclass
 class IMatrixEntry:
    values: np.ndarray[Any, np.dtype[np.float32]]
    counts: np.ndarray[Any, np.dtype[np.float32]]
 class IMatrixReader:
    chunk_size: int = 512  # guess
    offset: int = 0
    data: np.ndarray[Any, np.dtype[np.uint8]]
    n_enties: int
    entries: dict[str, IMatrixEntry]
    chunk_count: int
    dataset: str
    def _get(self, dtype: npt.DTypeLike, count: int = 1) -> npt.NDArray[Any]:
        count = int(count)
        itemsize = int(np.empty([], dtype=dtype).itemsize)
        offset = self.offset
        self.offset = offset + itemsize * count
        return self.data[offset:self.offset].view(dtype=dtype)[:count]
    def __init__(self, imatrix: Path):
        self.offset = 0
        self.entries = {}
        self.data = np.memmap(imatrix)
        n_entries = self._get(np.int32).item()
        assert n_entries >= 0
        for _ in range(n_entries):
            len = self._get(np.int32).item()
            name = self._get(np.uint8, len).tobytes().decode("utf-8")
            ncall = self._get(np.int32).item()
            nval = self._get(np.int32).item()
            data = self._get(np.float32, nval)
            assert name not in self.entries, f"duplicated name: {name!r}"
            self.entries[name] = IMatrixEntry(data, np.array([ncall * self.chunk_size], dtype=np.float32))
        self.chunk_count = self._get(np.int32).item()
        self.dataset = self._get(np.uint8, self._get(np.int32).item()).tobytes().decode("utf-8")
    def to_writer(self, outfile: Path) -> IMatrixWriter:
        writer = IMatrixWriter(path=outfile, arch="")
        writer.add_type(gguf.GGUFType.IMATRIX)
        writer.add_key_value(gguf.Keys.IMatrix.CHUNK_COUNT, self.chunk_count, gguf.GGUFValueType.UINT32)
        writer.add_key_value(gguf.Keys.IMatrix.CHUNK_SIZE, self.chunk_size, gguf.GGUFValueType.UINT32)
        writer.add_key_value(gguf.Keys.IMatrix.DATASET, self.dataset, gguf.GGUFValueType.STRING)
        for name, entry in self.entries.items():
            writer.add_tensor(name + ".sums", entry.values)
            writer.add_tensor(name + ".counts", entry.counts)
        return writer
 def parse_args():
    parser = argparse.ArgumentParser(
        description="Convert an old imatrix.dat file to a GGUF compatible file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input.",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )
    parser.add_argument(
        "imatrix", type=Path,
        help="path to an imatrix file",
    )
    return parser.parse_args()
 if __name__ == "__main__":
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
    if args.outfile is None:
        input_file: Path = args.imatrix
        if input_file.suffix != ".gguf":
            args.outfile = input_file.with_suffix(".gguf")
    writer = IMatrixReader(args.imatrix).to_writer(args.outfile)
    writer.write_header_to_file(args.outfile)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -5,11 +5,9 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <sstream>
 #include <thread>
 #include <mutex>
 #include <vector>
 #include <fstream>
 #include <unordered_map>
 #include <algorithm>
@@ -22,16 +20,19 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] [--verbosity 1] \\\n"
            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
-            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+            "       [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...]\n" , argv[0]);
    LOG_TEE("\n");
 }
 static const char * const LLM_KV_IMATRIX_DATASET     = "imatrix.dataset";
 static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
 static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
 struct Stats {
-    std::vector<float> values;
+    std::vector<double> values;
-    std::vector<int> counts;
+    std::vector<int64_t> counts;
    int ncall = 0;
 };
 class IMatrixCollector {
@@ -39,13 +40,13 @@ public:
    IMatrixCollector() = default;
    void set_params(gpt_params params) { m_params = std::move(params); }
    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix(int ncall = -1) const;
+    void save_imatrix(int32_t n_chunk = -1) const;
    bool load_imatrix(const char * file_name);
 private:
    std::unordered_map<std::string, Stats> m_stats;
    gpt_params                             m_params;
    std::mutex                             m_mutex;
-    int                                    m_last_call = 0;
+    int32_t                                m_last_chunk = 0;
    std::vector<float>                     m_src1_data;
    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
 };
@@ -119,18 +120,24 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        auto & e = m_stats[wname];
-        ++e.ncall;
+        if (e.counts.size() == 1 && n_as > 1) {
-
+            // broadcast, when loading an old imatrix
            e.counts.resize(n_as, e.counts[0]);
        }
        if (e.values.empty()) {
            e.values.resize(src1->ne[0]*n_as, 0);
-            e.counts.resize(src1->ne[0]*n_as, 0);
+            e.counts.resize(n_as, 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
            exit(1); //GGML_ABORT("fatal error");
        }
        else if (e.counts.size() != (size_t)n_as) {
            fprintf(stderr, "Oops: inconsistent expert count for %s (%d vs %d)\n", wname.c_str(), (int)e.counts.size(), (int)n_as);
            exit(1); //GGML_ABORT("fatal error");
        }
        if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
        }
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int ex = 0; ex < n_as; ++ex) {
@@ -148,23 +155,26 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                    const int64_t i12 = row;
                    const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
                    e.counts[ex]++;
                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
                        e.values[e_start + j] += x[j]*x[j];
-                        e.counts[e_start + j]++;
+                        if (!std::isfinite((float)e.values[e_start + j])) {
-                        if (!std::isfinite(e.values[e_start + j])) {
+                            fprintf(stderr, "%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
                            fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
                }
            }
-            if (e.ncall > m_last_call) {
+            const int32_t n_chunk = e.counts[ex] / (m_params.n_ctx / m_params.n_parallel);
-                m_last_call = e.ncall;
+            if (n_chunk > m_last_chunk) {
-                if (m_last_call % m_params.n_out_freq == 0) {
+                const int32_t chunk_step = n_chunk - m_last_chunk;
                m_last_chunk = n_chunk;
                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                    save_imatrix();
                }
-                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
-                    save_imatrix(m_last_call);
+                    save_imatrix(m_last_chunk);
                }
            }
        }
@@ -172,34 +182,40 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        auto & e = m_stats[wname];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
-            e.counts.resize(src1->ne[0], 0);
+            e.counts.resize(1, 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
            exit(1); //GGML_ABORT("fatal error");
        }
-        ++e.ncall;
+        else if (e.counts.size() != 1) {
-        if (m_params.verbosity > 1) {
+            fprintf(stderr, "Oops: inconsistent expert count for %s (%d vs %d)\n", wname.c_str(), (int)e.counts.size(), 1);
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            exit(1); //GGML_ABORT("fatal error");
        }
        if (m_params.verbosity > 1) {
            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
        }
        // TODO: higher dimensions
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
            const float * x = data + row * src1->ne[0];
            e.counts[0]++;
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
-                e.counts[j]++;
+                if (!std::isfinite((float)e.values[j])) {
-                if (!std::isfinite(e.values[j])) {
+                    fprintf(stderr, "%f detected in %s\n", (float)e.values[j], wname.c_str());
                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
                    exit(1);
                }
            }
        }
-        if (e.ncall > m_last_call) {
+        const int32_t n_chunk = e.counts[0] / (m_params.n_ctx / m_params.n_parallel);
-            m_last_call = e.ncall;
+        if (n_chunk > m_last_chunk) {
-            if (m_last_call % m_params.n_out_freq == 0) {
+            const int32_t chunk_step = n_chunk - m_last_chunk;
            m_last_chunk = n_chunk;
            if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                save_imatrix();
            }
-            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+            if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
-                save_imatrix(m_last_call);
+                save_imatrix(m_last_chunk);
            }
        }
    }
@@ -207,15 +223,15 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    return true;
 }
-void IMatrixCollector::save_imatrix(int ncall) const {
+void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
    auto fname = m_params.out_file;
    if (fname.empty()) {
-        fname = "imatrix.dat";
+        fname = "imatrix.gguf";
    }
-    if (ncall > 0) {
+    if (n_chunk > 0) {
        fname += ".at_";
-        fname += std::to_string(ncall);
+        fname += std::to_string(n_chunk);
    }
    // avoid writing imatrix entries that do not have full data
@@ -223,6 +239,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
    int n_entries = 0;
    std::vector<std::string> to_store;
    size_t data_size = 0;
    bool is_first = true; // for printing
    for (const auto & kv : m_stats) {
@@ -256,100 +273,132 @@ void IMatrixCollector::save_imatrix(int ncall) const {
        n_entries++;
        to_store.push_back(kv.first);
        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
    }
    if (to_store.size() < m_stats.size()) {
        fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
    }
-    std::ofstream out(fname, std::ios::binary);
+    struct ggml_init_params params = {
-    out.write((const char *) &n_entries, sizeof(n_entries));
+        .mem_size   = data_size,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);
    struct gguf_context * ctx_gguf = gguf_init_empty();
    gguf_set_val_str(ctx_gguf, "general.type", "imatrix");
    // Write the input filename to later on specify it in quantize
    gguf_set_val_str(ctx_gguf, LLM_KV_IMATRIX_DATASET, m_params.prompt_file.c_str());
    // Write the number of chunks the matrix was computed with
    gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk);
    gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel);
    for (const auto & name : to_store) {
        const auto & stat = m_stats.at(name);
-        int len = name.size();
+        const int32_t nval = (int32_t) stat.values.size();
-        out.write((const char *) &len, sizeof(len));
+        const int32_t nmat = (int32_t) stat.counts.size();
        out.write(name.c_str(), len);
        out.write((const char *) &stat.ncall, sizeof(stat.ncall));
        int nval = stat.values.size();
        out.write((const char *) &nval, sizeof(nval));
        if (nval > 0) {
-            std::vector<float> tmp(nval);
+            struct ggml_tensor * sums = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat);
-            for (int i = 0; i < nval; i++) {
+            struct ggml_tensor * counts = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat);
-                tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
+            ggml_set_name(sums, (name + ".sums").c_str());
            ggml_set_name(counts, (name + ".counts").c_str());
            for (int32_t j = 0; j < nval; ++j) {
                ((float *) sums->data)[j] = (float) stat.values[j];
            }
-            out.write((const char*)tmp.data(), nval*sizeof(float));
+            for (int32_t j = 0; j < nmat; ++j) {
                ((float *) counts->data)[j] = (float) stat.counts[j];
            }
            gguf_add_tensor(ctx_gguf, sums);
            gguf_add_tensor(ctx_gguf, counts);
        }
    }
-    // Write the number of call the matrix was computed with
+    gguf_write_to_file(ctx_gguf, fname.c_str(), false);
    out.write((const char *) &m_last_call, sizeof(m_last_call));
    // Write the input filename at the end of the file to later on specify it in quantize
    {
        int len = m_params.prompt_file.size();
        out.write((const char *) &len, sizeof(len));
        out.write(m_params.prompt_file.c_str(), len);
    }
    if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
    }
    gguf_free(ctx_gguf);
    ggml_free(ctx);
 }
-bool IMatrixCollector::load_imatrix(const char * fname) {
+bool IMatrixCollector::load_imatrix(const char * file_name) {
-    std::ifstream in(fname, std::ios::binary);
+    struct ggml_context * ctx = nullptr;
-    if (!in) {
+    struct gguf_init_params meta_gguf_params = {
-        printf("%s: failed to open %s\n",__func__, fname);
+        /* .no_alloc = */ false, // the data is needed
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params);
    if (!ctx_gguf) {
        return false;
    }
-    int n_entries;
+    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
-    in.read((char*)&n_entries, sizeof(n_entries));
+    if (n_entries < 2) {
-    if (in.fail() || n_entries < 1) {
+        fprintf(stderr, "%s: no data in file %s\n", __func__, file_name);
-        printf("%s: no data in file %s\n", __func__, fname);
+        gguf_free(ctx_gguf);
-        return false;
+        ggml_free(ctx);
    }
    for (int i = 0; i < n_entries; ++i) {
        int len; in.read((char *)&len, sizeof(len));
        std::vector<char> name_as_vec(len+1);
        in.read((char *)name_as_vec.data(), len);
        if (in.fail()) {
            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
            return false;
        }
        name_as_vec[len] = 0;
        std::string name{name_as_vec.data()};
        auto & e = m_stats[std::move(name)];
        int ncall;
        in.read((char*)&ncall, sizeof(ncall));
        int nval;
        in.read((char *)&nval, sizeof(nval));
        if (in.fail() || nval < 1) {
            printf("%s: failed reading number of values for entry %d\n",__func__,i);
            m_stats = {};
        return false;
    }
    const std::string sums_suffix{".sums"};
    const std::string counts_suffix{".counts"};
    // TODO: allow loading from mis-ordered imatrix files
    for (int32_t i = 0; i < n_entries - 1; i += 2) {
        std::string sums_name{gguf_get_tensor_name(ctx_gguf, i + 0)};
        std::string counts_name{gguf_get_tensor_name(ctx_gguf, i + 1)};
        if (sums_name.size() < sums_suffix.size() ||
            counts_name.size() < counts_suffix.size() ||
            !std::equal(sums_name.begin(), sums_name.end() - sums_suffix.size(), counts_name.begin()) ||
            !std::equal(sums_suffix.rbegin(), sums_suffix.rend(), sums_name.rbegin()) ||
            !std::equal(counts_suffix.rbegin(), counts_suffix.rend(), counts_name.rbegin())) {
            fprintf(stderr, "%s: mismatched sums and counts for entry %d\n", __func__, i / 2);
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }
        struct ggml_tensor * sums = ggml_get_tensor(ctx, sums_name.c_str());
        struct ggml_tensor * counts = ggml_get_tensor(ctx, counts_name.c_str());
        if (!sums || !counts) {
            fprintf(stderr, "%s: failed reading data for entry %d\n", __func__, i / 2);
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }
        std::string name = sums_name.substr(0, sums_name.size() - sums_suffix.size());
        auto & e = m_stats[name];
        int32_t nval = ggml_nelements(sums);
        if (e.values.empty()) {
            e.values.resize(nval, 0);
-            e.counts.resize(nval, 0);
+        }
        int32_t ncounts = ggml_nelements(counts);
        if (e.counts.empty()) {
            e.counts.resize(ncounts, 0);
        } else if (e.counts.size() == 1 && ncounts > 1) {
            // broadcast, when loading an old imatrix
            e.counts.resize(ncounts, e.counts[0]);
        }
-        std::vector<float> tmp(nval);
+        // Recreate the state as expected by save_imatrix()
-        in.read((char*)tmp.data(), nval*sizeof(float));
+        for (int32_t j = 0; j < nval; j++) {
-        if (in.fail()) {
+            e.values[j] += ((const float *) sums->data)[j];
            printf("%s: failed reading data for entry %d\n",__func__,i);
            m_stats = {};
            return false;
        }
-
+        for (int32_t j = 0; j < ncounts; j++) {
-        // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
+            e.counts[j] += std::lround(((const float *) counts->data)[j]);
        for (int i = 0; i < nval; i++) {
            e.values[i] += tmp[i];
            e.counts[i] += ncall;
        }
        e.ncall += ncall;
    }
    gguf_free(ctx_gguf);
    ggml_free(ctx);
    return true;
 }
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -6,8 +6,6 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
 #include <fstream>
 #include <cmath>
 struct quant_option {
    std::string name;
@@ -61,6 +59,11 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS   = "quantize.imatrix.chunks_count";
 // TODO: share with imatrix.cpp
 static const char * const LLM_KV_IMATRIX_DATASET     = "imatrix.dataset";
 static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
 static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
    std::string ftype_str;
@@ -121,66 +124,92 @@ static void usage(const char * executable) {
 }
 static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
-    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
+
-    if (!in) {
+    struct ggml_context * ctx = nullptr;
-        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
+    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false, // the data is needed
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
        exit(1);
    }
-    int n_entries;
+    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
-    in.read((char *)&n_entries, sizeof(n_entries));
+    if (n_entries < 2) {
-    if (in.fail() || n_entries < 1) {
+        fprintf(stderr, "%s: no data in file %s\n", __func__, imatrix_file.c_str());
-        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        gguf_free(ctx_gguf);
        ggml_free(ctx);
        exit(1);
    }
-    for (int i = 0; i < n_entries; ++i) {
+
-        int len; in.read((char *)&len, sizeof(len));
+    const int dataset_idx     = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASET);
-        std::vector<char> name_as_vec(len+1);
+    const int chunk_count_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT);
-        in.read((char *)name_as_vec.data(), len);
+    const int chunk_size_idx  = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE);
-        if (in.fail()) {
+    if (dataset_idx < 0 || chunk_count_idx < 0 || chunk_size_idx < 0) {
-            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
+        fprintf(stderr, "%s: missing imatrix metadata in file %s\n", __func__, imatrix_file.c_str());
        gguf_free(ctx_gguf);
        ggml_free(ctx);
        exit(1);
    }
-        name_as_vec[len] = 0;
+
-        std::string name{name_as_vec.data()};
+    const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);
    const std::string sums_suffix{".sums"};
    const std::string counts_suffix{".counts"};
    // TODO: allow loading from mis-ordered imatrix files
    for (int32_t i = 0; i < n_entries - 1; i += 2) {
        std::string sums_name{gguf_get_tensor_name(ctx_gguf, i + 0)};
        std::string counts_name{gguf_get_tensor_name(ctx_gguf, i + 1)};
        if (sums_name.size() < sums_suffix.size() ||
            counts_name.size() < counts_suffix.size() ||
            !std::equal(sums_name.begin(), sums_name.end() - sums_suffix.size(), counts_name.begin()) ||
            !std::equal(sums_suffix.rbegin(), sums_suffix.rend(), sums_name.rbegin()) ||
            !std::equal(counts_suffix.rbegin(), counts_suffix.rend(), counts_name.rbegin())) {
            fprintf(stderr, "%s: mismatched sums and counts for entry %d\n", __func__, i / 2);
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            exit(1);
        }
        struct ggml_tensor * sums = ggml_get_tensor(ctx, sums_name.c_str());
        struct ggml_tensor * counts = ggml_get_tensor(ctx, counts_name.c_str());
        if (!sums || !counts) {
            fprintf(stderr, "%s: failed reading data for entry %d\n", __func__, i / 2);
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            exit(1);
        }
        const int64_t ne0 = sums->ne[0];
        const int64_t ne1 = sums->ne[1];
        std::string name = sums_name.substr(0, sums_name.size() - sums_suffix.size());
        auto & e = imatrix_data[name];
-        int ncall;
+        e.resize(ggml_nelements(sums));
-        in.read((char *)&ncall, sizeof(ncall));
+        float max_count = 0.0f;
-        int nval;
+        for (int64_t j = 0; j < ne1; ++j) {
-        in.read((char *)&nval, sizeof(nval));
+            const float count = ((const float *) counts->data)[ne1];
-        if (in.fail() || nval < 1) {
+            for (int64_t i = 0; i < ne0; ++i) {
-            printf("%s: failed reading number of values for entry %d\n", __func__, i);
+                e[ne1*ne0 + ne0] = ((const float *) sums->data)[ne1*ne0 + ne0] / count;
            imatrix_data = {};
            exit(1);
            }
-        e.resize(nval);
+            if (count > max_count) {
-        in.read((char *)e.data(), nval*sizeof(float));
+                max_count = count;
        if (in.fail()) {
            printf("%s: failed reading data for entry %d\n", __func__, i);
            imatrix_data = {};
            exit(1);
            }
        if (ncall > 0) {
            for (auto& v : e) v /= ncall;
        }
        if (getenv("LLAMA_TRACE")) {
-            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
+            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), int(max_count / chunk_size), name.c_str());
        }
    }
    gguf_free(ctx_gguf);
    ggml_free(ctx);
    int m_last_chunk = gguf_get_val_u32(ctx_gguf, chunk_count_idx);
    imatrix_dataset = gguf_get_val_str(ctx_gguf, dataset_idx);
    // latest imatrix version contains the dataset filename at the end of the file
    int m_last_call = 0;
    if (in.peek() != EOF) {
        in.read((char *)&m_last_call, sizeof(m_last_call));
        int dataset_len;
        in.read((char *)&dataset_len, sizeof(dataset_len));
        std::vector<char> dataset_as_vec(dataset_len);
        in.read(dataset_as_vec.data(), dataset_len);
        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
    printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
-    }
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);
-    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
+    return m_last_chunk;
    return m_last_call;
 }
 static int prepare_imatrix(const std::string & imatrix_file,
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -167,6 +167,12 @@ class Keys:
        TYPE       = "adapter.type"
        LORA_ALPHA = "adapter.lora.alpha"
    class IMatrix:
        CHUNK_COUNT = "imatrix.chunk_count"
        CHUNK_SIZE  = "imatrix.chunk_size"
        DATASET     = "imatrix.dataset"
 #
 # recommended mapping of model tensor names for storage in gguf
 #
@@ -175,6 +181,7 @@ class Keys:
 class GGUFType:
    MODEL   = "model"
    ADAPTER = "adapter"
    IMATRIX = "imatrix"
 class MODEL_ARCH(IntEnum):