gguf : add Python script to convert GGMLv3 LLaMA models to GGUF (#2682)

* First pass at converting GGMLv3 LLaMA models to GGUF
* Cleanups, better output during conversion
* Fix vocab space conversion logic
* More vocab conversion fixes
* Add description to converted GGUF files
* Improve help text, expand warning
* Allow specifying name and description for output GGUF
* Allow overriding vocab and hyperparams from original model metadata
* Use correct params override var name
* Fix wrong type size for Q8_K; better handling of original-style metadata
* Set default value for gguf add_tensor raw_shape KW arg
2025-10-31 08:51:55 +00:00 · 2023-08-21 08:45:52 -06:00
parent 6490ff7198
commit e06cbcee73
2 changed files with 374 additions and 12 deletions
--- a/gguf.py
+++ b/gguf.py
@@ -5,7 +5,7 @@ import tempfile
 import numpy as np

 from enum import IntEnum, auto
-from typing import Any, IO, List
+from typing import Any, IO, List, Optional

 #
 # constants
@@ -325,8 +325,20 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:


 class GGMLQuantizationType(IntEnum):
-    F32 = 0
-    F16 = 1
+    F32  = 0
+    F16  = 1
+    Q4_0 = 2
+    Q4_1 = 3
+    Q5_0 = 6
+    Q5_1 = 7
+    Q8_0 = 8
+    Q8_1 = 9
+    Q2_K = 10
+    Q3_K = 11
+    Q4_K = 12
+    Q5_K = 13
+    Q6_K = 14
+    Q8_K = 15


 class GGUFValueType(IntEnum):
@@ -359,7 +371,7 @@ class GGUFValueType(IntEnum):


 class GGUFWriter:
-    def __init__(self, path: str, arch: str):
+    def __init__(self, path: str, arch: str, use_temp_file = True):
        self.fout = open(path, "wb")
        self.arch = arch
        self.offset_tensor = 0
@@ -369,6 +381,8 @@ class GGUFWriter:
        self.ti_data = b""
        self.ti_data_count = 0
        self.add_architecture()
+        self.use_temp_file = use_temp_file
+        self.tensors = []

    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
@@ -476,8 +490,8 @@ class GGUFWriter:
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n

-    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
-        assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
+        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

        encoded_name = name.encode("utf8")
        self.ti_data += struct.pack("<I", len(encoded_name))
@@ -486,23 +500,30 @@ class GGUFWriter:
        self.ti_data += struct.pack("<I", n_dims)
        for i in range(n_dims):
            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
-
-        dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        if raw_dtype is None:
+            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        else:
+            dtype = raw_dtype
        self.ti_data += struct.pack("<I", dtype)
        self.ti_data += struct.pack("<Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1

-    def add_tensor(self, name: str, tensor: np.ndarray):
-        if not hasattr(self, "temp_file"):
+    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
+        if self.use_temp_file and not hasattr(self, "temp_file"):
            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
            self.temp_file.seek(0)

-        self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes)
+        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+
+        if not self.use_temp_file:
+            self.tensors.append((tensor, pad))
+            return

        tensor.tofile(self.temp_file)

-        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
        if pad != 0:
            self.temp_file.write(bytes([0] * pad))

@@ -524,6 +545,13 @@ class GGUFWriter:
        if pad != 0:
            self.fout.write(bytes([0] * pad))

+        if not self.use_temp_file:
+            for (currtensor, currpad) in self.tensors:
+                currtensor.tofile(self.fout)
+                if currpad != 0:
+                    self.fout.write(bytes([0] * currpad))
+            return
+
        self.temp_file.seek(0)

        shutil.copyfileobj(self.temp_file, self.fout)