mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-31 08:51:55 +00:00
gguf : add Python script to convert GGMLv3 LLaMA models to GGUF (#2682)
* First pass at converting GGMLv3 LLaMA models to GGUF
* Cleanups, better output during conversion
* Fix vocab space conversion logic
* More vocab conversion fixes
* Add description to converted GGUF files
* Improve help text, expand warning
* Allow specifying name and description for output GGUF
* Allow overriding vocab and hyperparams from original model metadata
* Use correct params override var name
* Fix wrong type size for Q8_K
  Better handling of original style metadata
* Set default value for gguf add_tensor raw_shape KW arg
This commit is contained in:
52
gguf.py
52
gguf.py
@@ -5,7 +5,7 @@ import tempfile
|
||||
import numpy as np
|
||||
|
||||
from enum import IntEnum, auto
|
||||
from typing import Any, IO, List
|
||||
from typing import Any, IO, List, Optional
|
||||
|
||||
#
|
||||
# constants
|
||||
@@ -325,8 +325,20 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
|
||||
|
||||
|
||||
class GGMLQuantizationType(IntEnum):
|
||||
F32 = 0
|
||||
F16 = 1
|
||||
F32 = 0
|
||||
F16 = 1
|
||||
Q4_0 = 2
|
||||
Q4_1 = 3
|
||||
Q5_0 = 6
|
||||
Q5_1 = 7
|
||||
Q8_0 = 8
|
||||
Q8_1 = 9
|
||||
Q2_K = 10
|
||||
Q3_K = 11
|
||||
Q4_K = 12
|
||||
Q5_K = 13
|
||||
Q6_K = 14
|
||||
Q8_K = 15
|
||||
|
||||
|
||||
class GGUFValueType(IntEnum):
|
||||
@@ -359,7 +371,7 @@ class GGUFValueType(IntEnum):
|
||||
|
||||
|
||||
class GGUFWriter:
|
||||
def __init__(self, path: str, arch: str):
|
||||
def __init__(self, path: str, arch: str, use_temp_file = True):
|
||||
self.fout = open(path, "wb")
|
||||
self.arch = arch
|
||||
self.offset_tensor = 0
|
||||
@@ -369,6 +381,8 @@ class GGUFWriter:
|
||||
self.ti_data = b""
|
||||
self.ti_data_count = 0
|
||||
self.add_architecture()
|
||||
self.use_temp_file = use_temp_file
|
||||
self.tensors = []
|
||||
|
||||
def write_header_to_file(self):
|
||||
self.fout.write(struct.pack("<I", GGUF_MAGIC))
|
||||
@@ -476,8 +490,8 @@ class GGUFWriter:
|
||||
def ggml_pad(x: int, n: int) -> int:
|
||||
return ((x + n - 1) // n) * n
|
||||
|
||||
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
|
||||
assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
|
||||
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
|
||||
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
|
||||
|
||||
encoded_name = name.encode("utf8")
|
||||
self.ti_data += struct.pack("<I", len(encoded_name))
|
||||
@@ -486,23 +500,30 @@ class GGUFWriter:
|
||||
self.ti_data += struct.pack("<I", n_dims)
|
||||
for i in range(n_dims):
|
||||
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
|
||||
|
||||
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
|
||||
if raw_dtype is None:
|
||||
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
|
||||
else:
|
||||
dtype = raw_dtype
|
||||
self.ti_data += struct.pack("<I", dtype)
|
||||
self.ti_data += struct.pack("<Q", self.offset_tensor)
|
||||
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
|
||||
self.ti_data_count += 1
|
||||
|
||||
def add_tensor(self, name: str, tensor: np.ndarray):
|
||||
if not hasattr(self, "temp_file"):
|
||||
def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
|
||||
if self.use_temp_file and not hasattr(self, "temp_file"):
|
||||
self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
|
||||
self.temp_file.seek(0)
|
||||
|
||||
self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes)
|
||||
self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
|
||||
|
||||
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
|
||||
|
||||
if not self.use_temp_file:
|
||||
self.tensors.append((tensor, pad))
|
||||
return
|
||||
|
||||
tensor.tofile(self.temp_file)
|
||||
|
||||
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
|
||||
if pad != 0:
|
||||
self.temp_file.write(bytes([0] * pad))
|
||||
|
||||
@@ -524,6 +545,13 @@ class GGUFWriter:
|
||||
if pad != 0:
|
||||
self.fout.write(bytes([0] * pad))
|
||||
|
||||
if not self.use_temp_file:
|
||||
for (currtensor, currpad) in self.tensors:
|
||||
currtensor.tofile(self.fout)
|
||||
if currpad != 0:
|
||||
self.fout.write(bytes([0] * currpad))
|
||||
return
|
||||
|
||||
self.temp_file.seek(0)
|
||||
|
||||
shutil.copyfileobj(self.temp_file, self.fout)
|
||||
|
||||
Reference in New Issue
Block a user