From 64387f6e95434b393ac3df285864692b7fd9c4d2 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com> Date: Thu, 28 Aug 2025 10:56:41 +0200 Subject: [PATCH] gguf-py: byteswapping improvements (#12851) * gguf-py: implement byteswapping for Q4_0 This is needed to byteswap Mistral model. Also restore original shapes after byteswapping tensors. It is not needed at the moment, but do it in case they'd be used in future. * Rework byteswapping code in gguf-py Move out details from byteswapping tensor blocks code --- gguf-py/gguf/scripts/gguf_convert_endian.py | 132 ++++++++++---------- 1 file changed, 68 insertions(+), 64 deletions(-) diff --git a/gguf-py/gguf/scripts/gguf_convert_endian.py b/gguf-py/gguf/scripts/gguf_convert_endian.py index 0e0febaa79..211a3f536a 100755 --- a/gguf-py/gguf/scripts/gguf_convert_endian.py +++ b/gguf-py/gguf/scripts/gguf_convert_endian.py @@ -19,6 +19,61 @@ import gguf logger = logging.getLogger("gguf-convert-endian") +def byteswap_q4_0(tensor, block_offs): + # Each block_q4_0 consists of an f16 delta (scaling factor) followed by 16 int8 quantizations. + + # Byte-Swap f16 sized delta field + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + +def byteswap_q8_0(tensor, block_offs): + # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations. + + # Byte-Swap f16 sized delta field + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + +def byteswap_q4_k(tensor, block_offs): + # Each block_q4_k consists of 2 f16 values followed by 140 int8 values. + + # Byte-Swap f16 sized fields + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + +def byteswap_q6_k(tensor, block_offs): + # Each block_q6_k consists of 208 int8 values followed by 1 f16 value. + + # Byte-Swap f16 sized field + delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + +byteswap_tensors = { + gguf.GGMLQuantizationType.Q4_0: { + "block_size": 18, # 18 bytes = + 16 * + "byteswap_func": byteswap_q4_0, + }, + gguf.GGMLQuantizationType.Q8_0: { + "block_size": 34, # 34 bytes = + 32 * + "byteswap_func": byteswap_q8_0, + }, + gguf.GGMLQuantizationType.Q4_K: { + "block_size": 144, # 144 bytes = 2 * + 140 * + "byteswap_func": byteswap_q4_k, + }, + gguf.GGMLQuantizationType.Q6_K: { + "block_size": 210, # 210 bytes = + 208 * + "byteswap_func": byteswap_q6_k, + }, +} + + def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: file_endian = reader.endianess.name if reader.byte_order == 'S': @@ -32,13 +87,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None sys.exit(0) logger.info("* Checking tensors for conversion compatibility") for tensor in reader.tensors: - if tensor.tensor_type not in ( - gguf.GGMLQuantizationType.F32, - gguf.GGMLQuantizationType.F16, - gguf.GGMLQuantizationType.Q8_0, - gguf.GGMLQuantizationType.Q4_K, - gguf.GGMLQuantizationType.Q6_K, - ): + if tensor.tensor_type not in byteswap_tensors and \ + tensor.tensor_type not in ( + gguf.GGMLQuantizationType.F32, + gguf.GGMLQuantizationType.F16, + ): raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") logger.info(f"* Preparing to convert from {file_endian} to {order}") if args.dry_run: @@ -72,78 +125,29 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None part.byteswap(inplace=True) # Byte-swap tensor data if necessary - if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0: - # Handle Q8_0 tensor blocks (block_q8_0) - # Specific handling of block_q8_0 is required. - # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations. - - block_size = 34 # 34 bytes = + 32 * - - n_blocks = len(tensor.data) // block_size - for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): - block_offs = block_num * block_size - - # Byte-Swap f16 sized delta field - delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) - delta.byteswap(inplace=True) - - # Byte-Swap Q8 weights - if block_num % 100000 == 0: - inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") - - elif tensor.tensor_type == gguf.GGMLQuantizationType.Q4_K: - # Handle Q4_K tensor blocks (block_q4_k) - # Specific handling of block_q4_k is required. - # Each block_q4_k consists of 2 f16 values followed by 140 int8 values. - + if tensor.tensor_type in byteswap_tensors: # first flatten structure + oldshape = tensor.data.shape newshape = 1 for i in tensor.data.shape: newshape *= i tensor.data.resize(newshape) - block_size = 144 + block_size = byteswap_tensors[tensor.tensor_type]["block_size"] + byteswap_func = byteswap_tensors[tensor.tensor_type]["byteswap_func"] + n_blocks = len(tensor.data) // block_size for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): block_offs = block_num * block_size - # Byte-Swap f16 sized fields - delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) - delta.byteswap(inplace=True) + byteswap_func(tensor, block_offs) - delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16) - delta.byteswap(inplace=True) - - # Byte-Swap - if block_num % 100000 == 0: - inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") - - elif tensor.tensor_type == gguf.GGMLQuantizationType.Q6_K: - # Handle Q6_K tensor blocks (block_q6_k) - # Specific handling of block_q6_k is required. - # Each block_q6_k consists of 208 int8 values followed by 1 f16 value. - - # first flatten structure - newshape = 1 - for i in tensor.data.shape: - newshape *= i - - tensor.data.resize(newshape) - - block_size = 210 - n_blocks = len(tensor.data) // block_size - for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): - block_offs = block_num * block_size - - # Byte-Swap f16 sized field - delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16) - delta.byteswap(inplace=True) - - # Byte-Swap if block_num % 100000 == 0: inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") + # restore old shape in case it's ever used + tensor.data.resize(oldshape) else: # Handle other tensor types tensor.data.byteswap(inplace=True)