mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-28 08:31:25 +00:00 
			
		
		
		
	 cbef542879
			
		
	
	cbef542879
	
	
	
		
			
			- use f-strings where possible - drop first param of encode/decode functions since "utf-8" is the default
		
			
				
	
	
		
			312 lines
		
	
	
		
			9.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			312 lines
		
	
	
		
			9.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
 | |
| #
 | |
| # We caused a breaking change to the file format on 2023-03-30 in:
 | |
| #     https://github.com/ggerganov/llama.cpp/pull/613
 | |
| #
 | |
| # (1) If you still have the Meta LLaMA .pth files, then close this
 | |
| #     file now; you can just run `convert-pth-to-ggml.py` again to
 | |
| #     migrate to the new format. The tool is easier to use too. It
 | |
| #     isn't necessary anymore to manage split output files because
 | |
| #     the new format always combines things into a single file.
 | |
| #
 | |
| # (2) If you deleted the Meta LLaMA .pth files to save on disk
 | |
| #     space, then this tool is intended to help you.  Please check
 | |
| #     out the instructions below.
 | |
| #
 | |
| # USAGE
 | |
| #
 | |
| #     python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
 | |
| #
 | |
| # PREREQUISITES
 | |
| #
 | |
| #     pip install numpy
 | |
| #     cd llama.cpp
 | |
| #     make -j4
 | |
| #
 | |
| # EXAMPLE (7B MODEL)
 | |
| #
 | |
| #     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
 | |
| #     python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
 | |
| #
 | |
| #     # check that it works
 | |
| #     ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
 | |
| #
 | |
| #     # you can delete the old files
 | |
| #     rm -f models/7B/ggml-model-f16.bin
 | |
| #     mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
 | |
| #
 | |
| # EXAMPLE (13B MODEL)
 | |
| #
 | |
| #     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
 | |
| #     python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
 | |
| #
 | |
| #     # check that it works
 | |
| #     ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
 | |
| #
 | |
| #     # you can delete the old files
 | |
| #     rm -f models/13B/ggml-model-f16.bin*
 | |
| #     mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
 | |
| #
 | |
| 
 | |
| import argparse
 | |
| import os
 | |
| import sys
 | |
| import json
 | |
| import struct
 | |
| import numpy as np
 | |
| 
 | |
# Number of elements grouped into one block for the Q4 types.
QK = 32

# ggml tensor type ids (consecutive integers starting at 0).
(
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_F16,
    GGML_TYPE_F32,
) = range(7)

# Display name for each ftype code read from a tensor header.
WTYPE_NAMES = dict(enumerate(("F32", "F16", "Q4_0", "Q4_1")))

# ggml type id for each ftype code read from a tensor header.
WTYPES = dict(enumerate((
    GGML_TYPE_F32,
    GGML_TYPE_F16,
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
)))

# Elements per block for each ggml type; only the Q4 types use blocks > 1.
GGML_BLCK_SIZE = {
    **dict.fromkeys((GGML_TYPE_Q4_0, GGML_TYPE_Q4_1), QK),
    **dict.fromkeys(
        (GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_F32),
        1,
    ),
}

# Bytes per block for each ggml type.
# NOTE(review): the Q4 sizes look like one (Q4_0) or two (Q4_1) 4-byte
# values followed by QK 4-bit values -- confirm against the ggml sources.
GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: QK // 2 + 4,
    GGML_TYPE_Q4_1: QK // 2 + 4 * 2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
}

# Header field names, in file order; each field is stored as one int32.
HPARAMS = [
    'magic', 'version', 'n_vocab',
    'n_embd', 'n_mult', 'n_head',
    'n_layer', 'n_rot', 'f16',
]
 | |
| 
 | |
def read_hparams(fin):
    """Read the file header from ``fin`` and return it as a dict.

    The header holds one native-order C ``int`` per entry of ``HPARAMS``.
    The returned dict maps each name in ``HPARAMS`` to the integer read
    for it. ``fin`` is left positioned just after the header.
    """
    layout = struct.Struct("i" * len(HPARAMS))
    raw = fin.read(layout.size)
    return dict(zip(HPARAMS, layout.unpack(raw)))
 | |
| 
 | |
def write_hparams(fout, hparams):
    """Write the header fields of ``hparams`` to ``fout``.

    One native-order C ``int`` is written per entry of ``HPARAMS``, in that
    order. ``hparams`` must contain every name listed in ``HPARAMS``;
    a missing name raises ``KeyError``.
    """
    values = [hparams[name] for name in HPARAMS]
    fout.write(struct.pack("i" * len(HPARAMS), *values))
 | |
| 
 | |
def read_tokens(fin, hparams):
    """Read ``hparams['n_vocab']`` vocabulary entries from ``fin``.

    Each entry is stored as an int32 byte length, that many raw bytes, and
    a float32 score. Returns a list of ``(bytes, float)`` tuples in file
    order.
    """
    vocab = []
    for _ in range(hparams['n_vocab']):
        (n_bytes,) = struct.unpack("i", fin.read(4))
        text = fin.read(n_bytes)
        (score,) = struct.unpack("f", fin.read(4))
        vocab.append((text, score))
    return vocab
 | |
| 
 | |
def write_tokens(fout, tokens):
    """Write vocabulary entries to ``fout``.

    ``tokens`` is an iterable of ``(bytes, float)`` pairs. Each pair is
    emitted as an int32 byte length, the raw bytes, and a float32 score.
    """
    for text, score in tokens:
        record = b"".join((
            struct.pack("i", len(text)),
            text,
            struct.pack("f", score),
        ))
        fout.write(record)
 | |
| 
 | |
def ggml_nelements(shape):
    """Return the product of all entries of ``shape`` (1 if it is empty)."""
    total = 1
    for extent in shape:
        total = total * extent
    return total
 | |
| 
 | |
def ggml_nbytes(shape, ftype):
    """Return the number of data bytes of a tensor.

    ``shape`` is the list of dimensions and ``ftype`` the code from the
    tensor header (a key of ``WTYPES``). The size is the element count
    times the per-block byte size, floor-divided by the block size.
    """
    count = ggml_nelements(shape)
    ggml_type = WTYPES[ftype]
    return count * GGML_TYPE_SIZE[ggml_type] // GGML_BLCK_SIZE[ggml_type]
 | |
| 
 | |
def copy_tensors(fin, fout, part_id, n_parts):
    """Copy every tensor of one input part from ``fin`` into ``fout``.

    ``fin`` must be positioned at the first tensor header of part
    ``part_id`` (0-based) out of ``n_parts``; ``fout`` must be seekable and
    positioned where the first output tensor header belongs. For each
    tensor the full (un-sharded) header is written, the data offset is
    padded with zero bytes to a multiple of ``QK``, and this part's slice
    of the data is written at its place inside the full tensor. Tensors
    with one dimension, or any tensor when ``n_parts == 1``, are written
    only while processing part 0.
    """
    while True:
        # tensor header: n_dims, name length, ftype (one int32 each); an
        # empty read on the first field means this part has no more tensors
        head = fin.read(4)
        if not head:
            break
        (n_dims,) = struct.unpack("i", head)
        (name_len,) = struct.unpack("i", fin.read(4))
        (ftype,) = struct.unpack("i", fin.read(4))

        assert n_dims in (1, 2)

        # dimensions are stored in the opposite order to `partshape`
        stored_dims = [struct.unpack("i", fin.read(4))[0] for _ in range(n_dims)]
        partshape = stored_dims[::-1]

        name = fin.read(name_len)
        data = fin.read(ggml_nbytes(partshape, ftype))

        ggml_type = WTYPES[ftype]
        blck_size = GGML_BLCK_SIZE[ggml_type]
        type_size = GGML_TYPE_SIZE[ggml_type]

        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")

        # pick the dimension along which a multipart tensor is sharded
        #
        # split along dim 0:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split along dim 1:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #   - anything else
        #
        if n_dims > 1:
            if b"tok_embeddings" in name:
                split_dim = 1
            elif b"layers" in name:
                by_columns = (
                    b"attention.wo.weight" in name
                    or b"feed_forward.w2.weight" in name
                )
                split_dim = 1 if by_columns else 0
            elif b"output" in name:
                split_dim = 0
            else:
                split_dim = 1

        # header of the merged tensor: the sharded dimension grows n_parts-fold
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        fout.write(struct.pack("iii", n_dims, len(name), ftype))
        fout.write(struct.pack("i" * n_dims, *reversed(fullshape)))
        fout.write(name)

        # zero-pad so that the tensor data starts at a multiple of QK
        tensor_data_offset = fout.tell()
        padding = -tensor_data_offset % QK
        if padding:
            fout.write(bytes(padding))
            tensor_data_offset += padding

        # place this part's data inside the merged tensor
        if n_dims == 1 or n_parts == 1:
            # the tensor is complete in every part; part 0 supplies it
            if part_id == 0:
                fout.write(data)
        elif split_dim == 0:
            # this part holds a contiguous run of whole rows
            full_row_bytes = fullshape[1] // blck_size * type_size
            first_row = part_id * partshape[0]
            fout.seek(tensor_data_offset + first_row * full_row_bytes)
            fout.write(data)
        elif split_dim == 1:
            # this part holds a column range of every row
            part_row_bytes = partshape[1] // blck_size * type_size
            full_row_bytes = fullshape[1] // blck_size * type_size
            first_col = part_id * partshape[1]
            col_offset = first_col // blck_size * type_size
            for row in range(partshape[0]):
                fout.seek(tensor_data_offset + row * full_row_bytes + col_offset)
                start = row * part_row_bytes
                fout.write(data[start:start + part_row_bytes])

        # leave the output positioned at the next tensor header
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
 | |
| 
 | |
def parse_args():
    """Parse the command line and return the argparse namespace.

    Two positional arguments are expected: ``fin_path`` (the old ggml
    file) followed by ``fout_path`` (the new ggjt file).
    """
    cli = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
    positionals = (
        ('fin_path', 'your old ggml file (leave out the .1 .2 etc.)'),
        ('fout_path', 'your new ggjt file name'),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, help=help_text)
    return cli.parse_args()
 | |
| 
 | |
def main():
    """Convert a (possibly multi-part) 'ggmf' model into one 'ggjt' file.

    Reads the header and vocabulary from the input path, counts the extra
    part files named ``<input>.1``, ``<input>.2``, ... and writes a single
    output file containing the header (with the magic changed to 'ggjt'),
    the vocabulary, and the tensors of all parts merged by
    ``copy_tensors``. Exits with status 1 if the paths are unusable or the
    input does not carry the 'ggmf' magic.
    """
    args = parse_args()

    # Explicit checks instead of `assert`: asserts vanish under `python -O`,
    # and opening the output with "wb" on top of the input would destroy the
    # tensors before they are read.
    if not args.fin_path or not args.fout_path:
        print("input and output paths must not be empty\n")
        sys.exit(1)
    same_file = args.fin_path == args.fout_path or (
        # also catch the same file reached through a different path
        os.path.exists(args.fout_path)
        and os.path.samefile(args.fin_path, args.fout_path)
    )
    if same_file:
        print(f"{args.fout_path}: output file must be different from the input file\n")
        sys.exit(1)

    with open(args.fin_path, "rb") as fin:
        hparams = read_hparams(fin)

        # Validate the magic before touching the vocabulary: with any other
        # file layout 'n_vocab' and the token lengths are garbage and
        # read_tokens would fail with an unhelpful struct.error.
        if hparams['magic'] == 0x67676a74:  # ggjt
            print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
            sys.exit(1)

        if hparams['magic'] != 0x67676d66:  # ggmf
            print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
            sys.exit(1)

        tokens = read_tokens(fin, hparams)

    hparams['magic'] = 0x67676a74  # ggjt

    # count number of multipart files by convention
    n_parts = 1
    while os.path.exists(f"{args.fin_path}.{n_parts}"):
        n_parts += 1

    # all parts are merged into one output file
    with open(args.fout_path, "wb") as fout:
        write_hparams(fout, hparams)
        write_tokens(fout, tokens)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            # every part rewrites the same tensor region of the output
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
                fin_path += f".{part_id}"
            with open(fin_path, "rb") as fin:
                # skip this part's own header and vocabulary
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)

    print(f"Done. Output file: {args.fout_path}\n")


if __name__ == "__main__":
    main()
 |