mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			182 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			182 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # Convert a LLaMA model checkpoint to a ggml compatible file
 | |
| #
 | |
| # Load the model using Torch
 | |
| # Iterate over all variables and write them to a binary file.
 | |
| #
 | |
| # For each variable, write the following:
 | |
| #   - Number of dimensions (int)
 | |
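| #   - Name length (int)
 | |
| #   - Data type (int; 0: float32, 1: float16)
 | |
| #   - Dimensions (int[n_dims], in reverse order of the tensor shape)
 | |
| #   - Name (char[name_length])
 | |
| #   - Data (float32 or float16 values, one per tensor element)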
 | |
| #
 | |
| # At the start of the ggml file we write the model parameters
 | |
| # and vocabulary.
 | |
| #
 | |
| 
 | |
| import argparse
 | |
| import os
 | |
| import sys
 | |
| import json
 | |
| import struct
 | |
| import numpy as np
 | |
| import torch
 | |
| 
 | |
| from sentencepiece import SentencePieceProcessor
 | |
| 
 | |
def parse_args():
    """Parse the converter's command-line arguments from ``sys.argv``.

    Positional arguments:
        dir_model:  directory containing the model checkpoint.
        ftype:      output file type (0: float32, 1: float16). Optional;
                    defaults to 1 when omitted.
        vocab_only: optional integer flag; a non-zero value makes the script
                    write only the vocabulary. Defaults to 0.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
    parser.add_argument('dir_model', help='directory containing the model checkpoint')
    # A positional without nargs is always required, which made default=1
    # unreachable. nargs='?' lets the declared default apply when ftype is
    # omitted; every invocation that parsed before still parses the same way.
    parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1, nargs='?')
    parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
    return parser.parse_args()
 | |
| 
 | |
def get_n_parts(dim):
    """Return the number of checkpoint parts for a model of width ``dim``.

    The supported widths and their part counts are listed in ``mappings``.
    The part count is also printed to stdout.

    Args:
        dim: the model's ``dim`` hyper-parameter.

    Returns:
        The number of parts for ``dim``.

    Raises:
        SystemExit: with exit code 1, after printing a message, when ``dim``
            is not one of the known widths.
    """
    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}

    # Guard clause: unknown widths terminate the script.
    if dim not in mappings:
        print(f"Invalid dim: {dim}")
        sys.exit(1)

    n_parts = mappings[dim]
    print(f"n_parts = {n_parts}\n")
    return n_parts
 | |
| 
 | |
def load_hparams_and_tokenizer(dir_model):
    """Load the hyper-parameters and the SentencePiece tokenizer of a model.

    ``params.json`` is read from ``dir_model``; ``tokenizer.model`` is read
    from the parent directory of ``dir_model``. The loaded hyper-parameters
    are printed to stdout.

    Args:
        dir_model: model directory, e.g. ``models/7B`` or ``models/7B/``.

    Returns:
        A ``(hparams, tokenizer)`` tuple, where ``hparams`` is the parsed JSON
        dict with an added ``"vocab_size"`` entry taken from the tokenizer.
    """
    # "tokenizer.model" is expected under the model's parent dir.
    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would
    # not be found, so derive the parent dir from the normalized path instead.
    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))

    # os.path.join skips empty components: for a bare relative `dir_model`
    # such as "7B" the parent is "", and string concatenation would have
    # produced "/tokenizer.model" at the filesystem root.
    fname_hparams = os.path.join(dir_model, "params.json")
    fname_tokenizer = os.path.join(model_parent_dir, "tokenizer.model")

    with open(fname_hparams, "r", encoding="utf-8") as f:
        hparams = json.load(f)
    print(hparams)

    tokenizer = SentencePieceProcessor(fname_tokenizer)
    hparams["vocab_size"] = tokenizer.vocab_size()

    return hparams, tokenizer
 | |
| 
 | |
def write_header(fout, hparams, ftype):
    """Write the ggml file header to ``fout``.

    The header is a run of native-order C ints: the magic number, the file
    version, the hyper-parameters named in ``keys`` (in that order), the
    per-head dimension, and ``ftype``.

    Args:
        fout: binary file-like object with a ``write`` method.
        hparams: dict holding at least the entries named in ``keys``.
        ftype: file type value stored as the last header field.
    """
    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
    values = [
        0x67676d66,  # magic: ggmf in hex
        1,  # file version
        *[hparams[key] for key in keys],
        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
        ftype,
    ]
    fout.write(struct.pack("i" * len(values), *values))
 | |
| 
 | |
def write_tokens(fout, tokenizer):
    """Write every vocabulary entry of ``tokenizer`` to ``fout``.

    Each entry is written as: text length in bytes (int), the text bytes,
    and the token score (float).

    Args:
        fout: binary file-like object with a ``write`` method.
        tokenizer: object providing ``vocab_size``, ``is_unknown``,
            ``is_control``, ``is_byte``, ``id_to_piece`` and ``get_score``.

    Raises:
        SystemExit: with exit code 1 when a byte token's piece is not exactly
            six characters long.
    """
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode("utf-8")
        elif tokenizer.is_control(i):
            # Control tokens are stored with empty text.
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            # Characters 3 and 4 of the six-character piece are the hex
            # digits of the byte value.
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            # U+2581 in a piece is replaced by a plain space.
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")

        # One write per record: length, text, score.
        record = struct.pack("i", len(text)) + text + struct.pack("f", tokenizer.get_score(i))
        fout.write(record)
 | |
| 
 | |
def process_and_write_variables(fout, model, ftype):
    """Write every tensor in ``model`` to ``fout``.

    Tensors whose name ends in ``"freqs"`` are skipped. For each remaining
    tensor the function writes a header (number of dimensions, name length
    and data type as ints, then the dimensions in reverse order, then the
    UTF-8 name) followed by the raw tensor data.

    Args:
        fout: binary file opened for writing. ``ndarray.tofile`` is used for
            the payload, so this must be a real file object.
        model: mapping of tensor name to tensor (anything exposing ``shape``,
            ``dtype`` and ``numpy()``).
        ftype: requested file type; 0 stores everything as float32, otherwise
            tensors are stored as float16 except those with one dimension,
            which are always float32.
    """
    for name, datao in model.items():

        if name.endswith("freqs"):
            continue

        shape = datao.shape

        print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")

        data = datao.numpy().squeeze()
        n_dims = len(shape)

        if ftype == 0 or n_dims == 1:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
        else:
            # The header below declares fp16, so the payload must really be
            # fp16. Without this cast, a checkpoint stored in another dtype
            # would be written with a header that does not match its data.
            # copy=False avoids a copy when the data is already float16.
            data = data.astype(np.float16, copy=False)
            ftype_cur = 1

        # header
        sname = name.encode('utf-8')
        fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
        for dim in reversed(data.shape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)

        # data output to file
        data.tofile(fout)
 | |
| 
 | |
def main():
    """Entry point: convert the checkpoint in ``dir_model`` to ggml files.

    In vocab-only mode a single ``ggml-vocab.bin`` holding the vocabulary
    size and the tokens is written. Otherwise each checkpoint part
    ``consolidated.NN.pth`` is converted to its own output file containing
    the header, the tokens and the tensors.
    """
    args = parse_args()
    dir_model = args.dir_model
    ftype = args.ftype
    ftype_str = ["f32", "f16"]

    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

    print(args)

    # if only writing vocab to file
    if args.vocab_only:

        fname_model = f"{dir_model}/consolidated.00.pth"
        fname_out = f"{dir_model}/ggml-vocab.bin"

        print(f"Extracting only the vocab from '{fname_model}'\n")

        # The vocabulary comes entirely from the tokenizer, so the checkpoint
        # is not loaded here; it used to be read and then discarded unused.
        with open(fname_out, "wb") as fout:
            fout.write(struct.pack("i", hparams["vocab_size"]))
            write_tokens(fout, tokenizer)

        print(f"Done. Output file: {fname_out}\n")

        return

    n_parts = get_n_parts(hparams["dim"])

    for p in range(n_parts):

        print(f"Processing part {p}\n")

        # Two-digit, zero-padded part index (identical to "0{p}" for p < 10).
        fname_model = f"{dir_model}/consolidated.{p:02d}.pth"
        fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"

        # NOTE(security): torch.load unpickles the file; only run this on
        # checkpoints from a trusted source.
        model = torch.load(fname_model, map_location="cpu")

        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
            process_and_write_variables(fout, model, ftype)

        # Drop the reference to this part before the next one is loaded.
        del model

        print(f"Done. Output file: {fname_out}, (part {p})\n")
 | |
| 
 | |
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
 | 
