# Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-03 09:22:01 +00:00).
# Commit note: use f-strings where possible; drop the first parameter of the
# encode/decode functions, since "utf-8" is the default.
# File info: 275 lines, 8.9 KiB, Python.
# Convert a LLaMA model checkpoint to a ggjt compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
#   - Number of dimensions (int)
#   - Name length (int)
#   - Data type (int: 0 = float32, 1 = float16)
#   - Dimensions (int[n_dims])
#   - Name (char[name_length])
#   - Zero padding up to the next 32-byte boundary
#   - Data (n_elements values of the given data type)
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#

import argparse
import json
import os
import struct
import sys

import numpy as np
import torch

from sentencepiece import SentencePieceProcessor

# Number of elements per block for the Q4 types (see GGML_BLCK_SIZE).
QK = 32

# Numeric ids of the GGML tensor types.
GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6

# Maps a file-type code (the `ftype` value used throughout this script)
# to the GGML tensor type id it stands for.
WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
}

# Number of elements stored in one block of each GGML type.
GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0: QK,
    GGML_TYPE_Q4_1: QK,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 1,
    GGML_TYPE_I32: 1,
    GGML_TYPE_F16: 1,
    GGML_TYPE_F32: 1,
}

# Size in bytes of one block of each GGML type.
GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4 + QK // 2,
    GGML_TYPE_Q4_1: 4 * 2 + QK // 2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
}

def ggml_nelements(shape):
    """Return the total number of elements of a tensor with the given shape.

    `shape` is any iterable of per-dimension sizes; an empty shape yields 1
    (the empty product).
    """
    count = 1
    for extent in shape:
        count *= extent
    return count

def ggml_nbytes(shape, ftype):
    """Return the number of bytes a tensor of `shape` occupies when stored as `ftype`.

    `ftype` is a file-type code (a key of WTYPES). The size is the element
    count multiplied by the per-block byte size, integer-divided by the
    number of elements per block.
    """
    ggml_type = WTYPES[ftype]
    n_bytes = ggml_nelements(shape) * GGML_TYPE_SIZE[ggml_type]
    return n_bytes // GGML_BLCK_SIZE[ggml_type]

def parse_args():
    """Parse the command line and return the resulting argparse namespace.

    Positional arguments:
      dir_model   directory containing the model checkpoint (required)
      ftype       0 for float32, 1 for float16 (optional, defaults to 1)
      vocab_only  non-zero to write only the vocabulary (optional, defaults to 0)
    """
    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
    parser.add_argument('dir_model', help='directory containing the model checkpoint')
    # nargs='?' is what lets the declared default apply: without it a
    # positional argument is mandatory and its default is never used.
    parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1, nargs='?')
    parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
    return parser.parse_args()

def get_n_parts(dim):
    """Return the number of checkpoint parts for a model with embedding size `dim`.

    Only the sizes listed in the table below are known. For any other value
    an error message is printed and the process exits with status 1.
    """
    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    n_parts = mappings.get(dim)
    if n_parts is None:
        print(f"Invalid dim: {dim}")
        sys.exit(1)

    print(f"n_parts = {n_parts}\n")
    return n_parts

def load_hparams_and_tokenizer(dir_model):
    """Load the hyperparameters and the tokenizer that belong to a model directory.

    Reads `params.json` from `dir_model` and `tokenizer.model` from the
    directory that textually contains `dir_model`. The tokenizer's vocabulary
    size is stored in the hyperparameters under "vocab_size".

    Returns a `(hparams, tokenizer)` tuple.
    """
    # `dir_model` is something like `models/7B` or `models/7B/`.
    # "tokenizer.model" is expected under model's parent dir.
    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
    # Let's use the model's parent dir directly.
    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
    fname_hparams = os.path.join(dir_model, "params.json")
    # os.path.join keeps the path relative when the parent is "" (e.g. for
    # dir_model == "7B"); plain string formatting would produce
    # "/tokenizer.model", which points at the filesystem root.
    fname_tokenizer = os.path.join(model_parent_dir, "tokenizer.model")
    with open(fname_hparams, "r", encoding="utf-8") as f:
        hparams = json.load(f)
        print(hparams)
    tokenizer = SentencePieceProcessor(fname_tokenizer)
    hparams.update({"vocab_size": tokenizer.vocab_size()})
    return hparams, tokenizer

def write_header(fout, hparams, ftype):
    """Write the file header to `fout`.

    The header is a sequence of native C ints: the magic number, the file
    version, the hyperparameters "vocab_size", "dim", "multiple_of",
    "n_heads" and "n_layers", then `dim // n_heads`, and finally `ftype`.
    `hparams` must contain all five keys.
    """
    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
    values = [
        0x67676a74,  # magic: ggjt in hex
        1,  # file version
        *[hparams[key] for key in keys],
        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
        ftype,
    ]
    fout.write(struct.pack("i" * len(values), *values))

def write_tokens(fout, tokenizer):
    """Write every vocabulary entry of `tokenizer` to `fout`.

    Each entry is written as: text length (int), the text bytes, and the
    token score (float). The text is chosen as follows:
      - unknown token: " \u2047 " encoded as UTF-8
      - control token: empty
      - byte token:    the single byte whose hex value is embedded in the piece
      - anything else: the piece with U+2581 replaced by a space, as UTF-8

    A byte token whose piece is not exactly 6 characters long prints an
    error and exits the process with status 1.
    """
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            # the two hex digits sit between the 3-character prefix and the
            # final character of the 6-character piece
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

def process_and_write_variables(fout, model, ftype, part_id, n_parts):
    """Write the tensors of one checkpoint part into the output file.

    Parameters:
      fout     seekable binary file positioned at the first tensor record
      model    mapping of tensor name to tensor for this part; each tensor
               must provide `.numpy()` and `.dtype`
      ftype    requested file type (0: float32, 1: float16)
      part_id  zero-based index of this part
      n_parts  total number of parts the checkpoint is split into

    Tensors whose name ends in "freqs" are skipped. For every other tensor
    the record header (with the full, reassembled shape) is written, and this
    part's slice of the data is stored at its position inside the full
    tensor. One-dimensional tensors are always stored as float32 and only
    part 0 writes their data.
    """
    for name, datao in model.items():
        if name.endswith("freqs"):
            continue

        # remove dimensions with a single element
        data = datao.numpy().squeeze()
        partshape = data.shape
        n_dims = len(partshape)
        assert n_dims in (1, 2), f"unsupported number of dimensions: {n_dims}"

        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")

        # coerce single-dimensional tensors from float16 to float32
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
        elif data.dtype != np.float16:
            # The header below declares float16 and every offset assumes two
            # bytes per element, so data of any other dtype must be converted
            # or the output file is corrupted.
            print("  Converting to float16")
            data = data.astype(np.float16)
        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]

        # determine dimension along which multipart tensor is sharded
        # (only meaningful for two-dimensional tensors)
        split_dim = _get_split_dim(name) if n_dims > 1 else None

        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        sname = name.encode()
        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
        # dimensions are stored in reverse order
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)

        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        padding = -tensor_data_offset % QK
        fout.write(b"\x00" * padding)
        tensor_data_offset += padding

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            if part_id == 0:
                data.tofile(fout)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            data.tofile(fout)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                data[row].tofile(fout)

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))


def _get_split_dim(name):
    """Return the axis along which the named 2-D tensor is sharded across parts.

    split_dim 0:
      - output.*
      - layers.*.attention.wq.weight
      - layers.*.attention.wk.weight
      - layers.*.attention.wv.weight
      - layers.*.feed_forward.w1.weight
      - layers.*.feed_forward.w3.weight

    split_dim 1:
      - tok_embeddings.*
      - layers.*.attention.wo.weight
      - layers.*.feed_forward.w2.weight
      - any name matching none of the patterns above
    """
    if "tok_embeddings" in name:
        return 1
    if "layers" in name:
        if "attention.wo.weight" in name or "feed_forward.w2.weight" in name:
            return 1
        return 0
    if "output" in name:
        return 0
    return 1

def main():
    """Convert the checkpoint named on the command line into a single output file.

    With `vocab_only` set, only the header and the vocabulary are written to
    `<dir_model>/ggml-vocab.bin`. Otherwise the header, the vocabulary and
    all tensors from every checkpoint part are written to
    `<dir_model>/ggml-model-<f32|f16>.bin`.
    """
    args = parse_args()
    dir_model = args.dir_model
    ftype = args.ftype
    ftype_str = ["f32", "f16"]
    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

    print(args)

    # if only writing vocab to file
    if args.vocab_only:
        fname_model = f"{dir_model}/consolidated.00.pth"
        fname_out = f"{dir_model}/ggml-vocab.bin"
        print(f"Extracting only the vocab from '{fname_model}'\n")
        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
        print(f"Done. Output file: {fname_out}\n")
        return

    n_parts = get_n_parts(hparams["dim"])
    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"

    # we output a single file for ggml
    with open(fname_out, "wb") as fout:
        write_header(fout, hparams, ftype)
        write_tokens(fout, tokenizer)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            # every part rewrites the tensor records from the same starting
            # offset and fills in its own slice of the data
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fname_model = f"{dir_model}/consolidated.{part_id:02d}.pth"
            # NOTE: torch.load unpickles the file; only convert checkpoints
            # obtained from a trusted source.
            model = torch.load(fname_model, map_location="cpu")
            process_and_write_variables(fout, model, ftype, part_id, n_parts)
            # release this part before the next one is loaded
            del model

    print(f"Done. Output file: {fname_out}\n")

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()