mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	py : add GPT4All conversion script
For now this is a copy-paste: too much time for me to deduplicate the Python code.
This commit is contained in:
		
							
								
								
									
										107
									
								
								convert-gpt4all-to-ggml.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										107
									
								
								convert-gpt4all-to-ggml.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,107 @@ | |||||||
|  | #!/usr/bin/env python3 | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py | ||||||
|  | # | ||||||
|  |  | ||||||
|  | # Original by https://github.com/eiz | ||||||
|  | # https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818 | ||||||
|  | import argparse | ||||||
|  | import glob | ||||||
|  | import os | ||||||
|  | import struct | ||||||
|  | import sys | ||||||
|  | from sentencepiece import SentencePieceProcessor | ||||||
|  |  | ||||||
|  | HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"] | ||||||
|  |  | ||||||
def parse_args():
    """Build the CLI parser and return the parsed arguments."""
    ap = argparse.ArgumentParser(
        description='Upgrade a GPT4All model to the current format')
    ap.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
    ap.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
    return ap.parse_args()
|  |  | ||||||
def read_header(f_in):
    """Read the legacy ggml header from f_in.

    Returns a tuple of 3 + len(HPARAMS) native-endian ints:
    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype).
    """
    fmt = "i" * (3 + len(HPARAMS))
    raw = f_in.read(struct.calcsize(fmt))
    return struct.unpack(fmt, raw)
|  |  | ||||||
def write_header(f_out, header):
    """Validate an old-style ggml header and write the current-format header.

    header is the 8-int tuple produced by read_header:
    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype).

    Raises ValueError (was a bare Exception; ValueError is a subclass, so
    existing `except Exception` callers still work) if the input magic is not
    the old-style 'ggml' value.
    """
    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header

    # 0x67676d6c == b"ggml" read as a little-endian int: the unversioned format.
    if magic != 0x67676d6c:
        raise ValueError('Invalid file magic. Must be an old style ggml file.')

    values = [
        0x67676d66, # magic: ggmf in hex (new, versioned format)
        1,          # file version
        vocab_size,
        dim,
        multiple_of,
        n_heads,
        n_layers,
        rot,
        ftype
    ]
    f_out.write(struct.pack("i" * len(values), *values))
|  |  | ||||||
def write_tokens(fout, tokenizer):
    """Write the sentencepiece vocabulary to fout.

    Each token is emitted as: int32 length, UTF-8 bytes, float32 score.
    A trailing "<pad>" token with score 0.0 is appended after the vocab.
    """
    n_vocab = tokenizer.vocab_size()
    for tok_id in range(n_vocab):
        if tokenizer.is_unknown(tok_id):
            data = " \u2047 ".encode("utf-8")
        elif tokenizer.is_control(tok_id):
            data = b""
        elif tokenizer.is_byte(tok_id):
            piece = tokenizer.id_to_piece(tok_id)
            # Byte tokens look like "<0xNN>"; anything else is malformed.
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            data = struct.pack("B", int(piece[3:-1], 16))
        else:
            data = tokenizer.id_to_piece(tok_id).replace("\u2581", " ").encode("utf-8")
        fout.write(struct.pack("i", len(data)))
        fout.write(data)
        fout.write(struct.pack("f", tokenizer.get_score(tok_id)))

    # TODO: GPT4All - add extra <pad> token
    pad = "<pad>".encode("utf-8")
    fout.write(struct.pack("i", len(pad)))
    fout.write(pad)
    fout.write(struct.pack("f", 0.0))
|  |  | ||||||
def read_tokens(f_in, tokenizer):
    """Skip past the old-format token table in f_in.

    The old format stores each token as an int32 length followed by that many
    bytes (no score); the data itself is discarded.
    """
    for _ in range(tokenizer.vocab_size()):
        (n,) = struct.unpack("i", f_in.read(4))
        f_in.read(n)
|  |  | ||||||
def copy_all_data(f_out, f_in):
    """Stream the remainder of f_in into f_out in 1 MiB chunks."""
    # iter() with a b"" sentinel stops exactly when read() hits EOF.
    for chunk in iter(lambda: f_in.read(1024 * 1024), b""):
        f_out.write(chunk)
|  |  | ||||||
def convert_one_file(path_in, tokenizer):
    """Upgrade one model file in place.

    Writes the converted data to <path_in>.tmp, keeps the original as
    <path_in>.orig, then moves the temp file into the original's place.
    """
    print(f"converting {path_in}")
    tmp_path = f"{path_in}.tmp"
    backup_path = f"{path_in}.orig"
    with open(path_in, "rb") as src:
        with open(tmp_path, "wb") as dst:
            write_header(dst, read_header(src))
            # Drop the old token table and emit one rebuilt from the
            # sentencepiece model instead.
            read_tokens(src, tokenizer)
            write_tokens(dst, tokenizer)
            copy_all_data(dst, src)
    os.rename(path_in, backup_path)
    os.rename(tmp_path, path_in)
|  |  | ||||||
def main():
    """Entry point: load the LLaMA tokenizer and convert the GPT4All model."""
    args = parse_args()
    sp = SentencePieceProcessor(args.tokenizer_model)
    convert_one_file(args.gpt4all_model, sp)

if __name__ == "__main__":
    main()
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov