mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	Improve handling of special tokens in GGML to GGUF converter (#2725)
* Improve UNK, BOS, EOS token handling when converting without metadata.
* Allow importing as a module.
* Remove some obsolete code and minor cleanups.
* Set default UNK token mapping from -1 to 0 in llama.cpp.
* Try to handle overflow due to buggy Windows Python with a better error message.
This commit is contained in:
		| @@ -1,10 +1,12 @@ | ||||
| import sys, struct, math, argparse | ||||
| import sys, struct, math, argparse, warnings | ||||
| from pathlib import Path | ||||
|  | ||||
| import numpy as np | ||||
|  | ||||
| import gguf | ||||
|  | ||||
| warnings.filterwarnings('error') | ||||
|  | ||||
| # Note: Does not support GGML_QKK_64 | ||||
| QK_K = 256 | ||||
| # Items here are (block size, type size) | ||||
| @@ -215,15 +217,10 @@ class GGMLToGGUF: | ||||
|         if self.vocab_override is not None: | ||||
|             vo = self.vocab_override | ||||
|             print('* Adding vocab item(s)') | ||||
|             for (idx, vitem) in enumerate(vo.all_tokens()): | ||||
|                 if len(vitem) == 3: | ||||
|                     tokens.append(vitem[0]) | ||||
|                     scores.append(vitem[1]) | ||||
|                     toktypes.append(vitem[2]) | ||||
|                 else: | ||||
|                     # Maybe try to guess the token type here? | ||||
|                     tokens.append(vitem[0]) | ||||
|                     scores.append(vitem[1]) | ||||
|             for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): | ||||
|                 tokens.append(vbytes) | ||||
|                 scores.append(score) | ||||
|                 toktypes.append(ttype) | ||||
|             assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}' | ||||
|             gguf_writer.add_token_list(tokens) | ||||
|             gguf_writer.add_token_scores(scores) | ||||
| @@ -231,9 +228,21 @@ class GGMLToGGUF: | ||||
|                 gguf_writer.add_token_types(toktypes) | ||||
|             return | ||||
|         print(f'* Adding {hp.n_vocab} vocab item(s)') | ||||
|         assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab' | ||||
|         for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): | ||||
|             tt = 1 # Normal | ||||
|             if len(vbytes) == 0: | ||||
|             # Special handling for UNK, BOS, EOS tokens. | ||||
|             if tokid <= 2: | ||||
|                 if tokid == 0: | ||||
|                     vbytes = b'<unk>' | ||||
|                     tt = 2 | ||||
|                 elif tokid == 1: | ||||
|                     vbytes = b'<s>' | ||||
|                     tt = 3 | ||||
|                 else: | ||||
|                     vbytes = b'</s>' | ||||
|                     tt = 3 | ||||
|             elif len(vbytes) == 0: | ||||
|                 tt = 3 # Control | ||||
|             elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1: | ||||
|                 vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8') | ||||
| @@ -246,6 +255,9 @@ class GGMLToGGUF: | ||||
|         gguf_writer.add_token_list(tokens) | ||||
|         gguf_writer.add_token_scores(scores) | ||||
|         gguf_writer.add_token_types(toktypes) | ||||
|         gguf_writer.add_unk_token_id(0) | ||||
|         gguf_writer.add_bos_token_id(1) | ||||
|         gguf_writer.add_eos_token_id(2) | ||||
|  | ||||
|     def add_tensors(self, gguf_writer): | ||||
|         nm = self.name_map | ||||
| @@ -315,7 +327,11 @@ def main(): | ||||
|     data = np.memmap(cfg.input, mode = 'r') | ||||
|     model = GGMLV3Model() | ||||
|     print('* Scanning GGML input file') | ||||
|     offset = model.load(data, 0) | ||||
|     try: | ||||
|         offset = model.load(data, 0) | ||||
|     except OverflowError: | ||||
|         print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr) | ||||
|         raise | ||||
|     print(f'* GGML model hyperparameters: {model.hyperparameters}') | ||||
|     vocab_override = None | ||||
|     params_override = None | ||||
| @@ -330,4 +346,5 @@ def main(): | ||||
|     converter.save() | ||||
|     print(f'* Successful completion. Output saved to: {cfg.output}') | ||||
|  | ||||
| main() | ||||
| if __name__ == '__main__': | ||||
|     main() | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Kerfuffle
					Kerfuffle