	Use tokenizer.vocab_size() instead of hardcoding 32000 in convert-pth-to-ggml.py (#142)
Special tokens or other new tokens can be added to the tokenizer, so it's best not to assume the vocabulary is exactly 32000 tokens.
@@ -99,7 +99,7 @@ for p in range(n_parts):
     fout.write(struct.pack("i", ftype))
 
     # Is this correct??
-    for i in range(32000):
+    for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
             # "<unk>" token (translated as ??)
             text = " \u2047 ".encode("utf-8")
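For context, this loop walks every id in the SentencePiece vocabulary and serializes the corresponding piece into the ggml file. Below is a minimal, self-contained sketch of the pattern after the change; the model path is illustrative, and the real script also special-cases control and byte tokens and writes lengths with struct.pack:

from sentencepiece import SentencePieceProcessor

# Illustrative path; convert-pth-to-ggml.py loads the tokenizer model
# from the model directory it is given.
tokenizer = SentencePieceProcessor("models/tokenizer.model")

# 32000 for the stock LLaMA tokenizer, but not guaranteed once special
# or added tokens are present, which is what this commit guards against.
print(tokenizer.vocab_size())

for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
        # "<unk>" has no printable piece; the converter renders it as "⁇"
        text = " \u2047 ".encode("utf-8")
    else:
        text = tokenizer.id_to_piece(i).encode("utf-8")
    # ... the converter then writes len(text) and text to the output file ...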
Author: Ronsor