Mirror of https://github.com/ggml-org/llama.cpp.git
	py : improve BPE tokenizer support (#5189)
@@ -334,7 +334,10 @@ class Params:
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        try:
+            self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        except:
+            self.vocab = self.bpe_tokenizer
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
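Why the fallback helps: a Hugging Face `tokenizer.json` nests the vocabulary under `["model"]["vocab"]`, while an older standalone `vocab.json` is itself a flat token-to-id mapping, so the nested lookup fails and the whole document is used as the vocabulary. A minimal sketch of the two layouts this change accepts (the sample file contents and the `load_vocab` helper are illustrative, not part of the commit):

```python
import json

# Layout written by Hugging Face tokenizers (tokenizer.json): vocab nested under "model".
nested = json.loads('{"model": {"type": "BPE", "vocab": {"hello": 0, "world": 1}}}')

# Older standalone vocab.json: the whole file is the token-to-id mapping.
flat = json.loads('{"hello": 0, "world": 1}')

def load_vocab(bpe_tokenizer: dict) -> dict:
    # Same fallback idea as the diff; the commit uses a bare `except:`,
    # narrowed here to the errors a missing or flat layout actually raises.
    try:
        return bpe_tokenizer["model"]["vocab"]
    except (KeyError, TypeError):
        return bpe_tokenizer

assert load_vocab(nested) == load_vocab(flat) == {"hello": 0, "world": 1}
```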
Author: Sang-Kil Park