	convert : fix vocab size when not defined in hparams (#3421)
@@ -118,26 +118,19 @@ print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
 
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
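
The substance of the change is the vocab_size fallback: instead of counting entries in tokenizer.json, the converter derives the vocab from AutoTokenizer, uses hparams["vocab_size"] when config.json declares it, falls back to len(tokenizer.vocab) otherwise, and asserts that every token id fits in that range. The snippet below is a minimal standalone sketch of that logic, not code from the commit; it assumes the transformers package is installed, and the resolve_vocab_size helper name is hypothetical.

from transformers import AutoTokenizer


def resolve_vocab_size(hparams: dict, dir_model: str) -> int:
    # Load the tokenizer for the model directory, as the converter does.
    tokenizer = AutoTokenizer.from_pretrained(dir_model)
    # Prefer the size declared in config.json (hparams); otherwise fall back
    # to the number of entries in the tokenizer's own vocab.
    vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
    # Every token id must be smaller than the declared vocab size, or tensor
    # shapes will not match when running inference.
    assert max(tokenizer.vocab.values()) < vocab_size
    return vocab_size


# Hypothetical usage: a config.json without "vocab_size" falls back to the
# tokenizer's vocab length.
# vocab_size = resolve_vocab_size({}, "path/to/model")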