mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	convert : fix broken sentencepiece vocab (#14416)
This commit is contained in:
		| @@ -936,7 +936,11 @@ class TextModel(ModelBase): | |||||||
|         scores: list[float] = [-10000.0] * vocab_size |         scores: list[float] = [-10000.0] * vocab_size | ||||||
|         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size |         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size | ||||||
|  |  | ||||||
|         for token_id in range(vocab_size): |         for token_id in range(tokenizer.vocab_size()): | ||||||
|  |             if token_id >= vocab_size: | ||||||
|  |                 logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') | ||||||
|  |                 break | ||||||
|  |  | ||||||
|             piece = tokenizer.IdToPiece(token_id) |             piece = tokenizer.IdToPiece(token_id) | ||||||
|             text = piece.encode("utf-8") |             text = piece.encode("utf-8") | ||||||
|             score = tokenizer.GetScore(token_id) |             score = tokenizer.GetScore(token_id) | ||||||
| @@ -951,10 +955,6 @@ class TextModel(ModelBase): | |||||||
|             elif tokenizer.IsByte(token_id): |             elif tokenizer.IsByte(token_id): | ||||||
|                 toktype = SentencePieceTokenTypes.BYTE |                 toktype = SentencePieceTokenTypes.BYTE | ||||||
|  |  | ||||||
|             if token_id >= vocab_size: |  | ||||||
|                 logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') |  | ||||||
|                 break |  | ||||||
|  |  | ||||||
|             tokens[token_id] = text |             tokens[token_id] = text | ||||||
|             scores[token_id] = score |             scores[token_id] = score | ||||||
|             toktypes[token_id] = toktype |             toktypes[token_id] = toktype | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Sigbjørn Skjæret
					Sigbjørn Skjæret