Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)

	tests : use new tokenizer type API (#2692)
* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies
* Adapt convert-new.py (and fix a clang-cl compiler error on Windows)
* Improved tokenizer test. But does it work on macOS?
* Improve token type support
  - Added @klosax code to convert.py
  - Improved token type support in vocabulary
* Exclude platform-dependent tests
* More sentencepiece compatibility by eliminating magic numbers
* Restored accidentally removed comment
* Improve commentary
* Use token type API in test-tokenizer-1.cpp
convert.py (4 changes, Executable file → Normal file)
@@ -741,6 +741,8 @@ class OutputFile:
         tokens = []
         scores = []
         toktypes = []
+        # NOTE: `all_tokens` returns the base vocabulary and added tokens
+        # TODO: add special tokens?
         for text, score, toktype in vocab.all_tokens():
             tokens.append(text)
             scores.append(score)
@@ -751,8 +753,6 @@ class OutputFile:
         self.gguf.add_token_scores(scores)
         self.gguf.add_token_types(toktypes)
 
-        # TODO: added / special tokens
-
     def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
         n_elements = 1
         for dim in tensor.shape:
tests/test-tokenizer-1.cpp

@@ -87,8 +87,8 @@ int main(int argc, char **argv) {
                 return 2;
             }
         } else {
-            // TODO: needs access to token types
-            if (0 <= i && i < 259) {
+            llama_token_type type = llama_token_get_type(ctx, i);
+            if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
                 fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
                     __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
             } else {
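
For reference, a minimal sketch (not part of this commit) of how the new token type API can be used outside the test: it counts the vocabulary entries that llama_token_get_type classifies as unknown, control, or byte tokens, i.e. exactly the entries the updated test no longer expects to round-trip through the BPE tokenizer. Only llama_token_get_type, the LLAMA_TOKEN_TYPE_* constants, and the llama_context / llama_token / llama_token_type types are taken from llama.cpp; the function name count_special_tokens and the n_vocab parameter are placeholders, and the caller is assumed to have already loaded a model and created the context.

#include <cstdio>

#include "llama.h"

// Sketch: tally vocabulary entries whose token type marks them as
// unknown/control/byte, mirroring the check added in test-tokenizer-1.cpp.
// `ctx` must be an initialized llama_context; `n_vocab` its vocabulary size.
static int count_special_tokens(llama_context * ctx, int n_vocab) {
    int n_special = 0;
    for (llama_token i = 0; i < n_vocab; ++i) {
        const llama_token_type type = llama_token_get_type(ctx, i);
        if (type == LLAMA_TOKEN_TYPE_UNKNOWN ||
            type == LLAMA_TOKEN_TYPE_CONTROL ||
            type == LLAMA_TOKEN_TYPE_BYTE) {
            n_special++;
        }
    }
    fprintf(stderr, "%d of %d tokens are unknown/control/byte tokens\n", n_special, n_vocab);
    return n_special;
}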
Author: goerch