mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	tests : use new tokenizer type API (#2692)
* Merge tokenizer fixes into the gguf branch. * Add test vocabularies * Adapt convert-new.py (and fix a clang-cl compiler error on windows) * Improved tokenizer test But does it work on MacOS? * Improve token type support - Added @klosax code to convert.py - Improved token type support in vocabulary * Exclude platform dependent tests * More sentencepiece compatibility by eliminating magic numbers * Restored accidentally removed comment * Improve commentary * Use token type API in test-tokenizer-1.cpp
This commit is contained in:
		
							
								
								
									
										4
									
								
								convert.py
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
						
						
									
										4
									
								
								convert.py
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							| @@ -741,6 +741,8 @@ class OutputFile: | ||||
|         tokens = [] | ||||
|         scores = [] | ||||
|         toktypes = [] | ||||
|         # NOTE: `all_tokens` returns the base vocabulary and added tokens | ||||
|         # TODO: add special tokens? | ||||
|         for text, score, toktype in vocab.all_tokens(): | ||||
|             tokens.append(text) | ||||
|             scores.append(score) | ||||
| @@ -751,8 +753,6 @@ class OutputFile: | ||||
|         self.gguf.add_token_scores(scores) | ||||
|         self.gguf.add_token_types(toktypes) | ||||
|  | ||||
|         # TODO: added / special tokens | ||||
|  | ||||
|     def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: | ||||
|         n_elements = 1 | ||||
|         for dim in tensor.shape: | ||||
|   | ||||
| @@ -87,8 +87,8 @@ int main(int argc, char **argv) { | ||||
|                 return 2; | ||||
|             } | ||||
|         } else { | ||||
|             // TODO: needs access to token types | ||||
|             if (0 <= i && i < 259) { | ||||
|             llama_token_type type = llama_token_get_type(ctx, i); | ||||
|             if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) { | ||||
|                 fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n", | ||||
|                     __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str()); | ||||
|             } else { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 goerch
					goerch