mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	 92139b90af
			
		
	
	92139b90af
	
	
	
		
			
* tests : add test-tokenizer-0.sh
* unicode : add all unicode number ranges
* starcoder : fix pre-tokenizer
* tests : add test that fails with DeepSeek tokenizers
* falcon : fix regex
* unicode : regenerate unicode tables
* refact : add tokenizer model
* lint : fix
* tests : disable failing tests ggml-ci
* refact : add tests files ggml-ci
* convert : print -> logging ggml-ci
* lint : fix
* unicode : digit -> number
* phi-3 : update
		
			
				
	
	
		
			35 lines
		
	
	
		
			797 B
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			35 lines
		
	
	
		
			797 B
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
| #!/bin/bash
 | |
| #
 | |
| # Usage:
 | |
| #
 | |
| #   test-tokenizer-0.sh <name> <input>
 | |
| #
 | |
| 
 | |
| if [ $# -ne 2 ]; then
 | |
|     printf "Usage: $0 <name> <input>\n"
 | |
|     exit 1
 | |
| fi
 | |
| 
 | |
| name=$1
 | |
| input=$2
 | |
| 
 | |
| make -j tests/test-tokenizer-0
 | |
| 
 | |
| printf "Testing %s on %s ...\n" $name $input
 | |
| 
 | |
| python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
 | |
| cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
 | |
| 
 | |
| ./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
 | |
| cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
 | |
| 
 | |
| diff $input.tok $input.tokcpp > /dev/null 2>&1
 | |
| 
 | |
| if [ $? -eq 0 ]; then
 | |
|     printf "Tokenization is correct!\n"
 | |
| else
 | |
|     diff $input.tok $input.tokcpp | head -n 32
 | |
| 
 | |
|     printf "Tokenization differs!\n"
 | |
| fi
 |