mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	 92139b90af
			
		
	
	92139b90af
	
	
	
		
			
			* tests : add test-tokenizer-0.sh
			* unicode : add all unicode number ranges
			* starcoder : fix pre-tokenizer
			* tests : add test that fails with DeepSeek tokenizers
			* falcon : fix regex
			* unicode : regenerate unicode tables
			* refact : add tokenizer model
			* lint : fix
			* tests : disable failing tests
			  ggml-ci
			* refact : add tests files
			  ggml-ci
			* convert : print -> logging
			  ggml-ci
			* lint : fix
			* unicode : digit -> number
			* phi-3 : update
		
			
				
	
	
		
			30 lines
		
	
	
		
			928 B
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			30 lines
		
	
	
		
			928 B
		
	
	
	
		
			C++
		
	
	
	
	
	
| #pragma once
 | |
| 
 | |
| #include <cstdint>
 | |
| #include <string>
 | |
| #include <vector>
 | |
| 
 | |
| #define CODEPOINT_TYPE_UNIDENTIFIED 0 // codepoint type IDs: eight distinct integer constants, 0 through 7; presumably the values unicode_cpt_type() returns - confirm in the implementation
 | |
| #define CODEPOINT_TYPE_NUMBER       1
 | |
| #define CODEPOINT_TYPE_LETTER       2
 | |
| #define CODEPOINT_TYPE_WHITESPACE   3
 | |
| #define CODEPOINT_TYPE_ACCENT_MARK  4
 | |
| #define CODEPOINT_TYPE_PUNCTUATION  5
 | |
| #define CODEPOINT_TYPE_SYMBOL       6
 | |
| #define CODEPOINT_TYPE_CONTROL      7 // highest ID defined in this header
 | |
| 
 | |
| std::string unicode_cpt_to_utf8(uint32_t cp); // declaration only (no body in this header); takes one uint32_t codepoint value and returns a std::string - presumably its UTF-8 encoding, per the name (TODO confirm)
 | |
| std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8); // declaration only; builds a vector of uint32_t from a string - presumably the decoded codepoints of UTF-8 input; handling of invalid input is not visible here
 | |
| 
 | |
| std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts); // declaration only; takes a codepoint vector by const reference and returns a new vector - presumably Unicode NFD normalization, per the name (verify coverage in the implementation)
 | |
| 
 | |
| int unicode_cpt_type(uint32_t cp); // declaration only; returns an int for a single codepoint value - presumably one of the CODEPOINT_TYPE_* constants (TODO confirm)
 | |
| int unicode_cpt_type(const std::string & utf8); // overload taking a std::string; how an empty or multi-codepoint string is handled is not visible here
 | |
| 
 | |
| std::string unicode_byte_to_utf8(uint8_t byte); // declaration only; maps a uint8_t to a std::string - NOTE(review): the exact byte-to-UTF-8 mapping lives in the implementation, verify there
 | |
| uint8_t unicode_utf8_to_byte(const std::string & utf8); // declaration only; maps a std::string to a uint8_t - presumably the inverse of unicode_byte_to_utf8 (TODO confirm); behavior for unmapped strings is not visible here
 | |
| 
 | |
| char32_t unicode_tolower(char32_t cp); // declaration only; takes and returns char32_t (the other declarations in this header use uint32_t for codepoints) - presumably a lowercase mapping, per the name
 | |
| 
 | |
| std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs); // declaration only; takes a text string plus a list of regex strings and returns a vector of strings - presumably the pieces of text split by those expressions; how multiple expressions combine is not visible here
 |