mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	 92139b90af
			
		
	
	92139b90af
	
	
	
		
			
			* tests : add test-tokenizer-0.sh * unicode : add all unicode number ranges * starcoder : fix pre-tokenizer * tests : add test that fails with DeepSeek tokenizers * falcon : fix regex * unicode : regenerate unicode tables * refact : add tokenizer model * lint : fix * tests : disable failing tests ggml-ci * refact : add tests files ggml-ci * convert : print -> logging ggml-ci * lint : fix * unicode : digit -> number * phi-3 : update
		
			
				
	
	
		
			17 lines
		
	
	
		
			806 B
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			17 lines
		
	
	
		
			806 B
		
	
	
	
		
			C++
		
	
	
	
	
	
| #pragma once
 | |
| 
 | |
| #include <cstdint>
 | |
| #include <map>
 | |
| #include <utility>
 | |
| #include <vector>
 | |
| 
 | |
| extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number; // Declaration only (extern): the table itself is defined elsewhere. NOTE(review): each pair is presumably a (first, last) code point range for the "number" category - confirm bound inclusivity at the definition.
 | |
| extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter; // NOTE(review): presumably code point ranges classified as letters - confirm at the definition.
 | |
| extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace; // NOTE(review): presumably code point ranges classified as whitespace - confirm at the definition.
 | |
| extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark; // NOTE(review): presumably code point ranges of accent/combining marks - confirm at the definition.
 | |
| extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation; // NOTE(review): presumably code point ranges classified as punctuation - confirm at the definition.
 | |
| extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol; // NOTE(review): presumably code point ranges classified as symbols - confirm at the definition.
 | |
| extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control; // NOTE(review): presumably code point ranges classified as control characters - confirm at the definition.
 | |
| extern const std::multimap<uint32_t, uint32_t>          unicode_map_nfd; // A multimap, so a single key may carry several values. NOTE(review): presumably code point -> NFD decomposition code point(s) - confirm ordering guarantees with the consumer.
 | |
| extern const std::map<char32_t, char32_t>               unicode_map_lowercase; // Keyed and valued by char32_t, unlike the uint32_t tables declared alongside it. NOTE(review): presumably code point -> lowercase code point - confirm direction at the definition.
 |