mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-29 08:41:22 +00:00 
			
		
		
		
	 b43272afa2
			
		
	
	b43272afa2
	
	
	
		
			
			* Replace CODEPOINT_TYPE_* with codepoint_flags * Update and bugfix brute force random test * Deterministic brute force random test * Unicode normalization NFD * Get rid of BOM
		
			
				
	
	
		
			21 lines
		
	
	
		
			582 B
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			21 lines
		
	
	
		
			582 B
		
	
	
	
		
			C++
		
	
	
	
	
	
| #pragma once
 | |
| 
 | |
#include <cstdint>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
 | |
| 
 | |
// One entry of the NFD (Unicode canonical decomposition) range table.
// Plain aggregate: keep the member order so brace initialisation
// `{first, last, nfd}` in the table definition stays valid.
struct range_nfd {
    uint32_t first;  // NOTE(review): presumably the first code point of the range — confirm against the table definition
    uint32_t last;   // NOTE(review): presumably the last code point of the range (inclusive?) — confirm
    uint32_t nfd;    // NOTE(review): presumably the code point the range decomposes to under NFD — confirm
};
 | |
| 
 | |
// Number of code points in the Unicode code space: 0x110000 == 0x10FFFF + 1,
// i.e. one past the highest valid code point U+10FFFF.
static const uint32_t MAX_CODEPOINTS = 0x110000;
 | |
| 
 | |
| extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
 | |
| extern const std::unordered_set<uint32_t> unicode_set_whitespace;
 | |
| extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
 | |
| extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
 | |
| extern const std::vector<range_nfd> unicode_ranges_nfd;
 |