mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-28 08:31:25 +00:00 
			
		
		
		
	 b43272afa2
			
		
	
	b43272afa2
	
	
	
		
			
			* Replace CODEPOINT_TYPE_* with codepoint_flags
			* Update and bugfix brute force random test
			* Deterministic brute force random test
			* Unicode normalization NFD
			* Get rid of BOM
		
			
				
	
	
		
			64 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			64 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
#pragma once

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Bit-packed classification of a single Unicode code point.
//
// The unnamed enum lists one mask per category, and the 1-bit fields below are
// declared in the same order. Converting to/from uint16_t copies the raw object
// representation, so which mask bit ends up in which field is decided by the
// compiler's bit-field layout.
// NOTE(review): the masks line up with the fields only on ABIs that allocate
// bit-fields starting at the least significant bit - confirm for every target.
struct codepoint_flags {
    enum {
        UNDEFINED       = 0x0001,
        NUMBER          = 0x0002,  // regex: \p{N}
        LETTER          = 0x0004,  // regex: \p{L}
        SEPARATOR       = 0x0008,  // regex: \p{Z}
        ACCENT_MARK     = 0x0010,  // regex: \p{M}
        PUNCTUATION     = 0x0020,  // regex: \p{P}
        SYMBOL          = 0x0040,  // regex: \p{S}
        CONTROL         = 0x0080,  // regex: \p{C}
        MASK_CATEGORIES = 0x00FF,  // union of the eight category masks above
    };

    // codepoint type
    uint16_t is_undefined   : 1;
    uint16_t is_number      : 1;  // regex: \p{N}
    uint16_t is_letter      : 1;  // regex: \p{L}
    uint16_t is_separator   : 1;  // regex: \p{Z}
    uint16_t is_accent_mark : 1;  // regex: \p{M}
    uint16_t is_punctuation : 1;  // regex: \p{P}
    uint16_t is_symbol      : 1;  // regex: \p{S}
    uint16_t is_control     : 1;  // regex: \p{C}
    // helper flags
    uint16_t is_whitespace  : 1;  // regex: \s
    uint16_t is_lowercase   : 1;
    uint16_t is_uppercase   : 1;
    uint16_t is_nfd         : 1;

    // decode from uint16
    // Kept implicit (non-explicit) so a raw mask still converts to codepoint_flags.
    inline codepoint_flags(const uint16_t flags=0) {
        // Copy the bytes instead of writing through *reinterpret_cast<uint16_t*>(this):
        // a struct whose first member is a bit-field is not pointer-interconvertible
        // with uint16_t, so the cast form is an aliasing violation (undefined behaviour).
        // The void* cast keeps GCC's -Wclass-memaccess quiet.
        std::memcpy(static_cast<void *>(this), &flags, sizeof(flags));
    }

    // Returns the raw 16-bit representation (all fields, including the helper flags).
    inline uint16_t as_uint() const {
        uint16_t bits = 0;
        std::memcpy(&bits, this, sizeof(bits));
        return bits;
    }

    // Returns only the bits selected by MASK_CATEGORIES.
    inline uint16_t category_flag() const {
        return this->as_uint() & MASK_CATEGORIES;
    }
};

// The byte-wise conversions above rely on the whole struct occupying one uint16_t.
static_assert(sizeof(codepoint_flags) == sizeof(uint16_t), "codepoint_flags must pack into exactly 16 bits");
// Conversions between UTF-8 text and code point sequences.
// Only the declarations are visible in this header; the notes below are inferred
// from the names and signatures - TODO confirm against the definitions.

// Takes one 32-bit code point and returns a std::string (presumably its UTF-8 encoding).
std::string unicode_cpt_to_utf8(uint32_t cp);
// Takes a string and returns a vector of 32-bit values (presumably the decoded code points).
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);

// Takes a code point sequence by const reference and returns a new sequence;
// by its name, the NFD-normalized form of `cpts` - verify against the implementation.
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);

| codepoint_flags unicode_cpt_flags(const uint32_t cp);
 | |
| codepoint_flags unicode_cpt_flags(const std::string & utf8);
 | |
| 
 | |
// Byte <-> string mapping pair. The signatures show a uint8_t going to a
// std::string and back; the exact mapping lives in the definitions
// (presumably the two functions are inverses - TODO confirm).
std::string unicode_byte_to_utf8(uint8_t byte);
uint8_t unicode_utf8_to_byte(const std::string & utf8);

// Takes and returns a single char32_t; by its name, the lowercase counterpart
// of `cp` - verify against the implementation.
char32_t unicode_tolower(char32_t cp);

// Takes a text and a list of regex expression strings and returns a list of
// strings (presumably the pieces `text` is split into - TODO confirm how the
// expressions are combined).
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);