mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	 b43272afa2
			
		
	
	b43272afa2
	
	
	
		
			
			* Replace CODEPOINT_TYPE_* with codepoint_flags * Update and bugfix brute force random test * Deterministic brute force random test * Unicode normalization NFD * Get rid of BOM
		
			
				
	
	
		
			135 lines
		
	
	
		
			4.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			135 lines
		
	
	
		
			4.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import regex
 | |
| import ctypes
 | |
| import unicodedata
 | |
| 
 | |
| 
 | |
class CoodepointFlags(ctypes.Structure):
    """Category flags for a single codepoint, packed as 1-bit fields.

    Every field is a one-bit slice of a ``c_uint16``; the field order has to
    stay in sync with the matching definition in unicode.h.
    """

    _fields_ = [  # see definition in unicode.h
        (field_name, ctypes.c_uint16, 1)
        for field_name in (
            "is_undefined",
            "is_number",       # regex: \p{N}
            "is_letter",       # regex: \p{L}
            "is_separator",    # regex: \p{Z}
            "is_accent_mark",  # regex: \p{M}
            "is_punctuation",  # regex: \p{P}
            "is_symbol",       # regex: \p{S}
            "is_control",      # regex: \p{C}
        )
    ]
 | |
| 
 | |
| 
 | |
# The flags of one codepoint are emitted below as a single uint16_t, so the
# structure must be exactly two bytes wide.
assert ctypes.sizeof(CoodepointFlags) == 2


# Exclusive upper bound of the codepoints scanned (0x000000..0x10FFFF).
MAX_CODEPOINTS = 0x110000

# One compiled pattern per Unicode general category, plus whitespace.
regex_number = regex.compile(r"\p{N}")
regex_letter = regex.compile(r"\p{L}")
regex_separator = regex.compile(r"\p{Z}")
regex_accent_mark = regex.compile(r"\p{M}")
regex_punctuation = regex.compile(r"\p{P}")
regex_symbol = regex.compile(r"\p{S}")
regex_control = regex.compile(r"\p{C}")
regex_whitespace = regex.compile(r"\s")

# Zero-initialised flag array indexed by codepoint.
codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()

# Tables filled by the per-codepoint scan.
table_whitespace = []  # codepoints matching \s
table_lowercase = []   # (codepoint, lowercase codepoint) pairs
table_uppercase = []   # (codepoint, uppercase codepoint) pairs
table_nfd = []         # (codepoint, first NFD codepoint) pairs
 | |
| 
 | |
# (flag attribute, compiled pattern) pairs, in the order the flags are set.
category_patterns = (
    ("is_number", regex_number),
    ("is_letter", regex_letter),
    ("is_separator", regex_separator),
    ("is_accent_mark", regex_accent_mark),
    ("is_punctuation", regex_punctuation),
    ("is_symbol", regex_symbol),
    ("is_control", regex_control),
)

# Scan every codepoint once, filling the flag array and the lookup tables.
for codepoint in range(MAX_CODEPOINTS):
    # convert codepoint to unicode character
    char = chr(codepoint)

    # regex categories
    flags = codepoint_flags[codepoint]
    for attr, pattern in category_patterns:
        setattr(flags, attr, bool(pattern.match(char)))
    # Raised when the first byte of the structure is still zero, i.e. no
    # category bit was set (assumes the 1-bit fields are packed into that byte).
    flags.is_undefined = bytes(flags)[0] == 0
    assert not flags.is_undefined

    # whitespaces
    if regex_whitespace.match(char):
        table_whitespace.append(codepoint)

    # lowercase / uppercase / NFD: only the first codepoint of each converted
    # string is kept, and only when it differs from the source codepoint.
    for table, converted in (
        (table_lowercase, char.lower()),
        (table_uppercase, char.upper()),
        (table_nfd, unicodedata.normalize('NFD', char)),
    ):
        mapped = ord(converted[0])
        if mapped != codepoint:
            table.append((codepoint, mapped))
 | |
| 
 | |
| 
 | |
# Collapse runs of consecutive codepoints sharing identical flag bytes into
# (start, flags) entries; each range ends where the next one begins.
ranges_flags = [(0, codepoint_flags[0])]  # start, flags
previous_bytes = bytes(codepoint_flags[0])
for codepoint, flags in enumerate(codepoint_flags):
    current_bytes = bytes(flags)
    if current_bytes == previous_bytes:
        continue
    ranges_flags.append((codepoint, flags))
    previous_bytes = current_bytes
# Terminating entry with all flags cleared, marking the end of the last range.
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
 | |
| 
 | |
| 
 | |
# Merge consecutive codepoints that normalize to the same NFD codepoint into
# (start, last, nfd) triples. The list starts with a (0, 0, 0) placeholder.
ranges_nfd = [(0, 0, 0)]  # start, last, nfd
for codepoint, norm in table_nfd:
    start, last, nfd = ranges_nfd[-1]
    if last == codepoint - 1 and nfd == norm:
        # contiguous with the current range and same target: extend it
        ranges_nfd[-1] = (start, codepoint, norm)
    else:
        # gap or different target: open a new single-codepoint range
        ranges_nfd.append((codepoint, codepoint, norm))
 | |
| 
 | |
| 
 | |
| # Generate 'unicode-data.cpp'
 | |
| 
 | |
| 
 | |
def out(line=""):
    """Write ``line`` to standard output followed by a single newline.

    With no argument an empty line is written.
    """
    print(line)  # noqa
 | |
| 
 | |
| 
 | |
# Emit the C++ source: file header first, then one table per section.
# Loop variables get dedicated names so that no builtin (``tuple``) and no
# earlier module-level name (``codepoint``, ``flags``) is rebound here.
out("""\
// generated with scripts/gen-unicode-data.py

#include "unicode-data.h"

#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
""")

# Flag ranges: each entry is the first codepoint of a range and its flag bits
# serialised as a little-endian 16-bit integer.
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
for range_start, range_flags in ranges_flags:
    packed_flags = int.from_bytes(bytes(range_flags), "little")
    out("{0x%06X, 0x%04X}," % (range_start, packed_flags))
out("};\n")

# Whitespace codepoints, all on one line.
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
out("};\n")

# Case-conversion maps: one {from, to} pair per line.
for map_name, case_table in (
    ("unicode_map_lowercase", table_lowercase),
    ("unicode_map_uppercase", table_uppercase),
):
    out("const std::unordered_map<uint32_t, uint32_t> %s = {" % map_name)
    for case_pair in case_table:
        out("{0x%06X, 0x%06X}," % case_pair)
    out("};\n")

# NFD ranges: one {start, last, nfd} triple per line.
out("const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd")
for nfd_range in ranges_nfd:
    out("{0x%06X, 0x%06X, 0x%06X}," % nfd_range)
out("};\n")
 |