# gen-unicode-data.py
"""Generate C++ Unicode lookup tables and print them to stdout.

The script walks every codepoint below MAX_CODEPOINTS, classifies it with
Unicode-property regexes, records whitespace membership, simple lower/upper
case mappings and the first codepoint of the NFD decomposition, then prints
the resulting tables as C++ source (the body of 'unicode-data.cpp').
"""

import ctypes
import unicodedata

import regex


class CoodepointFlags (ctypes.Structure):
    """One bit per Unicode general-category group, packed into a uint16."""
    _fields_ = [  # see definition in unicode.h
        ("is_undefined", ctypes.c_uint16, 1),
        ("is_number", ctypes.c_uint16, 1),       # regex: \p{N}
        ("is_letter", ctypes.c_uint16, 1),       # regex: \p{L}
        ("is_separator", ctypes.c_uint16, 1),    # regex: \p{Z}
        ("is_accent_mark", ctypes.c_uint16, 1),  # regex: \p{M}
        ("is_punctuation", ctypes.c_uint16, 1),  # regex: \p{P}
        ("is_symbol", ctypes.c_uint16, 1),       # regex: \p{S}
        ("is_control", ctypes.c_uint16, 1),      # regex: \p{C}
    ]


# The flags are emitted below as a 4-hex-digit value, so the struct must
# occupy exactly two bytes.
assert (ctypes.sizeof(CoodepointFlags) == 2)


MAX_CODEPOINTS = 0x110000

regex_number = regex.compile(r'\p{N}')
regex_letter = regex.compile(r'\p{L}')
regex_separator = regex.compile(r'\p{Z}')
regex_accent_mark = regex.compile(r'\p{M}')
regex_punctuation = regex.compile(r'\p{P}')
regex_symbol = regex.compile(r'\p{S}')
regex_control = regex.compile(r'\p{C}')
regex_whitespace = regex.compile(r'\s')

codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()  # zero-initialised
table_whitespace = []  # codepoints matching \s
table_lowercase = []   # (codepoint, lowercase codepoint) where they differ
table_uppercase = []   # (codepoint, uppercase codepoint) where they differ
table_nfd = []         # (codepoint, first NFD codepoint) where they differ

for codepoint in range(MAX_CODEPOINTS):
    # convert codepoint to unicode character
    char = chr(codepoint)

    # regex categories
    flags = codepoint_flags[codepoint]
    flags.is_number = bool(regex_number.match(char))
    flags.is_letter = bool(regex_letter.match(char))
    flags.is_separator = bool(regex_separator.match(char))
    flags.is_accent_mark = bool(regex_accent_mark.match(char))
    flags.is_punctuation = bool(regex_punctuation.match(char))
    flags.is_symbol = bool(regex_symbol.match(char))
    flags.is_control = bool(regex_control.match(char))
    # is_undefined is still 0 here, so the first byte is zero only when none
    # of the category bits above was set.
    flags.is_undefined = bytes(flags)[0] == 0
    assert (not flags.is_undefined)

    # whitespaces
    if regex_whitespace.match(char):
        table_whitespace.append(codepoint)

    # lowercase conversion (only the first codepoint of the result is kept)
    lower = ord(char.lower()[0])
    if codepoint != lower:
        table_lowercase.append((codepoint, lower))

    # uppercase conversion (only the first codepoint of the result is kept)
    upper = ord(char.upper()[0])
    if codepoint != upper:
        table_uppercase.append((codepoint, upper))

    # NFD normalization (only the first codepoint of the result is kept)
    norm = ord(unicodedata.normalize('NFD', char)[0])
    if codepoint != norm:
        table_nfd.append((codepoint, norm))


# group ranges with same flags
ranges_flags = [(0, codepoint_flags[0])]  # start, flags
# Raw bytes of the flags that opened the current range; cached so the
# comparison does not re-serialise the same struct on every iteration.
current_range_bytes = bytes(codepoint_flags[0])
for codepoint, flags in enumerate(codepoint_flags):
    flags_bytes = bytes(flags)
    if flags_bytes != current_range_bytes:
        ranges_flags.append((codepoint, flags))
        current_range_bytes = flags_bytes
# terminating entry: one past the last codepoint, with all flags cleared
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))


# group ranges with same nfd
ranges_nfd = [(0, 0, 0)]  # start, last, nfd
for codepoint, norm in table_nfd:
    start, last, last_norm = ranges_nfd[-1]
    if last == codepoint - 1 and last_norm == norm:
        # contiguous codepoint with the same target: extend the open range
        ranges_nfd[-1] = (start, codepoint, norm)
    else:
        # gap or different target: start a new single-codepoint range
        ranges_nfd.append((codepoint, codepoint, norm))


# Generate 'unicode-data.cpp'


def out(line=""):
    """Write one line of generated C++ source to stdout."""
    print(line, end='\n')  # noqa


out("""\
// generated with scripts/gen-unicode-data.py

#include "unicode-data.h"

#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
""")

out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
for range_start, range_flags in ranges_flags:
    # serialise the bitfield struct as a little-endian 16-bit integer
    flags_value = int.from_bytes(bytes(range_flags), "little")
    out("{0x%06X, 0x%04X}," % (range_start, flags_value))
out("};\n")

out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
out("};\n")

out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
for pair in table_lowercase:
    out("{0x%06X, 0x%06X}," % pair)
out("};\n")

out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
for pair in table_uppercase:
    out("{0x%06X, 0x%06X}," % pair)
out("};\n")

out("const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd")
for triple in ranges_nfd:
    out("{0x%06X, 0x%06X, 0x%06X}," % triple)
out("};\n")