# scripts/gen-unicode-data.py
  1  import regex
  2  import ctypes
  3  import unicodedata
  4  
  5  
  6  class CoodepointFlags (ctypes.Structure):
  7      _fields_ = [  # see definition in unicode.h
  8          ("is_undefined",   ctypes.c_uint16, 1),
  9          ("is_number",      ctypes.c_uint16, 1),  # regex: \p{N}
 10          ("is_letter",      ctypes.c_uint16, 1),  # regex: \p{L}
 11          ("is_separator",   ctypes.c_uint16, 1),  # regex: \p{Z}
 12          ("is_accent_mark", ctypes.c_uint16, 1),  # regex: \p{M}
 13          ("is_punctuation", ctypes.c_uint16, 1),  # regex: \p{P}
 14          ("is_symbol",      ctypes.c_uint16, 1),  # regex: \p{S}
 15          ("is_control",     ctypes.c_uint16, 1),  # regex: \p{C}
 16      ]
 17  
 18  
 19  assert (ctypes.sizeof(CoodepointFlags) == 2)
 20  
 21  
 22  MAX_CODEPOINTS = 0x110000
 23  
 24  regex_number      = regex.compile(r'\p{N}')
 25  regex_letter      = regex.compile(r'\p{L}')
 26  regex_separator   = regex.compile(r'\p{Z}')
 27  regex_accent_mark = regex.compile(r'\p{M}')
 28  regex_punctuation = regex.compile(r'\p{P}')
 29  regex_symbol      = regex.compile(r'\p{S}')
 30  regex_control     = regex.compile(r'\p{C}')
 31  regex_whitespace  = regex.compile(r'\s')
 32  
 33  codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
 34  table_whitespace = []
 35  table_lowercase = []
 36  table_uppercase = []
 37  table_nfd = []
 38  
 39  for codepoint in range(MAX_CODEPOINTS):
 40      # convert codepoint to unicode character
 41      char = chr(codepoint)
 42  
 43      # regex categories
 44      flags = codepoint_flags[codepoint]
 45      flags.is_number      = bool(regex_number.match(char))
 46      flags.is_letter      = bool(regex_letter.match(char))
 47      flags.is_separator   = bool(regex_separator.match(char))
 48      flags.is_accent_mark = bool(regex_accent_mark.match(char))
 49      flags.is_punctuation = bool(regex_punctuation.match(char))
 50      flags.is_symbol      = bool(regex_symbol.match(char))
 51      flags.is_control     = bool(regex_control.match(char))
 52      flags.is_undefined   = bytes(flags)[0] == 0
 53      assert (not flags.is_undefined)
 54  
 55      # whitespaces
 56      if bool(regex_whitespace.match(char)):
 57          table_whitespace.append(codepoint)
 58  
 59      # lowercase conversion
 60      lower = ord(char.lower()[0])
 61      if codepoint != lower:
 62          table_lowercase.append((codepoint, lower))
 63  
 64      # uppercase conversion
 65      upper = ord(char.upper()[0])
 66      if codepoint != upper:
 67          table_uppercase.append((codepoint, upper))
 68  
 69      # NFD normalization
 70      norm = ord(unicodedata.normalize('NFD', char)[0])
 71      if codepoint != norm:
 72          table_nfd.append((codepoint, norm))
 73  
 74  
 75  # group ranges with same flags
 76  ranges_flags = [(0, codepoint_flags[0])]  # start, flags
 77  for codepoint, flags in enumerate(codepoint_flags):
 78      if bytes(flags) != bytes(ranges_flags[-1][1]):
 79          ranges_flags.append((codepoint, flags))
 80  ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
 81  
 82  
 83  # group ranges with same nfd
 84  ranges_nfd = [(0, 0, 0)]  # start, last, nfd
 85  for codepoint, norm in table_nfd:
 86      start = ranges_nfd[-1][0]
 87      if ranges_nfd[-1] != (start, codepoint - 1, norm):
 88          ranges_nfd.append(None)
 89          start = codepoint
 90      ranges_nfd[-1] = (start, codepoint, norm)
 91  
 92  
 93  # Generate 'unicode-data.cpp'
 94  
 95  
 96  def out(line=""):
 97      print(line, end='\n')  # noqa
 98  
 99  
# File header: provenance comment and the includes needed by the tables below.
out("""\
// generated with scripts/gen-unicode-data.py

#include "unicode-data.h"

#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
""")

# Flag ranges: each entry is (first codepoint of the range, 16-bit flags value).
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
for codepoint, flags in ranges_flags:
    # Use a separate name for the integer so the struct loop variable is not rebound.
    flags_value = int.from_bytes(bytes(flags), "little")
    out("{0x%06X, 0x%04X}," % (codepoint, flags_value))
out("};\n")

# Whitespace set: all codepoints on a single comma-separated line.
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
out("};\n")

# Lowercase map: one {codepoint, lowercase} entry per line.
# The loop variable is named `pair` so the builtin `tuple` is not shadowed.
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
for pair in table_lowercase:
    out("{0x%06X, 0x%06X}," % pair)
out("};\n")

# Uppercase map: one {codepoint, uppercase} entry per line.
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
for pair in table_uppercase:
    out("{0x%06X, 0x%06X}," % pair)
out("};\n")

# NFD ranges: one {start, last, nfd} entry per line.
out("const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd")
for triple in ranges_nfd:
    out("{0x%06X, 0x%06X, 0x%06X}," % triple)
out("};\n")