/ externals / fmt / support / printable.py
printable.py
  1  #!/usr/bin/env python3
  2  
  3  # This script is based on
  4  # https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py
  5  # distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT.
  6  
  7  # This script uses the following Unicode tables:
  8  # - UnicodeData.txt
  9  
 10  
 11  from collections import namedtuple
 12  import csv
 13  import os
 14  import subprocess
 15  
 16  NUM_CODEPOINTS=0x110000
 17  
 18  def to_ranges(iter):
 19      current = None
 20      for i in iter:
 21          if current is None or i != current[1] or i in (0x10000, 0x20000):
 22              if current is not None:
 23                  yield tuple(current)
 24              current = [i, i + 1]
 25          else:
 26              current[1] += 1
 27      if current is not None:
 28          yield tuple(current)
 29  
 30  def get_escaped(codepoints):
 31      for c in codepoints:
 32          if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '):
 33              yield c.value
 34  
 35  def get_file(f):
 36      try:
 37          return open(os.path.basename(f))
 38      except FileNotFoundError:
 39          subprocess.run(["curl", "-O", f], check=True)
 40          return open(os.path.basename(f))
 41  
 42  Codepoint = namedtuple('Codepoint', 'value class_')
 43  
 44  def get_codepoints(f):
 45      r = csv.reader(f, delimiter=";")
 46      prev_codepoint = 0
 47      class_first = None
 48      for row in r:
 49          codepoint = int(row[0], 16)
 50          name = row[1]
 51          class_ = row[2]
 52  
 53          if class_first is not None:
 54              if not name.endswith("Last>"):
 55                  raise ValueError("Missing Last after First")
 56  
 57          for c in range(prev_codepoint + 1, codepoint):
 58              yield Codepoint(c, class_first)
 59  
 60          class_first = None
 61          if name.endswith("First>"):
 62              class_first = class_
 63  
 64          yield Codepoint(codepoint, class_)
 65          prev_codepoint = codepoint
 66  
 67      if class_first is not None:
 68          raise ValueError("Missing Last after First")
 69  
 70      for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
 71          yield Codepoint(c, None)
 72  
 73  def compress_singletons(singletons):
 74      uppers = [] # (upper, # items in lowers)
 75      lowers = []
 76  
 77      for i in singletons:
 78          upper = i >> 8
 79          lower = i & 0xff
 80          if len(uppers) == 0 or uppers[-1][0] != upper:
 81              uppers.append((upper, 1))
 82          else:
 83              upper, count = uppers[-1]
 84              uppers[-1] = upper, count + 1
 85          lowers.append(lower)
 86  
 87      return uppers, lowers
 88  
 89  def compress_normal(normal):
 90      # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
 91      # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
 92      compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
 93  
 94      prev_start = 0
 95      for start, count in normal:
 96          truelen = start - prev_start
 97          falselen = count
 98          prev_start = start + count
 99  
100          assert truelen < 0x8000 and falselen < 0x8000
101          entry = []
102          if truelen > 0x7f:
103              entry.append(0x80 | (truelen >> 8))
104              entry.append(truelen & 0xff)
105          else:
106              entry.append(truelen & 0x7f)
107          if falselen > 0x7f:
108              entry.append(0x80 | (falselen >> 8))
109              entry.append(falselen & 0xff)
110          else:
111              entry.append(falselen & 0x7f)
112  
113          compressed.append(entry)
114  
115      return compressed
116  
def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print two C++ array definitions for a compressed singleton table.

    The first array, named ``uppersname``, has one ``{upper, count}``
    initializer per entry of ``uppers``. The second, named ``lowersname``,
    lists the values of ``lowers`` as hex bytes, eight per line.
    """
    per_row = 8
    lines = ["  static constexpr singleton {}[] = {{".format(uppersname)]
    lines.extend(
        "    {{{:#04x}, {}}},".format(high, count) for high, count in uppers
    )
    lines.append("  };")
    lines.append("  static constexpr unsigned char {}[] = {{".format(lowersname))
    for offset in range(0, len(lowers), per_row):
        row = lowers[offset:offset + per_row]
        cells = " ".join("{:#04x},".format(byte) for byte in row)
        lines.append("    {}".format(cells))
    lines.append("  };")
    print("\n".join(lines))
126  
def print_normal(normal, normalname):
    """Print a C++ ``unsigned char`` array named ``normalname``.

    Every entry of ``normal`` (a sequence of byte values) becomes one line of
    comma-terminated hex literals.
    """
    lines = ["  static constexpr unsigned char {}[] = {{".format(normalname)]
    for entry in normal:
        cells = " ".join("{:#04x},".format(byte) for byte in entry)
        lines.append("    {}".format(cells))
    lines.append("  };")
    print("\n".join(lines))
132  
def main():
    """Print the generated C++ ``is_printable(uint32_t)`` function to stdout.

    Reads UnicodeData.txt (downloading it if needed), collects the ranges of
    escaped codepoints and sorts each range into one of three buckets:

    - ranges starting at or above 0x20000 become explicit ``if`` range checks;
    - ranges of one or two codepoints below that go into the singleton
      tables of their 0x10000-sized block (``singletons0`` / ``singletons1``);
    - longer ranges go into the run-length tables (``normal0`` / ``normal1``).
    """
    CUTOFF = 0x10000
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    # The codepoint pipeline is lazy, so the file has to stay open for the
    # whole loop; the with-block closes it afterwards.
    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as file:
        for a, b in to_ranges(get_escaped(get_codepoints(file))):
            length = b - a
            if a >= 2 * CUTOFF:
                # ">=" rather than ">": to_ranges can start a range exactly
                # at 0x20000, and such a range must not fall through to the
                # tables below, whose entries only hold 16-bit offsets.
                extra.append((a, length))
            elif length <= 2:
                plane_singletons = singletons1 if a & CUTOFF else singletons0
                plane_singletons.extend(c & ~CUTOFF for c in range(a, b))
            else:
                plane_normal = normal1 if a & CUTOFF else normal0
                plane_normal.append((a & ~CUTOFF, length))

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    print("""\
FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
""")
    print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
    print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
    print_normal(normal0, 'normal0')
    print_normal(normal1, 'normal1')
    print("""\
  auto lower = static_cast<uint16_t>(cp);
  if (cp < 0x10000) {
    return is_printable(lower, singletons0,
                        sizeof(singletons0) / sizeof(*singletons0),
                        singletons0_lower, normal0, sizeof(normal0));
  }
  if (cp < 0x20000) {
    return is_printable(lower, singletons1,
                        sizeof(singletons1) / sizeof(*singletons1),
                        singletons1_lower, normal1, sizeof(normal1));
  }\
""")
    for a, b in extra:
        print("  if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
    print("""\
  return cp < 0x{:x};
}}\
""".format(NUM_CODEPOINTS))
199  
# Generate the tables only when run as a script, not when imported.
if __name__ == "__main__":
    main()