/ libxml2 / genUnicode.py
genUnicode.py
  1  #!/usr/bin/python -u
  2  #
  3  # Original script modified in November 2003 to take advantage of
  4  # the character-validation range routines, and updated to the
  5  # current Unicode information (Version 4.0.1)
  6  #
  7  # NOTE: there is an 'alias' facility for blocks which are not present in
  8  #	the current release, but are needed for ABI compatibility.  This
  9  #	must be accomplished MANUALLY!  Please see the comments below under
 10  #     'blockAliases'
 11  #
 12  import sys
 13  import string
 14  import time
 15  
 16  webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
 17  sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
 18  
 19  #
 20  # blockAliases is a small hack - it is used for mapping block names which
 21  # were were used in the 3.1 release, but are missing or changed in the current
 22  # release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
 23  blockAliases = []
 24  blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
 25  blockAliases.append("Greek:GreekandCoptic")
 26  blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," + 
 27  	"SupplementaryPrivateUseArea-B")
 28  
 29  # minTableSize gives the minimum number of ranges which must be present
 30  # before a range table is produced.  If there are less than this
 31  # number, inline comparisons are generated
 32  minTableSize = 8
 33  
 34  (blockfile, catfile) = string.split(sources)
 35  
 36  
 37  #
 38  # Now process the "blocks" file, reducing it to a dictionary
 39  # indexed by blockname, containing a tuple with the applicable
 40  # block range
 41  #
 42  BlockNames = {}
 43  try:
 44      blocks = open(blockfile, "r")
 45  except:
 46      print "Missing %s, aborting ..." % blockfile
 47      sys.exit(1)
 48  
 49  for line in blocks.readlines():
 50      if line[0] == '#':
 51          continue
 52      line = string.strip(line)
 53      if line == '':
 54          continue
 55      try:
 56          fields = string.split(line, ';')
 57          range = string.strip(fields[0])
 58          (start, end) = string.split(range, "..")
 59          name = string.strip(fields[1])
 60          name = string.replace(name, ' ', '')
 61      except:
 62          print "Failed to process line: %s" % (line)
 63          continue
 64      start = "0x" + start
 65      end = "0x" + end
 66      try:
 67          BlockNames[name].append((start, end))
 68      except:
 69          BlockNames[name] = [(start, end)]
 70  blocks.close()
 71  print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))
 72  
 73  for block in blockAliases:
 74      alias = string.split(block,':')
 75      alist = string.split(alias[1],',')
 76      for comp in alist:
 77          if BlockNames.has_key(comp):
 78              if alias[0] not in BlockNames:
 79                  BlockNames[alias[0]] = []
 80              for r in BlockNames[comp]:
 81                  BlockNames[alias[0]].append(r)
 82          else:
 83              print "Alias %s: %s not in Blocks" % (alias[0], comp)
 84              continue
 85  
 86  #
 87  # Next process the Categories file. This is more complex, since
 88  # the file is in code sequence, and we need to invert it.  We use
 89  # a dictionary with index category-name, with each entry containing
 90  # all the ranges (codepoints) of that category.  Note that category
 91  # names comprise two parts - the general category, and the "subclass"
 92  # within that category.  Therefore, both "general category" (which is
 93  # the first character of the 2-character category-name) and the full
 94  # (2-character) name are entered into this dictionary.
 95  #
 96  try:
 97      data = open(catfile, "r")
 98  except:
 99      print "Missing %s, aborting ..." % catfile
100      sys.exit(1)
101  
102  nbchar = 0;
103  Categories = {}
104  for line in data.readlines():
105      if line[0] == '#':
106          continue
107      line = string.strip(line)
108      if line == '':
109          continue
110      try:
111          fields = string.split(line, ';')
112          point = string.strip(fields[0])
113          value = 0
114          while point != '':
115              value = value * 16
116              if point[0] >= '0' and point[0] <= '9':
117                  value = value + ord(point[0]) - ord('0')
118              elif point[0] >= 'A' and point[0] <= 'F':
119                  value = value + 10 + ord(point[0]) - ord('A')
120              elif point[0] >= 'a' and point[0] <= 'f':
121                  value = value + 10 + ord(point[0]) - ord('a')
122              point = point[1:]
123          name = fields[2]
124      except:
125          print "Failed to process line: %s" % (line)
126          continue
127      
128      nbchar = nbchar + 1
129      # update entry for "full name"
130      try:
131          Categories[name].append(value)
132      except:
133          try:
134              Categories[name] = [value]
135          except:
136              print "Failed to process line: %s" % (line)
137      # update "general category" name
138      try:
139          Categories[name[0]].append(value)
140      except:
141          try:
142              Categories[name[0]] = [value]
143          except:
144              print "Failed to process line: %s" % (line)
145  
146  blocks.close()
147  print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
148  
#
# The data is now all read.  Time to process it into a more useful form.
#
# Replace each category's list of code points by a list of inclusive
# (first, last) tuples, one per run of consecutive values.
for cat, codes in list(Categories.items()):
    first = -1          # -1 means "no run opened yet"
    last = -1
    compact = []
    for code in codes:
        if first == -1:
            # very first value opens the first run
            first = code
        elif code != last + 1:
            # gap found: close the run in progress and open a new one
            compact.append((first, last))
            first = code
        last = code
    # close the final run
    compact.append((first, last))
    Categories[cat] = compact
182  
#
# Make sure all names are in alphabetic order, since the generated
# code does binary searches on the tables.
#
# sorted() accepts any iterable, whereas the in-place sort() only
# exists when keys() returns a real list
bkeys = sorted(BlockNames)

ckeys = sorted(Categories)
192  
#
# Generate the resulting files: the public header and the C module,
# both created relative to the current directory
#
try:
    header = open("include/libxml/xmlunicode.h", "w")
except IOError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except IOError:
    # do not leave the header handle open on the way out
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)
207  
# Timestamp recorded in the banner of both generated files
now = time.time()
date = time.asctime(time.localtime(now))

# Opening part of the public header: banner comment, include guard,
# feature test and the extern "C" opener
header_preamble = """/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

"""

# Opening part of the C module, up to and including the opening line of
# the xmlUnicodeBlocks initializer.  Lines containing tab characters are
# spelled with explicit escapes so the tabs stay visible.
source_preamble = (
"""/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

"""
    "typedef int (xmlIntFunc)(int);\t/* just to keep one's mind untwisted */\n"
"""
typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    xmlUnicodeRange *table;
"""
    "    int\t\t    numentries;\n"
"""} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static xmlUnicodeRange xmlUnicodeBlocks[] = {
"""
)

# Both banners take the same three values, in the same order
provenance = (webpage, date, sources)
header.write(header_preamble % provenance)
output.write(source_preamble % provenance)
279  
# Entries of the xmlUnicodeBlocks table: one {"name", function} pair per
# block, in sorted order.  The function name is the block name with any
# '-' removed.
entries = ['  {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
           for block in bkeys]
output.write(',\n'.join(entries))
output.write('};\n\n')

# Same layout for the categories table
output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
entries = ['  {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys]
output.write(',\n'.join(entries))
output.write('};\n\n')
299  
#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
for name in ckeys:
    ranges = Categories[name]
    if len(ranges) <= minTableSize:
        # small categories get inline comparisons instead
        continue
    shortcount = 0
    longcount = 0
    shortname = "NULL"
    longname = "NULL"
    for (low, high) in ranges:
        if high >= 0x10000:
            # range goes into the "long" array
            if longcount > 0:
                chunk += ", "
            else:
                # first long range: terminate the short array, if any
                if shortcount > 0:
                    output.write(chunk + " };\n")
                chunk = "static const xmlChLRange xml%sL[] = {" % name
                longname = "xml%sL" % name
            longcount += 1
        else:
            # range goes into the "short" array
            if shortcount > 0:
                chunk += ", "
            else:
                chunk = "static const xmlChSRange xml%sS[] = {" % name
                shortname = "xml%sS" % name
            shortcount += 1
        # flush the pending text once it grows past 60 characters
        if len(chunk) > 60:
            output.write(chunk + "\n")
            chunk = "    "
        chunk += "{%s, %s}" % (hex(low), hex(high))
    # close the last array and emit the group tying both arrays together
    trailer = " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n" % (
        name, shortcount, longcount, shortname, longname)
    output.write(chunk + trailer)
335  
336  
# The two name-table descriptors (with their entry counts) followed by the
# binary search routine used by the generated by-name entry points.
# Lines holding tabs or trailing blanks use explicit escapes so those
# characters stay visible.
lookup_template = (
"""static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @name: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
"""
    "\tmid = (low + high) / 2;\n"
    "\tif ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)\n"
    "\t    return (sptr[mid].func);\n"
    "\tif (cmp < 0)\n"
    "\t    high = mid - 1;\n"
    "\telse\n"
    "\t    low = mid + 1;\n"
    "    }\n"
    "    return (NULL);    \n"
    "}\n"
    "\n"
)

# table sizes, in the order the template expects them
table_sizes = (len(BlockNames), len(Categories))
output.write(lookup_template % table_sizes)
373  
# One prototype (header) and one function (C module) per block: the
# function returns whether the code point lies in any range of the block.
for block in bkeys:
    # '-' is dropped from the block name to build the function name
    name = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
                 (block))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
    # one comparison per range, or-ed together
    tests = ["((code >= %s) && (code <= %s))" % (start, end)
             for (start, end) in BlockNames[block]]
    output.write(" ||\n           ".join(tests))
    output.write(");\n}\n\n")

# By-name entry point which looks the block up in the block name table
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
output.write(
"""/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
"""
    "\treturn (-1);\n"
"""    return (func(code));
}

""")
413  
# One prototype (header) and one function (C module) per category.
# Categories with more than minTableSize ranges go through their range
# group and xmlCharInRange, the others get inline comparisons.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
                 (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
            % name)
    else:
        tests = []
        for (begin, end) in ranges:
            if begin != end:
                tests.append("((code >= %s) && (code <= %s))" % (
                         hex(begin), hex(end)))
            else:
                # single code point: a plain equality test is enough
                tests.append("(code == %s)" % (hex(begin)))
        if tests:
            output.write("    return(" + " ||\n           ".join(tests))
    output.write(");\n}\n\n")

# By-name entry point which looks the category up in the category name
# table, followed by the closing lines of the C module
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
category_lookup = (
"""/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
"""
    "\treturn (-1);\n"
"""    return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
"""
)
output.write(category_lookup)
466  
# Closing lines of the header: extern "C" closer, then the feature-test
# and include-guard terminators
header_footer = """
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
"""
header.write(header_footer)

# Both generated files are complete
for stream in (header, output):
    stream.close()