/ libxml2 / genChRanges.py
genChRanges.py
  1  #!/usr/bin/python -u
  2  #
  3  # Portions of this script have been (shamelessly) stolen from the
  4  # prior work of Daniel Veillard (genUnicode.py)
  5  #
  6  # I, however, take full credit for any bugs, errors or difficulties :-)
  7  #
  8  # William Brack
  9  # October 2003
 10  #
 11  # 18 October 2003
 12  # Modified to maintain binary compatibility with previous library versions
 13  # by adding a suffix 'Q' ('quick') to the macro generated for the original,
 14  # function, and adding generation of a function (with the original name) which
 15  # instantiates the macro.
 16  #
 17  
 18  import sys
 19  import string
 20  import time
 21  
 22  #
 23  # A routine to take a list of yes/no (1, 0) values and turn it
 24  # into a list of ranges.  This will later be used to determine whether
 25  # to generate single-byte lookup tables, or inline comparisons
 26  #
 27  def makeRange(lst):
 28      ret = []
 29      pos = 0
 30      while pos < len(lst):
 31  	try:		# index generates exception if not present
 32  	    s = lst[pos:].index(1)	# look for start of next range
 33  	except:
 34  	    break			# if no more, finished
 35  	pos += s			# pointer to start of possible range
 36  	try:
 37  	    e = lst[pos:].index(0)	# look for end of range
 38  	    e += pos
 39  	except:				# if no end, set to end of list
 40  	    e = len(lst)
 41  	ret.append((pos, e-1))		# append range tuple to list
 42  	pos = e + 1			# ready to check for next range
 43      return ret
 44  
 45  sources = "chvalid.def"			# input filename
 46  
 47  # minTableSize gives the minimum number of ranges which must be present
 48  # before a 256-byte lookup table is produced.  If there are less than this
 49  # number, a macro with inline comparisons is generated
 50  minTableSize = 6
 51  
 52  # dictionary of functions, key=name, element contains char-map and range-list
 53  Functs = {}
 54  
 55  state = 0
 56  
 57  try:
 58      defines = open("chvalid.def", "r")
 59  except:
 60      print "Missing chvalid.def, aborting ..."
 61      sys.exit(1)
 62  
 63  #
 64  # The lines in the .def file have three types:-
 65  #   name:   Defines a new function block
 66  #   ur:	    Defines individual or ranges of unicode values
 67  #   end:    Indicates the end of the function block
 68  #
 69  # These lines are processed below.
 70  #
 71  for line in defines.readlines():
 72      # ignore blank lines, or lines beginning with '#'
 73      if line[0] == '#':
 74          continue
 75      line = string.strip(line)
 76      if line == '':
 77          continue
 78      # split line into space-separated fields, then split on type
 79      try:
 80          fields = string.split(line, ' ')
 81  	#
 82  	# name line:
 83  	#   validate any previous function block already ended
 84  	#   validate this function not already defined
 85  	#   initialize an entry in the function dicitonary
 86  	#	including a mask table with no values yet defined
 87  	#
 88  	if fields[0] == 'name':
 89  	    name = fields[1]
 90  	    if state != 0:
 91  		print "'name' %s found before previous name" \
 92  		      "completed" % (fields[1])
 93  		continue
 94  	    state = 1
 95  	    if Functs.has_key(name):
 96  		print "name '%s' already present - may give" \
 97  		      " wrong results" % (name)
 98  	    else:
 99  		# dict entry with two list elements (chdata, rangedata)
100  		Functs[name] = [ [], [] ]
101  		for v in range(256):
102  		    Functs[name][0].append(0)
103  	#
104  	# end line:
105  	#   validate there was a preceding function name line
106  	#   set state to show no current function active
107  	#
108  	elif fields[0] == 'end':
109  	    if state == 0:
110  		print "'end' found outside of function block"
111  		continue
112  	    state = 0
113  
114  	#
115  	# ur line:
116  	#   validate function has been defined
117  	#   process remaining fields on the line, which may be either
118  	#	individual unicode values or ranges of values
119  	#
120  	elif fields[0] == 'ur':
121  	    if state != 1:
122  		raise ValidationError, "'ur' found outside of 'name' block"
123  	    for el in fields[1:]:
124  		pos = string.find(el, '..')
125  		# pos <=0 means not a range, so must be individual value
126  		if pos <= 0:
127  		    # cheap handling of hex or decimal values
128  		    if el[0:2] == '0x':
129  		        value = int(el[2:],16)
130  		    elif el[0] == "'":
131  			value = ord(el[1])
132  		    else:
133  			value = int(el)
134  		    if ((value < 0) | (value > 0x1fffff)):
135  			raise ValidationError, 'Illegal value (%s) in ch for'\
136  				' name %s' % (el,name)
137  		    # for ur we have only ranges (makes things simpler),
138  		    # so convert val to range
139  		    currange = (value, value)
140  		# pos > 0 means this is a range, so isolate/validate
141  		# the interval
142  		else:
143  		    # split the range into it's first-val, last-val
144  		    (first, last) = string.split(el, "..")
145  		    # convert values from text into binary
146  		    if first[0:2] == '0x':	
147  			start = int(first[2:],16)
148  		    elif first[0] == "'":
149  			start = ord(first[1])
150  		    else:
151  			start = int(first)
152  		    if last[0:2] == '0x':
153  			end = int(last[2:],16)
154  		    elif last[0] == "'":
155  			end = ord(last[1])
156  		    else:
157  			end = int(last)
158  		    if (start < 0) | (end > 0x1fffff) | (start > end):
159  			raise ValidationError, "Invalid range '%s'" % el
160  		    currange = (start, end)
161  		# common path - 'currange' has the range, now take care of it
162  		# We split on single-byte values vs. multibyte
163  		if currange[1] < 0x100:	# single-byte
164  		    for ch in range(currange[0],currange[1]+1):
165  			# validate that value not previously defined
166  			if Functs[name][0][ch]:
167  			    msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
168  			    raise ValidationError, msg
169  			Functs[name][0][ch] = 1
170  		else:			# multi-byte
171  		    if currange in Functs[name][1]:
172  			raise ValidationError, "range already defined in" \
173  				" function"
174  		    else:
175  			Functs[name][1].append(currange)
176  
177      except:
178  	print "Failed to process line: %s" % (line)
179  	raise
180  #
181  # At this point, the entire definition file has been processed.  Now we
182  # enter the output phase, where we generate the two files chvalid.c and'
183  # chvalid.h
184  #
185  # To do this, we first output the 'static' data (heading, fixed
186  # definitions, etc.), then output the 'dynamic' data (the results
187  # of the above processing), and finally output closing 'static' data
188  # (e.g. the subroutine to process the ranges)
189  #
190  
191  #
192  # Generate the headings:
193  #
194  try:
195      header = open("include/libxml/chvalid.h", "w")
196  except:
197      print "Failed to open include/libxml/chvalid.h"
198      sys.exit(1)
199  
200  try:
201      output = open("chvalid.c", "w")
202  except:
203      print "Failed to open chvalid.c"
204      sys.exit(1)
205  
206  date = time.asctime(time.localtime(time.time()))
207  
208  header.write(
209  """/*
210   * Summary: Unicode character range checking
211   * Description: this module exports interfaces for the character
212   *               range validation APIs
213   *
214   * This file is automatically generated from the cvs source
215   * definition files using the genChRanges.py Python script
216   *
217   * Generation date: %s
218   * Sources: %s
219   * Author: William Brack <wbrack@mmm.com.hk>
220   */
221  
222  #ifndef __XML_CHVALID_H__
223  #define __XML_CHVALID_H__
224  
225  #include <libxml/xmlversion.h>
226  #include <libxml/xmlstring.h>
227  
228  #ifdef __cplusplus
229  extern "C" {
230  #endif
231  
232  /*
233   * Define our typedefs and structures
234   *
235   */
236  typedef struct _xmlChSRange xmlChSRange;
237  typedef xmlChSRange *xmlChSRangePtr;
238  struct _xmlChSRange {
239      unsigned short	low;
240      unsigned short	high;
241  };
242  
243  typedef struct _xmlChLRange xmlChLRange;
244  typedef xmlChLRange *xmlChLRangePtr;
245  struct _xmlChLRange {
246      unsigned int	low;
247      unsigned int	high;
248  };
249  
250  typedef struct _xmlChRangeGroup xmlChRangeGroup;
251  typedef xmlChRangeGroup *xmlChRangeGroupPtr;
252  struct _xmlChRangeGroup {
253      int			nbShortRange;
254      int			nbLongRange;
255      const xmlChSRange	*shortRange;	/* points to an array of ranges */
256      const xmlChLRange	*longRange;
257  };
258  
259  /**
260   * Range checking routine
261   */
262  XMLPUBFUN int XMLCALL
263  		xmlCharInRange(unsigned int val, const xmlChRangeGroup *group);
264  
265  """ % (date, sources));
266  output.write(
267  """/*
268   * chvalid.c:	this module implements the character range
269   *		validation APIs
270   *
271   * This file is automatically generated from the cvs source
272   * definition files using the genChRanges.py Python script
273   *
274   * Generation date: %s
275   * Sources: %s
276   * William Brack <wbrack@mmm.com.hk>
277   */
278  
279  #define IN_LIBXML
280  #include "libxml.h"
281  #include <libxml/chvalid.h>
282  
283  /*
284   * The initial tables ({func_name}_tab) are used to validate whether a
285   * single-byte character is within the specified group.  Each table
286   * contains 256 bytes, with each byte representing one of the 256
287   * possible characters.  If the table byte is set, the character is
288   * allowed.
289   *
290   */
291  """ % (date, sources));
292  
293  #
294  # Now output the generated data.
295  # We try to produce the best execution times.  Tests have shown that validation
296  # with direct table lookup is, when there are a "small" number of valid items,
297  # still not as fast as a sequence of inline compares.  So, if the single-byte
298  # portion of a range has a "small" number of ranges, we output a macro for inline
299  # compares, otherwise we output a 256-byte table and a macro to use it.
300  #
301  
302  fkeys = Functs.keys()	# Dictionary of all defined functions
303  fkeys.sort()		# Put some order to our output
304  
305  for f in fkeys:
306  
307  # First we convert the specified single-byte values into a group of ranges.
308  # If the total number of such ranges is less than minTableSize, we generate
309  # an inline macro for direct comparisons; if greater, we generate a lookup
310  # table.
311      if max(Functs[f][0]) > 0:	# only check if at least one entry
312          rangeTable = makeRange(Functs[f][0])
313  	numRanges = len(rangeTable)
314  	if numRanges >= minTableSize:	# table is worthwhile
315  	    header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
316  	    header.write("""
317  /**
318   * %s_ch:
319   * @c: char to validate
320   *
321   * Automatically generated by genChRanges.py
322   */
323  """ % f)
324  	    header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))
325  
326  	    # write the constant data to the code file
327  	    output.write("const unsigned char %s_tab[256] = {\n" % f)
328  	    pline = "   "
329  	    for n in range(255):
330  		pline += " 0x%02x," % Functs[f][0][n]
331  		if len(pline) > 72:
332  		    output.write(pline + "\n")
333  		    pline = "   "
334  	    output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255])
335  
336  	else:		# inline check is used
337  	    # first another little optimisation - if space is present,
338  	    # put it at the front of the list so it is checked first
339  	    try:
340  		ix = rangeTable.remove((0x20, 0x20))
341  		rangeTable.insert(0, (0x20, 0x20))
342  	    except:
343  		pass
344  	    firstFlag = 1
345  	    
346  	    header.write("""
347  /**
348   * %s_ch:
349   * @c: char to validate
350   *
351   * Automatically generated by genChRanges.py
352   */
353  """ % f)
354  	    # okay, I'm tired of the messy lineup - let's automate it!
355  	    pline = "#define %s_ch(c)" % f
356  	    # 'ntab' is number of tabs needed to position to col. 33 from name end
357  	    ntab = 4 - (len(pline)) / 8
358  	    if ntab < 0:
359  		ntab = 0
360  	    just = ""
361  	    for i in range(ntab):
362  		just += "\t"
363  	    pline = pline + just + "("
364  	    for rg in rangeTable:
365  		if not firstFlag:
366  		    pline += " || \\\n\t\t\t\t "
367  		else:
368  		    firstFlag = 0
369  		if rg[0] == rg[1]:		# single value - check equal
370  		    pline += "((c) == 0x%x)" % rg[0]
371  		else:				# value range
372  		# since we are doing char, also change range ending in 0xff
373  		    if rg[1] != 0xff:
374  		        pline += "((0x%x <= (c)) &&" % rg[0]
375  		        pline += " ((c) <= 0x%x))" % rg[1]
376  		    else:
377  		        pline += " (0x%x <= (c))" % rg[0]
378  	    pline += ")\n"
379  	    header.write(pline)
380  
381      header.write("""
382  /**
383   * %sQ:
384   * @c: char to validate
385   *
386   * Automatically generated by genChRanges.py
387   */
388  """ % f)
389      pline = "#define %sQ(c)" % f
390      ntab = 4 - (len(pline)) / 8
391      if ntab < 0:
392  	ntab = 0
393      just = ""
394      for i in range(ntab):
395  	just += "\t"
396      header.write(pline + just + "(((c) < 0x100) ? \\\n\t\t\t\t ")
397      if max(Functs[f][0]) > 0:
398  	header.write("%s_ch((c)) :" % f)
399      else:
400  	header.write("0 :")
401  
402      # if no ranges defined, value invalid if >= 0x100
403      numRanges = len(Functs[f][1])
404      if numRanges == 0:
405  	header.write(" 0)\n\n")
406      else:
407  	if numRanges >= minTableSize:
408  	    header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n"  % f)
409  	else:		# if < minTableSize, generate inline code
410  	    firstFlag = 1
411  	    for rg in Functs[f][1]:
412  		if not firstFlag:
413  		    pline += " || \\\n\t\t\t\t "
414  		else:
415  		    firstFlag = 0
416  		    pline = "\\\n\t\t\t\t("
417  		if rg[0] == rg[1]:		# single value - check equal
418  		    pline += "((c) == 0x%x)" % rg[0]
419  		else:				# value range
420  		    pline += "((0x%x <= (c)) &&" % rg[0]
421  		    pline += " ((c) <= 0x%x))" % rg[1]
422  	    pline += "))\n\n"
423  	    header.write(pline)
424  
425  
426      if len(Functs[f][1]) > 0:
427  	header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
428  
429  
430  #
431  # Next we do the unicode ranges
432  #
433  
434  for f in fkeys:
435      if len(Functs[f][1]) > 0:	# only generate if unicode ranges present
436  	rangeTable = Functs[f][1]
437  	rangeTable.sort()	# ascending tuple sequence
438  	numShort = 0
439  	numLong  = 0
440  	for rg in rangeTable:
441  	    if rg[1] < 0x10000:	# if short value
442  		if numShort == 0:	# first occurence
443  		    pline = "static const xmlChSRange %s_srng[] = { " % f
444  		else:
445  		    pline += ", "
446  		numShort += 1
447  		if len(pline) > 60:
448  		    output.write(pline + "\n")
449  		    pline = "    "
450  		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
451  	    else:		# if long value
452  		if numLong == 0:	# first occurence
453  		    if numShort > 0:	# if there were shorts, finish them off
454  			output.write(pline + "};\n")
455  		    pline = "static const xmlChLRange %s_lrng[] = { " % f
456  		else:
457  		    pline += ", "
458  		numLong += 1
459  		if len(pline) > 60:
460  		    output.write(pline + "\n")
461  		    pline = "    "
462  		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
463  	output.write(pline + "};\n")	# finish off last group
464  
465  	pline = "const xmlChRangeGroup %sGroup =\n\t{%d, %d, " % (f, numShort, numLong)
466  	if numShort > 0:
467  	    pline += "%s_srng" % f
468  	else:
469  	    pline += "(xmlChSRangePtr)0"
470  	if numLong > 0:
471  	    pline += ", %s_lrng" % f
472  	else:
473  	    pline += ", (xmlChLRangePtr)0"
474  	
475  	output.write(pline + "};\n\n")
476  
477  output.write(
478  """
479  /**
480   * xmlCharInRange:
481   * @val: character to be validated
482   * @rptr: pointer to range to be used to validate
483   *
484   * Does a binary search of the range table to determine if char
485   * is valid
486   *
487   * Returns: true if character valid, false otherwise
488   */
489  int
490  xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
491      int low, high, mid;
492      const xmlChSRange *sptr;
493      const xmlChLRange *lptr;
494  
495      if (rptr == NULL) return(0);
496      if (val < 0x10000) {	/* is val in 'short' or 'long'  array? */
497  	if (rptr->nbShortRange == 0)
498  	    return 0;
499  	low = 0;
500  	high = rptr->nbShortRange - 1;
501  	sptr = rptr->shortRange;
502  	while (low <= high) {
503  	    mid = (low + high) / 2;
504  	    if ((unsigned short) val < sptr[mid].low) {
505  		high = mid - 1;
506  	    } else {
507  	        if ((unsigned short) val > sptr[mid].high) {
508  		    low = mid + 1;
509  		} else {
510  		    return 1;
511  		}
512  	    }
513  	}
514      } else {
515  	if (rptr->nbLongRange == 0) {
516  	    return 0;
517  	}
518  	low = 0;
519  	high = rptr->nbLongRange - 1;
520  	lptr = rptr->longRange;
521  	while (low <= high) {
522  	    mid = (low + high) / 2;
523  	    if (val < lptr[mid].low) {
524  		high = mid - 1;
525  	    } else {
526  	        if (val > lptr[mid].high) {
527  		    low = mid + 1;
528  		} else {
529  		    return 1;
530  		}
531  	    }
532  	}
533      }
534      return 0;
535  }
536  
537  """);
538  
539  #
540  # finally, generate the ABI compatibility functions
541  #
542  for f in fkeys:
543      output.write("""
544  /**
545   * %s:
546   * @ch:  character to validate
547   *
548   * This function is DEPRECATED.
549  """ % f);
550      if max(Functs[f][0]) > 0:
551          output.write(" * Use %s_ch or %sQ instead" % (f, f))
552      else:
553          output.write(" * Use %sQ instead" % f)
554      output.write("""
555   *
556   * Returns true if argument valid, false otherwise
557   */
558  """)
559      output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n" % (f,f))
560      header.write("XMLPUBFUN int XMLCALL\n\t\t%s(unsigned int ch);\n" % f);
561  #
562  # Run complete - write trailers and close the output files
563  #
564  
565  header.write("""
566  #ifdef __cplusplus
567  }
568  #endif
569  #endif /* __XML_CHVALID_H__ */
570  """)
571  
572  header.close()
573  
574  output.write("""#define bottom_chvalid
575  #include "elfgcchack.h"
576  """)
577  output.close()
578