#!/usr/bin/python -u
#
# Portions of this script have been (shamelessly) stolen from the
# prior work of Daniel Veillard (genUnicode.py)
#
# I, however, take full credit for any bugs, errors or difficulties :-)
#
# William Brack
# October 2003
#
# The script reads the character-class definitions in chvalid.def and
# generates include/libxml/chvalid.h and chvalid.c from them.  It is
# written in the common subset of Python 2.6+ and Python 3.
#

import sys
import string
import time


#
# Raised when a line of the definition file is syntactically acceptable
# but describes an illegal value, an illegal range or a duplicate.
#
class ValidationError(Exception):
    pass


#
# A little routine to assign a 'meaningful' name to a range
#
def rangename(intvl):
    (start, end) = intvl
    return "r%xx%x" % (start, end)


#
# A routine to take a list of yes/no (1, 0) values and turn it
# into a list of (first, last) ranges of consecutive 1 entries.
# This is later used to determine whether to generate single-byte
# lookup tables, or inline comparisons.
#
def makeRange(lst):
    ret = []
    size = len(lst)
    pos = 0
    while pos < size:
        # index() raises ValueError when the value is not present
        try:
            start = lst.index(1, pos)       # start of next range
        except ValueError:
            break                           # no more ranges
        try:
            end = lst.index(0, start)       # first entry after the range
        except ValueError:
            end = size                      # range runs to end of list
        ret.append((start, end - 1))
        pos = end + 1                       # lst[end] is known to be 0
    return ret


sources = "chvalid.def"         # input filename

# minTableSize gives the minimum number of ranges which must be present
# before a 256-byte lookup table is produced.  If there are less than this
# number, a macro with inline comparisons is generated
minTableSize = 6

# dictionary of ranges, key=range, element contains list of funcs using it
Ranges = {}

# dictionary of functions, key=name, element contains char-map and range-list
Functs = {}

# indentation of continuation lines in the generated C tables
tableIndent = "   "
rangeIndent = "    "


#
# Convert one textual value from the definition file into an integer.
# Accepted forms are hexadecimal (0x41), a quoted character ('A) and
# plain decimal (65).  Anything else raises ValidationError.
#
def parseValue(text):
    try:
        if text[0:2] == '0x':
            return int(text[2:], 16)
        if text[0:1] == "'":
            return ord(text[1])
        return int(text)
    except (ValueError, IndexError):
        raise ValidationError("Malformed value '%s'" % text)


#
# Convert one field of a 'ur' line into a (first, last) tuple.  The field
# is either a single value or an interval written as first..last; a single
# value is returned as a one-element range.  'name' is only used in the
# error message.
#
def parseRange(el, name):
    # a '..' at position 0 cannot separate two values, so only a
    # position > 0 marks a range
    if el.find('..') > 0:
        bounds = el.split('..')
        if len(bounds) != 2:
            raise ValidationError("Invalid range '%s'" % el)
        start = parseValue(bounds[0])
        end = parseValue(bounds[1])
        if start < 0 or end > 0x1fffff or start > end:
            raise ValidationError("Invalid range '%s'" % el)
        return (start, end)
    value = parseValue(el)
    if value < 0 or value > 0x1fffff:
        raise ValidationError('Illegal value (%s) in ch for'
                              ' name %s' % (el, name))
    return (value, value)


#
# Process the lines of the definition file.  The lines have three types:-
#   name:   Defines a new function block
#   ur:     Defines individual or ranges of unicode values
#   end:    Indicates the end of the function block
# Blank lines and lines beginning with '#' are ignored.
#
# 'lines' is any iterable of text lines.  'functs' receives, per function
# name, a two-element list [chdata, rangedata]: chdata is a 256-entry 0/1
# map of the values below 0x100, rangedata the list of (first, last)
# tuples for ranges ending at or above 0x100.  'ranges' receives, per
# such tuple, the list of function names using it.
#
# A line which cannot be processed is reported and the exception is
# re-raised, so the first bad 'ur' line aborts the run.
#
def parseDefinitions(lines, functs, ranges):
    state = 0                   # 1 while inside a name ... end block
    name = None
    for line in lines:
        if line.startswith('#'):
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split()
            #
            # name line:
            #   validate any previous function block already ended
            #   validate this function not already defined
            #   initialize an entry in the function dictionary
            #   including a mask table with no values yet defined
            #
            if fields[0] == 'name':
                if state != 0:
                    print("'name' %s found before previous name "
                          "completed" % (fields[1]))
                    continue
                name = fields[1]
                state = 1
                if name in functs:
                    print("name '%s' already present - may give"
                          " wrong results" % (name))
                else:
                    # dict entry with two list elements (chdata, rangedata)
                    functs[name] = [[0] * 256, []]
            #
            # end line:
            #   validate there was a preceding function name line
            #   set state to show no current function active
            #
            elif fields[0] == 'end':
                if state == 0:
                    print("'end' found outside of function block")
                    continue
                state = 0
            #
            # ur line:
            #   validate function has been defined
            #   process remaining fields on the line, which may be either
            #   individual unicode values or ranges of values
            #
            elif fields[0] == 'ur':
                if state != 1:
                    raise ValidationError("'ur' found outside of 'name'"
                                          " block")
                chdata, rangedata = functs[name]
                for el in fields[1:]:
                    currange = parseRange(el, name)
                    # We split on single-byte values vs. multibyte
                    if currange[1] < 0x100:     # single-byte
                        for ch in range(currange[0], currange[1] + 1):
                            # validate that value not previously defined
                            if chdata[ch]:
                                raise ValidationError(
                                    "Duplicate ch value '%s' for name '%s'"
                                    % (el, name))
                            chdata[ch] = 1
                    else:                       # multi-byte
                        if currange in rangedata:
                            raise ValidationError("range already defined in"
                                                  " function")
                        rangedata.append(currange)
                        ranges.setdefault(currange, []).append(name)
        except Exception:
            print("Failed to process line: %s" % (line))
            raise


#
# Static text written at the start of chvalid.h; the two %s fields are
# the generation date and the name of the definition file.
#
headerPrologue = """/*
 * chvalid.h: this header exports interfaces for the character
 *            range validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack
 */

#ifndef __XML_CHVALID_H__
#define __XML_CHVALID_H__

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Define our typedefs and structures
 *
 */
typedef struct _xmlChSRange xmlChSRange;
typedef xmlChSRange *xmlChSRangePtr;
struct _xmlChSRange {
    unsigned short low;
    unsigned short high;
};

typedef struct _xmlChLRange xmlChLRange;
typedef xmlChLRange *xmlChLRangePtr;
struct _xmlChLRange {
    unsigned low;
    unsigned high;
};

typedef struct _xmlChRangeGroup xmlChRangeGroup;
typedef xmlChRangeGroup *xmlChRangeGroupPtr;
struct _xmlChRangeGroup {
    int nbShortRange;
    int nbLongRange;
    xmlChSRangePtr shortRange;  /* points to an array of ranges */
    xmlChLRangePtr longRange;
};

/* Range checking routine */
int xmlCharInRange(unsigned int val, const xmlChRangeGroupPtr group);

"""

#
# Static text written at the start of chvalid.c; same %s fields as above.
# The generated code needs the declarations from chvalid.h, so the header
# is named explicitly in the #include.
#
codePrologue = """/*
 * chvalid.c: this module implements the character range
 *            validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack
 */

#include <libxml/chvalid.h>

/*
 * The initial tables ({func_name}_tab) are used to validate whether a
 * single-byte character is within the specified group.  Each table
 * contains 256 bytes, with each byte representing one of the 256
 * possible characters.  If the table byte is set, the character is
 * allowed.
 *
 */
"""

# Static text written at the end of chvalid.h
headerTrailer = """
#ifdef __cplusplus
}
#endif
#endif /* __XML_CHVALID_H__ */
"""

#
# Static text written at the end of chvalid.c: the binary search over the
# generated range arrays.  'high' starts at the index of the LAST array
# element (count - 1) so the search never reads past the end of the array.
#
codeTrailer = """
int
xmlCharInRange (unsigned int val, xmlChRangeGroupPtr rptr) {
    int low, high, mid;
    xmlChSRangePtr sptr;
    xmlChLRangePtr lptr;
    if (val < 0x10000) {        /* is val in 'short' or 'long' array? */
        if (rptr->nbShortRange == 0)
            return 0;
        low = 0;
        high = rptr->nbShortRange - 1;
        sptr = rptr->shortRange;
        while (low <= high) {
            mid = (low + high) / 2;
            if ((unsigned short) val < sptr[mid].low)
                high = mid - 1;
            else if ((unsigned short) val > sptr[mid].high)
                low = mid + 1;
            else
                return 1;
        }
    } else {
        if (rptr->nbLongRange == 0)
            return 0;
        low = 0;
        high = rptr->nbLongRange - 1;
        lptr = rptr->longRange;
        while (low <= high) {
            mid = (low + high) / 2;
            if (val < lptr[mid].low)
                high = mid - 1;
            else if (val > lptr[mid].high)
                low = mid + 1;
            else
                return 1;
        }
    }
    return 0;
}

"""


#
# Generate the check for the single-byte portion of function 'f'.
# 'chdata' is the 256-entry 0/1 map of that function.
#
# We try to produce the best execution times.  Tests have shown that
# validation with direct table lookup is, when there are a "small" number
# of valid items, still not as fast as a sequence of inline compares.  So,
# if the single-byte portion has fewer than minTableSize ranges, we output
# a macro for inline compares, otherwise we output a 256-byte table and a
# macro to use it.
#
# Returns a tuple (headerText, codeText); codeText is empty when an inline
# macro is generated.
#
def formatSingleByteCheck(f, chdata):
    rangeTable = makeRange(chdata)
    if len(rangeTable) >= minTableSize:     # table is worthwhile
        headerText = "extern unsigned char %s_tab[256];\n" % f
        headerText += "#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f)
        # the constant data goes to the code file
        pieces = ["unsigned char %s_tab[256] = {\n" % f]
        pline = tableIndent
        for n in range(255):
            pline += " 0x%02x," % chdata[n]
            if len(pline) > 72:
                pieces.append(pline + "\n")
                pline = tableIndent
        pieces.append(pline + " 0x%02x };\n\n" % chdata[255])
        return (headerText, "".join(pieces))

    # inline check is used
    # first another little optimisation - if space is present,
    # put it at the front of the list so it is checked first
    try:
        rangeTable.remove((0x20, 0x20))
        rangeTable.insert(0, (0x20, 0x20))
    except ValueError:
        pass                                # space is not in this group
    tests = []
    for (low, high) in rangeTable:
        if low == high:                     # single value - check equal
            tests.append("((c) == " + hex(low) + ")")
        else:                               # value range
            tests.append("((" + hex(low) + "<= (c)) &&"
                         " ((c) <= " + hex(high) + "))")
    headerText = ("#define %s_ch(c)\t( " % f
                  + " || \\\n\t\t\t".join(tests) + ")\n")
    return (headerText, "")


#
# Generate the main macro of function 'f', which dispatches on whether
# the value is below 0x100.  'hasSingle' tells whether a f_ch macro
# exists, 'hasRanges' whether a range group exists for the function; a
# missing part makes the corresponding branch of the macro yield 0.
#
def formatFunctionMacro(f, hasSingle, hasRanges):
    text = "#define %s(c)\t(((c) < 0x100) ? \\\n\t\t\t\t" % f
    if hasSingle:
        text += "%s_ch((c)) :" % f
    else:
        text += "0 :"
    # if no ranges defined, value invalid if >= 0x100
    if not hasRanges:
        text += " 0)\n\n"
    else:
        text += " \\\n\t\t\t\txmlCharInRange((c), &%sGroup))\n\n" % f
        text += "extern xmlChRangeGroup %sGroup;\n" % f
    return text


#
# Generate the C definition of one static array of {low, high} pairs.
# 'ctype' is the element type, 'arrayName' the name of the array.
#
def formatRangeArray(ctype, arrayName, rangeList):
    pieces = []
    pline = "static %s %s[] = { " % (ctype, arrayName)
    first = True
    for (low, high) in rangeList:
        if not first:
            pline += ", "
        first = False
        if len(pline) > 60:                 # keep generated lines short
            pieces.append(pline + "\n")
            pline = rangeIndent
        pline += "{0x%x, 0x%x}" % (low, high)
    pieces.append(pline + "};\n")
    return "".join(pieces)


#
# Generate the range arrays and the xmlChRangeGroup of function 'f' from
# its list of (first, last) tuples.  Ranges ending below 0x10000 go to the
# 'short' array, all others to the 'long' array; each array is emitted in
# ascending order and only if it has entries.  The group initializer names
# a null short-range pointer explicitly when there are no short ranges, so
# the long-range pointer always lands in the right structure member.
#
def formatRangeGroup(f, rangeTable):
    ordered = sorted(rangeTable)            # ascending tuple sequence
    shorts = [rg for rg in ordered if rg[1] < 0x10000]
    longs = [rg for rg in ordered if rg[1] >= 0x10000]
    pieces = []
    if shorts:
        pieces.append(formatRangeArray("xmlChSRange", "%s_srng" % f, shorts))
    if longs:
        pieces.append(formatRangeArray("xmlChLRange", "%s_lrng" % f, longs))
    pline = "xmlChRangeGroup %sGroup = {%d, %d, " % (f, len(shorts),
                                                     len(longs))
    if shorts:
        pline += "%s_srng" % f
    else:
        pline += "(xmlChSRangePtr)0"
    if longs:
        pline += ", %s_lrng" % f
    pieces.append(pline + "};\n\n")
    return "".join(pieces)


#
# Output phase.  We first output the 'static' data (heading, fixed
# definitions, etc.), then output the 'dynamic' data (the results of
# processing the definition file), and finally output closing 'static'
# data (e.g. the subroutine to process the ranges).
#
# 'header' and 'output' are writable file objects for chvalid.h and
# chvalid.c, 'functs' is the dictionary filled by parseDefinitions and
# 'date' the text placed in the "Generation date" comment.
#
def writeOutput(header, output, functs, date):
    header.write(headerPrologue % (date, sources))
    output.write(codePrologue % (date, sources))

    fkeys = sorted(functs)                  # put some order to our output
    for f in fkeys:
        (chdata, rangedata) = functs[f]
        hasSingle = max(chdata) > 0         # at least one entry below 0x100
        if hasSingle:
            (headerText, codeText) = formatSingleByteCheck(f, chdata)
            header.write(headerText)
            output.write(codeText)
        header.write(formatFunctionMacro(f, hasSingle, len(rangedata) > 0))

    # Next we do the unicode ranges
    for f in fkeys:
        if len(functs[f][1]) > 0:           # only if ranges are present
            output.write(formatRangeGroup(f, functs[f][1]))

    header.write(headerTrailer)
    output.write(codeTrailer)


#
# Script entry point: read the definition file, then write the two
# generated files.  Exits with status 1 if a file cannot be opened.
#
def main():
    try:
        defines = open(sources, "r")
    except IOError:
        print("Missing chvalid.def, aborting ...")
        sys.exit(1)
    try:
        parseDefinitions(defines, Functs, Ranges)
    finally:
        defines.close()

    try:
        header = open("include/libxml/chvalid.h", "w")
    except IOError:
        print("Failed to open include/libxml/chvalid.h")
        sys.exit(1)
    try:
        try:
            output = open("chvalid.c", "w")
        except IOError:
            print("Failed to open chvalid.c")
            sys.exit(1)
        try:
            writeOutput(header, output, Functs, time.asctime())
        finally:
            output.close()
    finally:
        header.close()


if __name__ == "__main__":
    main()