darling-libxml2/genChRanges.py

#!/usr/bin/python -u
#
# Portions of this script have been (shamelessly) stolen from the
# prior work of Daniel Veillard (genUnicode.py)
#
# I, however, take full credit for any bugs, errors or difficulties :-)
#
# William Brack
# October 2003
#

import sys
import string
import time

#
# A little routine to assign a 'meaningful' name to a range
#
def rangename( intvl ):
    (start, end) = intvl
    rname = "r" + hex(start)[2:] + "x" + hex(end)[2:]
    return rname

#
# A routine to take a list of yes/no (1, 0) values and turn it
# into a list of ranges.  This will later be used to determine whether
# to generate single-byte lookup tables, or inline comparisons
#
def makeRange(lst):
    ret = []
    pos = 0
    while pos < len(lst):
	try:		# index generates exception if not present
	    s = lst[pos:].index(1)	# look for start of next range
	except:
	    break			# if no more, finished
	pos += s			# pointer to start of possible range
	try:
	    e = lst[pos:].index(0)	# look for end of range
	    e += pos
	except:				# if no end, set to end of list
	    e = len(lst)
	ret.append((pos, e-1))		# append range tuple to list
	pos = e + 1			# ready to check for next range
    return ret

sources = "chvalid.def"			# input filename

# minTableSize gives the minimum number of ranges which must be present
# before a 256-byte lookup table is produced.  If there are less than this
# number, a macro with inline comparisons is generated
minTableSize = 6

# dictionary of ranges, key=range, element contains list of funcs using it
Ranges = {}

# dictionary of functions, key=name, element contains char-map and range-list
Functs = {}

state = 0

try:
    defines = open("chvalid.def", "r")
except:
    print "Missing chvalid.def, aborting ..."
    sys.exit(1)

#
# The lines in the .def file have three types:-
#   name:   Defines a new function block
#   ur:	    Defines individual or ranges of unicode values
#   end:    Indicates the end of the function block
#
# These lines are processed below.
#
for line in defines.readlines():
    # ignore blank lines, or lines beginning with '#'
    if line[0] == '#':
        continue
    line = string.strip(line)
    if line == '':
        continue
    # split line into space-separated fields, then split on type
    try:
        fields = string.split(line, ' ')
	#
	# name line:
	#   validate any previous function block already ended
	#   validate this function not already defined
	#   initialize an entry in the function dicitonary
	#	including a mask table with no values yet defined
	#
	if fields[0] == 'name':
	    name = fields[1]
	    if state != 0:
		print "'name' %s found before previous name" \
		      "completed" % (fields[1])
		continue
	    state = 1
	    if Functs.has_key(name):
		print "name '%s' already present - may give" \
		      " wrong results" % (name)
	    else:
		# dict entry with two list elements (chdata, rangedata)
		Functs[name] = [ [], [] ]
		for v in range(256):
		    Functs[name][0].append(0)
	#
	# end line:
	#   validate there was a preceding function name line
	#   set state to show no current function active
	#
	elif fields[0] == 'end':
	    if state == 0:
		print "'end' found outside of function block"
		continue
	    state = 0

	#
	# ur line:
	#   validate function has been defined
	#   process remaining fields on the line, which may be either
	#	individual unicode values or ranges of values
	#
	elif fields[0] == 'ur':
	    if state != 1:
		raise ValidationError, "'ur' found outside of 'name' block"
	    for el in fields[1:]:
		pos = string.find(el, '..')
		# pos <=0 means not a range, so must be individual value
		if pos <= 0:
		    # cheap handling of hex or decimal values
		    if el[0:2] == '0x':
		        value = int(el[2:],16)
		    elif el[0] == "'":
			value = ord(el[1])
		    else:
			value = int(el)
		    if ((value < 0) | (value > 0x1fffff)):
			raise ValidationError, 'Illegal value (%s) in ch for'\
				' name %s' % (el,name)
		    # for ur we have only ranges (makes things simpler),
		    # so convert val to range
		    currange = (value, value)
		# pos > 0 means this is a range, so isolate/validate
		# the interval
		else:
		    # split the range into it's first-val, last-val
		    (first, last) = string.split(el, "..")
		    # convert values from text into binary
		    if first[0:2] == '0x':
			start = int(first[2:],16)
		    elif first[0] == "'":
			start = ord(first[1])
		    else:
			start = int(first)
		    if last[0:2] == '0x':
			end = int(last[2:],16)
		    elif last[0] == "'":
			end = ord(last[1])
		    else:
			end = int(last)
		    if (start < 0) | (end > 0x1fffff) | (start > end):
			raise ValidationError, "Invalid range '%s'" % el
		    currange = (start, end)
		# common path - 'currange' has the range, now take care of it
		# We split on single-byte values vs. multibyte
		if currange[1] < 0x100:	# single-byte
		    for ch in range(currange[0],currange[1]+1):
			# validate that value not previously defined
			if Functs[name][0][ch]:
			    msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
			    raise ValidationError, msg
			Functs[name][0][ch] = 1
		else:			# multi-byte
		    if Ranges.has_key(currange):
			Ranges[currange].append(name)
		    else:
			Ranges[currange] = [ name ]
		    if currange in Functs[name][1]:
			raise ValidationError, "range already defined in" \
				" function"
		    else:
			Functs[name][1].append(currange)

    except:
	print "Failed to process line: %s" % (line)
	raise
#
# At this point, the entire definition file has been processed.  Now we
# enter the output phase, where we generate the two files chvalid.c and'
# chvalid.h
#
# To do this, we first output the 'static' data (heading, fixed
# definitions, etc.), then output the 'dynamic' data (the results
# of the above processing), and finally output closing 'static' data
# (e.g. the subroutine to process the ranges)
#

#
# Generate the headings:
#
try:
    header = open("include/libxml/chvalid.h", "w")
except:
    print "Failed to open include/libxml/chvalid.h"
    sys.exit(1)

try:
    output = open("chvalid.c", "w")
except:
    print "Failed to open chvalid.c"
    sys.exit(1)

date = time.asctime(time.localtime(time.time()))

header.write(
"""/*
 * chvalid.h: this header exports interfaces for the character
 *		 range validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack@mmm.com.hk>
 */

#ifndef __XML_CHVALID_H__
#define __XML_CHVALID_H__

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Define our typedefs and structures
 *
 */
typedef struct _xmlChSRange xmlChSRange;
typedef xmlChSRange *xmlChSRangePtr;
struct _xmlChSRange {
    unsigned short	low;
    unsigned short	high;
};

typedef struct _xmlChLRange xmlChLRange;
typedef xmlChLRange *xmlChLRangePtr;
struct _xmlChLRange {
    unsigned		low;
    unsigned		high;
};

typedef struct _xmlChRangeGroup xmlChRangeGroup;
typedef xmlChRangeGroup *xmlChRangeGroupPtr;
struct _xmlChRangeGroup {
    int			nbShortRange;
    int			nbLongRange;
    xmlChSRangePtr	shortRange;	/* points to an array of ranges */
    xmlChLRangePtr	longRange;
};

/* Range checking routine */
int xmlCharInRange(unsigned int val, const xmlChRangeGroupPtr group);

""" % (date, sources));
output.write(
"""/*
 * chvalid.c:	this module implements the character range
 *		validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack@mmm.com.hk>
 */

#include <libxml/chvalid.h>

/*
 * The initial tables ({func_name}_tab) are used to validate whether a
 * single-byte character is within the specified group.  Each table
 * contains 256 bytes, with each byte representing one of the 256
 * possible characters.  If the table byte is set, the character is
 * allowed.
 *
 */
""" % (date, sources));

#
# Now output the generated data.
# We try to produce the best execution times.  Tests have shown that validation
# with direct table lookup is, when there are a "small" number of valid items,
# still not as fast as a sequence of inline compares.  So, if the single-byte
# portion of a range has a "small" number of ranges, we output a macro for inline
# compares, otherwise we output a 256-byte table and a macro to use it.
#

fkeys = Functs.keys()	# Dictionary of all defined functions
fkeys.sort()		# Put some order to our output

for f in fkeys:

# First we convert the specified single-byte values into a group of ranges.
# If the total number of such ranges is less than minTableSize, we generate
# an inline macro for direct comparisons; if greater, we generate a lookup
# table.
    if max(Functs[f][0]) > 0:	# only check if at least one entry
        rangeTable = makeRange(Functs[f][0])
	numRanges = len(rangeTable)
	if numRanges >= minTableSize:	# table is worthwhile
	    header.write("extern unsigned char %s_tab[256];\n" % f)
	    header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))

	    # write the constant data to the code file
	    output.write("unsigned char %s_tab[256] = {\n" % f)
	    pline = "   "
	    for n in range(255):
		pline += " 0x%02x," % Functs[f][0][n]
		if len(pline) > 72:
		    output.write(pline + "\n")
		    pline = "   "
	    output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255])

	else:		# inline check is used
	    # first another little optimisation - if space is present,
	    # put it at the front of the list so it is checked first
	    try:
		ix = rangeTable.remove((0x20, 0x20))
		rangeTable.insert(0, (0x20, 0x20))
	    except:
		pass
	    pline = "#define %s_ch(c)\t( " % f
	    firstFlag = 1
	    for rg in rangeTable:
		if not firstFlag:
		    pline += " || \\\n\t\t\t"
		else:
		    firstFlag = 0
		if rg[0] == rg[1]:		# single value - check equal
		    pline += "((c) == " + hex(rg[0]) + ")"
		else:			# value range
		    pline += "((" + hex(rg[0]) + "<= (c)) &&"
		    pline += " ((c) <= " + hex(rg[1]) + "))"
	    pline += ")\n"
	    header.write(pline)

    header.write("#define %s(c)\t(((c) < 0x100) ? \\\n\t\t\t\t" % f)
    if max(Functs[f][0]) > 0:
	header.write("%s_ch((c)) :" % f)
    else:
	header.write("0 :")

    # if no ranges defined, value invalid if >= 0x100
    if len(Functs[f][1]) == 0:
	header.write(" 0)\n\n")
    else:
	header.write(" \\\n\t\t\t\txmlCharInRange((c), &%sGroup))\n\n"  % f)

    if len(Functs[f][1]) > 0:
	header.write("extern xmlChRangeGroup %sGroup;\n" % f)


#
# Next we do the unicode ranges
#

for f in fkeys:
    if len(Functs[f][1]) > 0:	# only generate if unicode ranges present
	rangeTable = Functs[f][1]
	rangeTable.sort()	# ascending tuple sequence
	numShort = 0
	numLong  = 0
	for rg in rangeTable:
	    if rg[1] < 0x10000:	# if short value
		if numShort == 0:	# first occurence
		    pline = "static xmlChSRange %s_srng[] = { " % f
		else:
		    pline += ", "
		numShort += 1
		if len(pline) > 60:
		    output.write(pline + "\n")
		    pline = "    "
		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
	    else:		# if long value
		if numLong == 0:	# first occurence
		    if numShort > 0:	# if there were shorts, finish them off
			output.write(pline + "};\n")
		    pline = "static xmlChLRange %s_lrng[] = { " % f
		else:
		    pline += ", "
		numLong += 1
		if len(pline) > 60:
		    output.write(pline + "\n")
		    pline = "    "
		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
	output.write(pline + "};\n")	# finish off last group

	pline = "xmlChRangeGroup %sGroup = {%d, %d, " % (f, numShort, numLong)
	if numShort > 0:
	    pline += "%s_srng" % f
	if numLong > 0:
	    pline += ", %s_lrng" % f

	output.write(pline + "};\n\n")
#
# Run complete - write trailers and close the output files
#

header.write("""
#ifdef __cplusplus
}
#endif
#endif /* __XML_CHVALID_H__ */
""");

header.close()

output.write(
"""
int
xmlCharInRange (unsigned int val, xmlChRangeGroupPtr rptr) {
    int low, high, mid;
    xmlChSRangePtr sptr;
    xmlChLRangePtr lptr;
    if (val < 0x10000) {	/* is val in 'short' or 'long'  array? */
	if (rptr->nbShortRange == 0)
	    return 0;
	low = 0;
	high = rptr->nbShortRange;
	sptr = rptr->shortRange;
	while (low <= high) {
	    mid = (low + high) / 2;
	    if ((unsigned short) val < sptr[mid].low)
		high = mid - 1;
	    else if ((unsigned short) val > sptr[mid].high)
		low = mid + 1;
	    else
		return 1;
	}
    } else {
	if (rptr->nbLongRange == 0)
	    return 0;
	low = 0;
	high = rptr->nbLongRange;
	lptr = rptr->longRange;
	while (low <= high) {
	    mid = (low + high) / 2;
	    if (val < lptr[mid].low)
		high = mid - 1;
	    else if (val > lptr[mid].high)
		low = mid + 1;
	    else
		return 1;
	}
    }
    return 0;
}

""");

output.close()