mirror of
https://github.com/darlinghq/darling-libxml2.git
synced 2024-12-04 10:23:55 +00:00
1a99396b18
* check-xml-test-suite.py: removed some annoying warnings * chvalid.def chvalid.c include/libxml/chvalid.h: fixed a bug in the PubidChars definition, regenerated, there is still a bug left somewhere * genChRanges.py: save the header directly in include/libxml/ * configure.in: I generated a 2.6.0beta6 earlier today Daniel
466 lines
12 KiB
Python
Executable File
466 lines
12 KiB
Python
Executable File
#!/usr/bin/python -u
|
|
#
|
|
# Portions of this script have been (shamelessly) stolen from the
|
|
# prior work of Daniel Veillard (genUnicode.py)
|
|
#
|
|
# I, however, take full credit for any bugs, errors or difficulties :-)
|
|
#
|
|
# William Brack
|
|
# October 2003
|
|
#
|
|
|
|
import sys
|
|
import string
|
|
import time
|
|
|
|
#
|
|
# A little routine to assign a 'meaningful' name to a range
|
|
#
|
|
def rangename(intvl):
    """Return a short identifier for an inclusive range.

    intvl is a (start, end) tuple of integers.  The result has the form
    "r<start>x<end>" with both bounds written in lowercase hexadecimal,
    e.g. (0x100, 0x1ff) gives "r100x1ff".
    """
    (start, end) = intvl
    # "%x" is used rather than hex(value)[2:] because hex() yields "-0x.."
    # for negative numbers and a trailing "L" for Python 2 longs, both of
    # which the [2:] slice would turn into a garbled name.
    return "r%xx%x" % (start, end)
|
|
|
|
#
|
|
# A routine to take a list of yes/no (1, 0) values and turn it
|
|
# into a list of ranges. This will later be used to determine whether
|
|
# to generate single-byte lookup tables, or inline comparisons
|
|
#
|
|
def makeRange(lst):
    """Convert a list of 0/1 flags into a list of inclusive index ranges.

    lst is a list whose entries are 1 where a value is allowed and 0 where
    it is not.  The result is a list of (first, last) tuples, one for each
    maximal run of consecutive 1 entries, in ascending order.  An empty
    list, or a list without any 1, gives an empty result.
    """
    ret = []
    size = len(lst)
    pos = 0
    while pos < size:
        # list.index raises ValueError when the value is not present
        try:
            pos = lst.index(1, pos)     # start of the next run of 1s
        except ValueError:
            break                       # no more runs, finished
        try:
            e = lst.index(0, pos)       # first 0 after the run
        except ValueError:
            e = size                    # run extends to the end of the list
        ret.append((pos, e - 1))        # append range tuple to list
        pos = e + 1                     # lst[e] is known to be 0, skip it
    return ret
|
|
|
|
sources = "chvalid.def"         # input filename

# minTableSize gives the minimum number of ranges which must be present
# before a 256-byte lookup table is produced.  If there are fewer than this
# number, a macro with inline comparisons is generated
minTableSize = 6

# dictionary of ranges, key=range, element contains list of funcs using it
Ranges = {}

# dictionary of functions, key=name, element contains char-map and range-list
Functs = {}

# parser state: 0 = outside a function block, 1 = inside a 'name' block
state = 0

# Open the definition file named by 'sources'; only an I/O failure is
# treated as "missing", any other error is allowed to propagate.
try:
    defines = open(sources, "r")
except IOError:
    print("Missing chvalid.def, aborting ...")
    sys.exit(1)
|
|
|
|
#
|
|
# The lines in the .def file have three types:-
|
|
# name: Defines a new function block
|
|
# ur: Defines individual or ranges of unicode values
|
|
# end: Indicates the end of the function block
|
|
#
|
|
# These lines are processed below.
|
|
#
|
|
class ValidationError(Exception):
    """Raised when a line of the definition file is not valid."""


def _parse_value(text):
    """Convert one token to an integer.

    Accepts a hexadecimal value prefixed with '0x', a quoted character
    (the character following the opening quote is used), or a decimal
    value.  Raises ValueError or IndexError for a malformed token.
    """
    if text[0:2] == '0x':
        return int(text[2:], 16)
    if text[0] == "'":
        return ord(text[1])
    return int(text)


for line in defines:
    # ignore blank lines, or lines beginning with '#'
    if line[0] == '#':
        continue
    line = line.strip()
    if line == '':
        continue
    # split line into whitespace-separated fields, then split on type
    try:
        fields = line.split()
        #
        # name line:
        #   validate any previous function block already ended
        #   validate this function not already defined
        #   initialize an entry in the function dictionary
        #       including a mask table with no values yet defined
        #
        if fields[0] == 'name':
            name = fields[1]
            if state != 0:
                print("'name' %s found before previous name "
                      "completed" % (fields[1]))
                continue
            state = 1
            if name in Functs:
                print("name '%s' already present - may give"
                      " wrong results" % (name))
            else:
                # dict entry with two list elements (chdata, rangedata);
                # chdata is a 256-entry mask, initially all zero
                Functs[name] = [[0] * 256, []]
        #
        # end line:
        #   validate there was a preceding function name line
        #   set state to show no current function active
        #
        elif fields[0] == 'end':
            if state == 0:
                print("'end' found outside of function block")
                continue
            state = 0

        #
        # ur line:
        #   validate function has been defined
        #   process remaining fields on the line, which may be either
        #       individual unicode values or ranges of values
        #
        elif fields[0] == 'ur':
            if state != 1:
                raise ValidationError("'ur' found outside of 'name' block")
            for el in fields[1:]:
                pos = el.find('..')
                # pos <= 0 means not a range, so must be individual value
                if pos <= 0:
                    value = _parse_value(el)
                    if value < 0 or value > 0x1fffff:
                        raise ValidationError('Illegal value (%s) in ch for'
                                              ' name %s' % (el, name))
                    # for ur we have only ranges (makes things simpler),
                    # so convert val to range
                    currange = (value, value)
                # pos > 0 means this is a range, so isolate/validate
                # the interval
                else:
                    # split the range into its first-val, last-val
                    (first, last) = el.split("..")
                    # convert values from text into binary
                    start = _parse_value(first)
                    end = _parse_value(last)
                    if start < 0 or end > 0x1fffff or start > end:
                        raise ValidationError("Invalid range '%s'" % el)
                    currange = (start, end)
                # common path - 'currange' has the range, now take care of it
                # We split on single-byte values vs. multibyte
                if currange[1] < 0x100:         # single-byte
                    for ch in range(currange[0], currange[1] + 1):
                        # validate that value not previously defined
                        if Functs[name][0][ch]:
                            msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
                            raise ValidationError(msg)
                        Functs[name][0][ch] = 1
                else:                           # multi-byte
                    if currange in Ranges:
                        Ranges[currange].append(name)
                    else:
                        Ranges[currange] = [name]
                    if currange in Functs[name][1]:
                        raise ValidationError("range already defined in"
                                              " function")
                    else:
                        Functs[name][1].append(currange)

    except Exception:
        # report which line was being processed, then let the error abort
        print("Failed to process line: %s" % (line))
        raise

# the definition file is not needed past this point
defines.close()
|
|
#
|
|
# At this point, the entire definition file has been processed. Now we
|
|
# enter the output phase, where we generate the two files chvalid.c and
|
|
# chvalid.h
|
|
#
|
|
# To do this, we first output the 'static' data (heading, fixed
|
|
# definitions, etc.), then output the 'dynamic' data (the results
|
|
# of the above processing), and finally output closing 'static' data
|
|
# (e.g. the subroutine to process the ranges)
|
|
#
|
|
|
|
#
|
|
# Generate the headings:
|
|
#
|
|
# Open both generated files up front; an I/O failure on either aborts the run.
try:
    header = open("include/libxml/chvalid.h", "w")
except IOError:
    print("Failed to open include/libxml/chvalid.h")
    sys.exit(1)

try:
    output = open("chvalid.c", "w")
except IOError:
    print("Failed to open chvalid.c")
    sys.exit(1)

# local time of this run, recorded in the headings of both generated files
date = time.asctime(time.localtime(time.time()))

# Static heading of the header file: banner comment, include guard,
# the range structures and the prototype of the range checking routine.
header.write(
"""/*
 * chvalid.h: this header exports interfaces for the character
 * range validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack@mmm.com.hk>
 */

#ifndef __XML_CHVALID_H__
#define __XML_CHVALID_H__

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Define our typedefs and structures
 *
 */
typedef struct _xmlChSRange xmlChSRange;
typedef xmlChSRange *xmlChSRangePtr;
struct _xmlChSRange {
    unsigned short low;
    unsigned short high;
};

typedef struct _xmlChLRange xmlChLRange;
typedef xmlChLRange *xmlChLRangePtr;
struct _xmlChLRange {
    unsigned low;
    unsigned high;
};

typedef struct _xmlChRangeGroup xmlChRangeGroup;
typedef xmlChRangeGroup *xmlChRangeGroupPtr;
struct _xmlChRangeGroup {
    int nbShortRange;
    int nbLongRange;
    xmlChSRangePtr shortRange; /* points to an array of ranges */
    xmlChLRangePtr longRange;
};

/* Range checking routine */
int xmlCharInRange(unsigned int val, const xmlChRangeGroupPtr group);

""" % (date, sources))

# Static heading of the C file: banner comment and the include of the header.
output.write(
"""/*
 * chvalid.c: this module implements the character range
 * validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack@mmm.com.hk>
 */

#include <libxml/chvalid.h>

/*
 * The initial tables ({func_name}_tab) are used to validate whether a
 * single-byte character is within the specified group. Each table
 * contains 256 bytes, with each byte representing one of the 256
 * possible characters. If the table byte is set, the character is
 * allowed.
 *
 */
""" % (date, sources))
|
|
|
|
#
|
|
# Now output the generated data.
|
|
# We try to produce the best execution times. Tests have shown that validation
|
|
# with direct table lookup is, when there are a "small" number of valid items,
|
|
# still not as fast as a sequence of inline compares. So, if the single-byte
|
|
# portion of a range has a "small" number of ranges, we output a macro for inline
|
|
# compares, otherwise we output a 256-byte table and a macro to use it.
|
|
#
|
|
|
|
# Sorted list of all defined function names, to put some order to our output.
# sorted() is used because dict.keys() has no sort() method on Python 3.
fkeys = sorted(Functs)

for f in fkeys:

    # First we convert the specified single-byte values into a group of ranges.
    # If the total number of such ranges is less than minTableSize, we generate
    # an inline macro for direct comparisons; otherwise we generate a lookup
    # table.
    chmap = Functs[f][0]
    hasSingleByte = max(chmap) > 0      # only check if at least one entry
    if hasSingleByte:
        rangeTable = makeRange(chmap)
        numRanges = len(rangeTable)
        if numRanges >= minTableSize:   # table is worthwhile
            header.write("extern unsigned char %s_tab[256];\n" % f)
            header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))

            # write the constant data to the code file
            output.write("unsigned char %s_tab[256] = {\n" % f)
            pline = " "
            for n in range(255):
                pline += " 0x%02x," % chmap[n]
                if len(pline) > 72:
                    output.write(pline + "\n")
                    pline = " "
            # the last entry is written separately: no trailing comma
            output.write(pline + " 0x%02x };\n\n" % chmap[255])

        else:                           # inline check is used
            # first another little optimisation - if space is present,
            # put it at the front of the list so it is checked first
            if (0x20, 0x20) in rangeTable:
                rangeTable.remove((0x20, 0x20))
                rangeTable.insert(0, (0x20, 0x20))
            terms = []
            for rg in rangeTable:
                if rg[0] == rg[1]:      # single value - check equal
                    terms.append("((c) == " + hex(rg[0]) + ")")
                else:                   # value range
                    terms.append("((" + hex(rg[0]) + "<= (c)) &&"
                                 " ((c) <= " + hex(rg[1]) + "))")
            # one comparison per continuation line of the macro
            pline = "#define %s_ch(c)\t( " % f
            pline += " || \\\n\t\t\t".join(terms)
            pline += ")\n"
            header.write(pline)

    # The public macro: values below 0x100 use the single-byte check (or 0
    # when no single-byte values exist), larger values use the range group.
    header.write("#define %s(c)\t(((c) < 0x100) ? \\\n\t\t\t\t" % f)
    if hasSingleByte:
        header.write("%s_ch((c)) :" % f)
    else:
        header.write("0 :")

    # if no ranges defined, value invalid if >= 0x100
    if len(Functs[f][1]) == 0:
        header.write(" 0)\n\n")
    else:
        header.write(" \\\n\t\t\t\txmlCharInRange((c), &%sGroup))\n\n" % f)
        header.write("extern xmlChRangeGroup %sGroup;\n" % f)
|
|
|
|
|
|
#
|
|
# Next we do the unicode ranges
|
|
#
|
|
|
|
def _write_range_array(out, decl, ranges):
    """Write one C array of {low, high} initializers to 'out'.

    decl is the opening text of the array definition (up to and including
    the opening brace); ranges is a non-empty list of (low, high) tuples.
    A new output line is started once the pending line exceeds 60 chars.
    """
    pline = decl
    first = True
    for (low, high) in ranges:
        if not first:
            pline += ", "
        first = False
        if len(pline) > 60:
            out.write(pline + "\n")
            pline = " "
        pline += "{0x%x, 0x%x}" % (low, high)
    out.write(pline + "};\n")


for f in fkeys:
    if len(Functs[f][1]) > 0:   # only generate if unicode ranges present
        rangeTable = Functs[f][1]
        rangeTable.sort()       # ascending tuple sequence

        # A range fits the 'short' structure only if its upper bound is
        # below 0x10000.  The two kinds are separated before writing so a
        # short range sorted after a long one cannot land in the long array.
        shortRanges = [rg for rg in rangeTable if rg[1] < 0x10000]
        longRanges = [rg for rg in rangeTable if rg[1] >= 0x10000]
        numShort = len(shortRanges)
        numLong = len(longRanges)

        if numShort > 0:
            _write_range_array(output,
                               "static xmlChSRange %s_srng[] = { " % f,
                               shortRanges)
        if numLong > 0:
            _write_range_array(output,
                               "static xmlChLRange %s_lrng[] = { " % f,
                               longRanges)

        # Both pointer members are always initialized explicitly; leaving
        # the short-range slot empty when only long ranges exist would emit
        # "{0, N, , x_lrng}", which is not valid C.
        pline = "xmlChRangeGroup %sGroup = {%d, %d, " % (f, numShort, numLong)
        if numShort > 0:
            pline += "%s_srng" % f
        else:
            pline += "(xmlChSRangePtr)0"
        if numLong > 0:
            pline += ", %s_lrng" % f
        else:
            pline += ", (xmlChLRangePtr)0"

        output.write(pline + "};\n\n")
|
|
#
|
|
# Run complete - write trailers and close the output files
|
|
#
|
|
|
|
# Close the extern "C" block and the include guard of the header file.
header.write("""
#ifdef __cplusplus
}
#endif
#endif /* __XML_CHVALID_H__ */
""")

header.close()

# Emit the range checking routine: a binary search over the sorted short
# or long range array of the group.  The search starts with
# high = count - 1 so that 'mid' can never index past the last element.
output.write(
"""
int
xmlCharInRange (unsigned int val, xmlChRangeGroupPtr rptr) {
    int low, high, mid;
    xmlChSRangePtr sptr;
    xmlChLRangePtr lptr;
    if (val < 0x10000) { /* is val in 'short' or 'long' array? */
        if (rptr->nbShortRange == 0)
            return 0;
        low = 0;
        high = rptr->nbShortRange - 1;
        sptr = rptr->shortRange;
        while (low <= high) {
            mid = (low + high) / 2;
            if ((unsigned short) val < sptr[mid].low)
                high = mid - 1;
            else if ((unsigned short) val > sptr[mid].high)
                low = mid + 1;
            else
                return 1;
        }
    } else {
        if (rptr->nbLongRange == 0)
            return 0;
        low = 0;
        high = rptr->nbLongRange - 1;
        lptr = rptr->longRange;
        while (low <= high) {
            mid = (low + high) / 2;
            if (val < lptr[mid].low)
                high = mid - 1;
            else if (val > lptr[mid].high)
                low = mid + 1;
            else
                return 1;
        }
    }
    return 0;
}

""")

output.close()
|