darling-libxml2/genUnicode.py
Daniel Veillard 4255d50415 merged the current state of XML Schemas implementation, it is not
* Makefile.am TODO_SCHEMAS configure.in genUnicode.py testAutomata.c
 testRegexp.c testSchemas.c xmlregexp.c xmlschemas.c xmlschemastypes.c
 xmlunicode.c include/libxml/Makefile.am
 include/libxml/schemasInternals.h include/libxml/xmlautomata.h
 include/libxml/xmlregexp.h include/libxml/xmlschemas.h
 include/libxml/xmlschemastypes.h include/libxml/xmlunicode.h
 include/libxml/xmlversion.h.in : merged the current state of
 XML Schemas implementation, it is not configured in by default,
 a specific --schemas configure option has been added.
* test/automata test/regexp test/schemas Makefile.am
  result/automata result/regexp result/schemas:
  merged automata/regexp/schemas regression tests
Daniel
2002-04-16 15:50:10 +00:00

257 lines
6.9 KiB
Python
Executable File

#!/usr/bin/python -u
import sys
import string
import time
# Names of the input description files.  This string is embedded verbatim
# in the "Sources:" line of both generated files.
sources = "Blocks-4.txt UnicodeData-3.1.0.txt"

#
# Parse the block description file.
#
# Each useful line is expected to look like "START..END; Block Name".
# BlockNames maps the block name (with spaces removed) to the tuple
# ("0x" + START, "0x" + END); the bounds are kept as strings because they
# are pasted as-is into the generated C code.
#
try:
    blocks = open("Blocks-4.txt", "r")
except IOError:
    print("Missing Blocks-4.txt, aborting ...")
    sys.exit(1)

BlockNames = {}
try:
    for line in blocks:
        line = line.strip()
        # Skip blank lines and comments.
        if not line or line.startswith('#'):
            continue
        try:
            fields = line.split(';')
            # ValueError if the first field is not exactly "START..END",
            # IndexError if there is no second field.
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue
        BlockNames[name] = ("0x" + start, "0x" + end)
finally:
    # Release the file even if the loop above raises unexpectedly.
    blocks.close()
print("Parsed %d blocks descriptions" % (len(BlockNames)))
#
# Parse the character database.
#
# Each useful line is a ';'-separated record: field 0 is the code point in
# hexadecimal and field 2 is used as the category name.  Categories maps
# both the full category name and its first letter to the list of code
# points seen, in file order.
#
# NOTE(review): UnicodeData.txt is understood to describe some large ranges
# with a "<..., First>" / "<..., Last>" pair of records; this loop records
# only the code points that literally appear in the file - confirm whether
# the interior of such ranges should be expanded.
#
try:
    data = open("UnicodeData-3.1.0.txt", "r")
except IOError:
    print("Missing UnicodeData-3.1.0.txt, aborting ...")
    sys.exit(1)

nbchar = 0
Categories = {}
try:
    for line in data:
        line = line.strip()
        # Skip blank lines and comments.
        if not line or line.startswith('#'):
            continue
        try:
            fields = line.split(';')
            # int() rejects malformed code points instead of silently
            # treating unknown characters as zero digits.
            value = int(fields[0].strip(), 16)
            name = fields[2]
            if not name:
                # An empty category cannot name a generated C function.
                raise ValueError("empty category")
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue
        nbchar = nbchar + 1
        # Record the code point under the full category and under the
        # one-letter class made of its first character.
        Categories.setdefault(name, []).append(value)
        Categories.setdefault(name[0], []).append(value)
finally:
    # Close the data file (the original closed 'blocks' a second time here
    # and never released this one).
    data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
#
# Reduce each category's list of code points into a list of ranges.
#
def _collapse_ranges(values):
    """Collapse a list of code points into (first, last) inclusive ranges.

    Consecutive entries that each equal the previous one plus one are
    merged into a single range; any other step starts a new range.  An
    isolated value is represented as (value, value).  The input is
    consumed in the order given (it is not sorted here), and an empty
    input yields an empty list.
    """
    ranges = []
    if not values:
        return ranges
    start = prev = values[0]
    for val in values[1:]:
        if val != prev + 1:
            # Gap found: close the current run and open a new one.
            ranges.append((start, prev))
            start = val
        prev = val
    # Close the final run.
    ranges.append((start, prev))
    return ranges


# Only existing keys are reassigned, so the dictionary size is unchanged
# while iterating.
for cat in Categories.keys():
    Categories[cat] = _collapse_ranges(Categories[cat])
#
# Generate the resulting files
#
# 'header' receives the C declarations (xmlunicode.h) and 'output' the
# matching C implementation (xmlunicode.c).  Both stay open for the rest
# of the script.
#
try:
    header = open("xmlunicode.h", "w")
except IOError:
    print("Failed to open xmlunicode.h")
    sys.exit(1)
try:
    output = open("xmlunicode.c", "w")
except IOError:
    # Do not leave the already-opened header file dangling.
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)
# Timestamp recorded in the banner of both generated files.
now = time.time()
date = time.asctime(time.localtime(now))

# Banner and opening guards of xmlunicode.h; the two %s slots receive the
# generation date and the list of source files.
header_prolog = """/*
* xmlunicode.h: this header exports interfaces for the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Daniel Veillard <veillard@redhat.com>
*/
#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__
#ifdef __cplusplus
extern "C" {
#endif
"""

# Banner and includes of xmlunicode.c; same two %s slots as above.
source_prolog = """/*
* xmlunicode.c: this module implements the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Daniel Veillard <veillard@redhat.com>
*/
#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_UNICODE_ENABLED
#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
"""

banner_args = (date, sources)
header.write(header_prolog % banner_args)
output.write(source_prolog % banner_args)
#
# Emit one xmlUCSIs<Block>() predicate per block, in sorted block-name
# order, followed by the xmlUCSIsBlock() dispatcher that selects a
# predicate by block name.
#
# sorted() replaces keys() + in-place sort(), which only works while
# keys() returns a list.
keys = sorted(BlockNames.keys())
for block in keys:
    (start, end) = BlockNames[block]
    # '-' is dropped from the block name to build the C function name.
    name = block.replace('-', '')
    header.write("int\txmlUCSIs%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n" %
                 (block))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n" % name)
    # start/end are the "0x..." strings stored when the blocks were parsed.
    output.write(" return((code >= %s) && (code <= %s));\n" % (start, end))
    output.write("}\n\n")

header.write("\nint\txmlUCSIsBlock\t(int code,\n\t\t\t const char *block);\n\n")
output.write("/**\n * xmlUCSIsBlock:\n * @code: UCS code point\n")
output.write(" * @block: UCS block name\n")
# NOTE: the "caracter" spelling is kept so the generated file is unchanged.
output.write(" *\n * Check whether the caracter is part of the UCS Block\n")
output.write(" *\n * Returns 1 if true, 0 if false and -1 on unknown block\n */\n")
output.write("int\nxmlUCSIsBlock(int code, const char *block) {\n")
for block in keys:
    name = block.replace('-', '')
    # The dispatcher compares against the block name with '-' kept, but
    # calls the function whose name has '-' removed.
    output.write(" if (!strcmp(block, \"%s\"))\n return(xmlUCSIs%s(code));\n" %
                 (block, name))
output.write(" return(-1);\n}\n\n")
#
# Emit one xmlUCSIsCat<Name>() predicate per category, in sorted name
# order, followed by the xmlUCSIsCat() dispatcher that selects a predicate
# by category name.  Each Categories value is expected to be a list of
# (begin, end) inclusive ranges at this point.
#
keys = sorted(Categories.keys())
for name in keys:
    ranges = Categories[name]
    header.write("int\txmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n" %
                 (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    # Build one C sub-expression per range, then OR them together.
    tests = []
    for (begin, end) in ranges:
        if begin == end:
            tests.append("(code == %s)" % (hex(begin)))
        else:
            tests.append("((code >= %s) && (code <= %s))" % (
                hex(begin), hex(end)))
    if tests:
        output.write(" return(" + " ||\n ".join(tests))
    else:
        # No range at all: still emit a valid, never-matching predicate.
        output.write(" return(0")
    output.write(");\n}\n\n")

header.write("\nint\txmlUCSIsCat\t(int code,\n\t\t\t const char *cat);\n")
output.write("/**\n * xmlUCSIsCat:\n * @code: UCS code point\n")
output.write(" * @cat: UCS Category name\n")
# NOTE: the "caracter" spelling is kept so the generated file is unchanged.
output.write(" *\n * Check whether the caracter is part of the UCS Category\n")
output.write(" *\n * Returns 1 if true, 0 if false and -1 on unknown category\n */\n")
output.write("int\nxmlUCSIsCat(int code, const char *cat) {\n")
for name in keys:
    output.write(" if (!strcmp(cat, \"%s\"))\n return(xmlUCSIsCat%s(code));\n" %
                 (name, name))
output.write(" return(-1);\n}\n\n")
# Closing text of xmlunicode.h: ends the extern "C" block and the include
# guard.
header_epilog = """
#ifdef __cplusplus
}
#endif
#endif /* __XML_UNICODE_H__ */
"""

# Closing text of xmlunicode.c: ends the LIBXML_UNICODE_ENABLED section.
source_epilog = """
#endif /* LIBXML_UNICODE_ENABLED */
"""

header.write(header_epilog)
output.write(source_epilog)

# Both generated files are complete; release them.
for stream in (header, output):
    stream.close()