#!/usr/bin/env python

# Copyright (C) 2017 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This tool processes Unicode Character Database files to create the character class
# tables and the name lookup hash tables used to implement RegExp Unicode property
# escapes (\p{...} and \P{...}) as described in the ECMAScript specification.

import sys
import copy
import optparse
import os
import re
from hasher import stringHash

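# Boilerplate written at the top and bottom of the generated header file.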
header = """/*
 * Copyright (C) 2017-2020 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// DO NOT EDIT! - This file was generated by """ + __file__ + """
"""


footer = """
"""

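# UCD data files that must be present in the UCD directory passed on the command line.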
RequiredUCDFiles = ["DerivedBinaryProperties.txt", "DerivedCoreProperties.txt", "DerivedNormalizationProps.txt", "PropList.txt", "PropertyAliases.txt", "PropertyValueAliases.txt", "ScriptExtensions.txt", "UnicodeData.txt", "emoji-data.txt"]
UCDDirectoryPath = None

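# Binary properties for which character classes are generated; lines for any other
# binary property in the parsed files are skipped.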
SupportedBinaryProperties = [
    "Alphabetic", "Any", "ASCII", "ASCII_Hex_Digit", "Assigned", "Bidi_Control", "Bidi_Mirrored", "Case_Ignorable",
    "Cased", "Changes_When_Casefolded", "Changes_When_Casemapped", "Changes_When_Lowercased", "Changes_When_NFKC_Casefolded",
    "Changes_When_Titlecased", "Changes_When_Uppercased", "Dash", "Default_Ignorable_Code_Point", "Deprecated",
    "Diacritic", "Emoji", "Emoji_Component", "Emoji_Modifier_Base", "Emoji_Modifier", "Emoji_Presentation",
    "Extended_Pictographic", "Extender", "Grapheme_Base", "Grapheme_Extend", "Hex_Digit", "ID_Continue", "ID_Start",
    "Ideographic", "IDS_Binary_Operator", "IDS_Trinary_Operator", "Join_Control", "Logical_Order_Exception", "Lowercase",
    "Math", "Noncharacter_Code_Point", "Pattern_Syntax", "Pattern_White_Space", "Quotation_Mark", "Radical",
    "Regional_Indicator", "Sentence_Terminal", "Soft_Dotted", "Terminal_Punctuation", "Unified_Ideograph", "Uppercase",
    "Variation_Selector", "White_Space", "XID_Continue", "XID_Start"]

lastASCIICodePoint = 0x7f
firstUnicodeCodePoint = 0x80
MaxUnicode = 0x10ffff
MaxBMP = 0xffff
commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)
aliases = None


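# Open a file (as UTF-8 text under Python 3), or exit with a diagnostic if it cannot be opened.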
def openOrExit(path, mode):
    try:
        if sys.version_info.major >= 3:
            return open(path, mode, encoding="UTF-8")
        else:
            return open(path, mode)
    except IOError as e:
        print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror))
        exit(1)


def openUCDFileOrExit(path):
    if not UCDDirectoryPath:
        exit(1)

    return openOrExit(os.path.join(UCDDirectoryPath, path), 'r')


def verifyUCDFilesExist():
    if not UCDDirectoryPath:
        exit(1)

    missingFileCount = 0
    for file in RequiredUCDFiles:
        fullPath = os.path.join(UCDDirectoryPath, file)
        if not os.path.exists(fullPath):
            print("Couldn't find UCD file {0} at {1}".format(file, fullPath))
            missingFileCount = missingFileCount + 1
    if missingFileCount:
        exit(1)


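# Round size up to the next power of two; used to size the generated lookup hash tables.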
def ceilingToPowerOf2(size):
    powerOf2 = 1
    while size > powerOf2:
        powerOf2 = powerOf2 << 1

    return powerOf2


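# Maps full property names, General_Category values and Script values to their
# abbreviated aliases (and back), as read from PropertyAliases.txt and
# PropertyValueAliases.txt.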
class Aliases:
    def __init__(self):
        self.globalNameToAliases = {}
        self.generalCategoryToAliases = {}
        self.aliasToGeneralCategory = {}
        self.scriptToAliases = {}
        self.aliasToScript = {}

    def parsePropertyAliasesFile(self, file):
        for line in file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            aliases = [fields[0].strip()]
            fullName = fields[1].strip()
            for otherAlias in fields[2:]:
                aliases.append(otherAlias.strip())

            if fullName in self.globalNameToAliases:
                print("Error, already an alias for {}".format(fullName))
            else:
                self.globalNameToAliases[fullName] = aliases

    def parsePropertyValueAliasesFile(self, file):
        for line in file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            propertyType = fields[0].strip()

            if propertyType == "gc":
                mapToModify = self.generalCategoryToAliases
                reverseMapToModify = self.aliasToGeneralCategory
            elif propertyType == "sc":
                mapToModify = self.scriptToAliases
                reverseMapToModify = self.aliasToScript
            else:
                continue

            primaryAlias = fields[1].strip()
            fullName = fields[2].strip()
            aliases = [primaryAlias]
            for otherAlias in fields[3:]:
                aliases.append(otherAlias.strip())

            if fullName in mapToModify:
                print("Error, already an {} alias for {}".format(propertyType, fullName))
            else:
                mapToModify[fullName] = aliases
                if reverseMapToModify != None:
                    reverseMapToModify[primaryAlias] = fullName

    def globalAliasesFor(self, name):
        if name not in self.globalNameToAliases:
            return []
        return self.globalNameToAliases[name]

    def generalCategoryAliasesFor(self, name):
        if name not in self.generalCategoryToAliases:
            return ""
        return self.generalCategoryToAliases[name]

    def generalCategoryForAlias(self, name):
        if name not in self.aliasToGeneralCategory:
            return ""
        return self.aliasToGeneralCategory[name]

    def scriptAliasesFor(self, name):
        if name not in self.scriptToAliases:
            return ""
        return self.scriptToAliases[name]

    def scriptNameForAlias(self, name):
        if name not in self.aliasToScript:
            return ""
        return self.aliasToScript[name]


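# Collects the code points matching one property as sorted singleton matches and
# ranges, kept separately for ASCII (<= 0x7f) and non-ASCII code points, and knows
# how to emit itself as a createCharacterClassN() factory function.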
class PropertyData:
    allPropertyData = []

    def __init__(self, name):
        self.name = name
        self.aliases = []
        self.index = len(PropertyData.allPropertyData)
        self.hasBMPCharacters = False
        self.hasNonBMPCharacters = False
        self.matches = []
        self.ranges = []
        self.unicodeMatches = []
        self.unicodeRanges = []
        self.codePointCount = 0
        PropertyData.allPropertyData.append(self)

    def setAliases(self, aliases):
        self.aliases = aliases

    def makeCopy(self):
        result = copy.deepcopy(self)
        result.index = len(PropertyData.allPropertyData)
        PropertyData.allPropertyData.append(result)
        return result

    def getIndex(self):
        return self.index

    def getCreateFuncName(self):
        return "createCharacterClass{}".format(self.index)

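    # Add a single code point, coalescing it with the previous match or range when
    # adjacent. Code points that arrive out of order are routed to the unordered path.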
    def addMatch(self, codePoint):
        if codePoint <= MaxBMP:
            self.hasBMPCharacters = True
        else:
            self.hasNonBMPCharacters = True
        if codePoint <= lastASCIICodePoint:
            if (len(self.matches) and self.matches[-1] > codePoint) or (len(self.ranges) and self.ranges[-1][1] > codePoint):
                self.addMatchUnordered(codePoint)
                return

            self.codePointCount = self.codePointCount + 1
            if len(self.matches) and self.matches[-1] == (codePoint - 1):
                lowCodePoint = self.matches.pop()
                self.ranges.append((lowCodePoint, codePoint))
            elif len(self.ranges) and self.ranges[-1][1] == (codePoint - 1):
                priorRange = self.ranges.pop()
                self.ranges.append((priorRange[0], codePoint))
            else:
                self.matches.append(codePoint)
        else:
            if (len(self.unicodeMatches) and self.unicodeMatches[-1] > codePoint) or (len(self.unicodeRanges) and self.unicodeRanges[-1][1] > codePoint):
                self.addMatchUnordered(codePoint)
                return

            self.codePointCount = self.codePointCount + 1
            if len(self.unicodeMatches) and self.unicodeMatches[-1] == (codePoint - 1):
                lowCodePoint = self.unicodeMatches.pop()
                self.unicodeRanges.append((lowCodePoint, codePoint))
            elif len(self.unicodeRanges) and self.unicodeRanges[-1][1] == (codePoint - 1):
                priorRange = self.unicodeRanges.pop()
                self.unicodeRanges.append((priorRange[0], codePoint))
            else:
                self.unicodeMatches.append(codePoint)

    def addRange(self, lowCodePoint, highCodePoint):
        if lowCodePoint <= MaxBMP:
            self.hasBMPCharacters = True
        if highCodePoint > MaxBMP:
            self.hasNonBMPCharacters = True
        if highCodePoint <= lastASCIICodePoint:
            if (len(self.matches) and self.matches[-1] > lowCodePoint) or (len(self.ranges) and self.ranges[-1][1] > lowCodePoint):
                self.addRangeUnordered(lowCodePoint, highCodePoint)
                return

            self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
            if len(self.matches) and self.matches[-1] == (lowCodePoint - 1):
                lowCodePoint = self.matches.pop()
            elif len(self.ranges) and self.ranges[-1][1] == (lowCodePoint - 1):
                priorRange = self.ranges.pop()
                lowCodePoint = priorRange[0]
            self.ranges.append((lowCodePoint, highCodePoint))
        elif lowCodePoint <= lastASCIICodePoint:
            if lowCodePoint == lastASCIICodePoint:
                self.addMatch(lowCodePoint)
            else:
                self.addRange(lowCodePoint, lastASCIICodePoint)
            if highCodePoint == firstUnicodeCodePoint:
                self.addMatch(highCodePoint)
            else:
                self.addRange(firstUnicodeCodePoint, highCodePoint)
        else:
            if (len(self.unicodeMatches) and self.unicodeMatches[-1] > lowCodePoint) or (len(self.unicodeRanges) and self.unicodeRanges[-1][1] > lowCodePoint):
                self.addRangeUnordered(lowCodePoint, highCodePoint)
                return

            self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
            if len(self.unicodeMatches) and self.unicodeMatches[-1] == (lowCodePoint - 1):
                lowCodePoint = self.unicodeMatches.pop()
                self.unicodeRanges.append((lowCodePoint, highCodePoint))
            elif len(self.unicodeRanges) and self.unicodeRanges[-1][1] == (lowCodePoint - 1):
                priorRange = self.unicodeRanges.pop()
                self.unicodeRanges.append((priorRange[0], highCodePoint))
            else:
                self.unicodeRanges.append((lowCodePoint, highCodePoint))

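    # Insert an out-of-order code point into the given sorted matches/ranges lists,
    # merging it with adjacent or overlapping entries and keeping codePointCount consistent.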
    def addMatchUnorderedForMatchesAndRanges(self, codePoint, matches, ranges):
        if codePoint in matches:
            return
        insertLocation = None
        lowCodePoint = None
        highCodePoint = None
        for idx in range(len(matches)):
            match = matches[idx]
            if codePoint == match + 1:
                lowCodePoint = match
                if idx < (len(matches) - 1) and codePoint == matches[idx + 1] - 1:
                    highCodePoint = matches[idx + 1]
                    del matches[idx + 1]
                    self.codePointCount = self.codePointCount - 1
                else:
                    highCodePoint = codePoint
                del matches[idx]
                self.codePointCount = self.codePointCount - 1
                break
            elif codePoint == match - 1:
                lowCodePoint = codePoint
                highCodePoint = match
                del matches[idx]
                self.codePointCount = self.codePointCount - 1
                break
            elif codePoint < match:
                insertLocation = idx
                break

        if insertLocation is None:
            insertLocation = len(matches)
        if lowCodePoint is None:
            lowCodePoint = codePoint
            highCodePoint = codePoint

        for idx in range(len(ranges)):
            cur_range = ranges[idx]
            if lowCodePoint >= cur_range[0] and highCodePoint <= cur_range[1]:
                return
            if lowCodePoint <= (cur_range[1] + 1) and highCodePoint >= (cur_range[0] - 1):
                while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1):
                    cur_range = ranges[idx]
                    lowCodePoint = min(lowCodePoint, cur_range[0])
                    highCodePoint = max(highCodePoint, cur_range[1])
                    del ranges[idx]
                    self.codePointCount = self.codePointCount - (cur_range[1] - cur_range[0]) - 1

                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return
            elif highCodePoint < cur_range[0]:
                if lowCodePoint != highCodePoint:
                    ranges.insert(idx, (lowCodePoint, highCodePoint))
                    self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                    return
                break

        if lowCodePoint != highCodePoint:
            ranges.append((lowCodePoint, highCodePoint))
            self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
            return

        if insertLocation is not None:
            matches.insert(insertLocation, codePoint)
            self.codePointCount = self.codePointCount + 1

    def addRangeUnorderedForMatchesAndRanges(self, lowCodePoint, highCodePoint, matches, ranges):
        if len(matches) and highCodePoint >= matches[0] and lowCodePoint <= matches[-1]:
            for idx in range(len(matches)):
                match = matches[idx]
                if lowCodePoint <= match and highCodePoint >= match:
                    while idx < len(matches) and highCodePoint >= matches[idx]:
                        del matches[idx]
                        self.codePointCount = self.codePointCount - 1
                    if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1:
                        highCodePoint = matches[idx + 1]
                        del matches[idx + 1]
                        self.codePointCount = self.codePointCount - 1
                    break
                elif lowCodePoint == match + 1:
                    lowCodePoint = match
                    while idx < len(matches) and highCodePoint >= matches[idx]:
                        del matches[idx]
                        self.codePointCount = self.codePointCount - 1

                    if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1:
                        highCodePoint = matches[idx + 1]
                        del matches[idx + 1]
                        self.codePointCount = self.codePointCount - 1
                    break
                elif highCodePoint == match - 1:
                    highCodePoint = match
                    del matches[idx]
                    self.codePointCount = self.codePointCount - 1
                    break
                elif highCodePoint < match:
                    break

        for idx in range(len(ranges)):
            cur_range = ranges[idx]
            if lowCodePoint >= cur_range[0] and highCodePoint <= cur_range[1]:
                return
            if lowCodePoint <= (cur_range[1] + 1) and highCodePoint >= (cur_range[0] - 1):
                while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1):
                    cur_range = ranges[idx]
                    lowCodePoint = min(lowCodePoint, cur_range[0])
                    highCodePoint = max(highCodePoint, cur_range[1])
                    del ranges[idx]
                    self.codePointCount = self.codePointCount - (cur_range[1] - cur_range[0]) - 1

                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return
            elif highCodePoint < cur_range[0]:
                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return

        ranges.append((lowCodePoint, highCodePoint))
        self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1

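    # Route out-of-order insertions to the ASCII or non-ASCII lists, splitting ranges
    # that straddle the 0x7f/0x80 boundary.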
    def addMatchUnordered(self, codePoint):
        if codePoint <= lastASCIICodePoint:
            self.addMatchUnorderedForMatchesAndRanges(codePoint, self.matches, self.ranges)
        else:
            self.addMatchUnorderedForMatchesAndRanges(codePoint, self.unicodeMatches, self.unicodeRanges)

    def addRangeUnordered(self, lowCodePoint, highCodePoint):
        if highCodePoint <= lastASCIICodePoint:
            self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.matches, self.ranges)
        elif lowCodePoint >= firstUnicodeCodePoint:
            self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)
        else:
            if lowCodePoint == lastASCIICodePoint:
                self.addMatchUnorderedForMatchesAndRanges(lowCodePoint, self.matches, self.ranges)
            else:
                self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, lastASCIICodePoint, self.matches, self.ranges)
            if highCodePoint == firstUnicodeCodePoint:
                self.addMatchUnorderedForMatchesAndRanges(highCodePoint, self.unicodeMatches, self.unicodeRanges)
            else:
                self.addRangeUnorderedForMatchesAndRanges(firstUnicodeCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)

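    # Remove a single code point from whichever range contains it, splitting the range
    # in two and turning any single-code-point leftover into a plain match.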
    def removeMatchFromRanges(self, codePoint, ranges):
        for idx in range(len(ranges)):
            cur_range = ranges[idx]
            if cur_range[0] <= codePoint and codePoint <= cur_range[1]:
                ranges.pop(idx)
                if cur_range[0] < codePoint and codePoint < cur_range[1]:
                    lowRange = (cur_range[0], codePoint - 1)
                    highRange = (codePoint + 1, cur_range[1])
                    # Since list.insert inserts before the index given, handle the high range first
                    if highRange[0] == highRange[1]:
                        self.addMatchUnordered(highRange[0])
                    else:
                        ranges.insert(idx, highRange)
                    if lowRange[0] == lowRange[1]:
                        self.addMatchUnordered(lowRange[0])
                    else:
                        ranges.insert(idx, lowRange)
                else:
                    if cur_range[0] == codePoint:
                        cur_range = (codePoint + 1, cur_range[1])
                    else:
                        cur_range = (cur_range[0], codePoint - 1)
                    if cur_range[0] == cur_range[1]:
                        self.addMatchUnordered(cur_range[0])
                    else:
                        ranges.insert(idx, cur_range)
                self.codePointCount = self.codePointCount - 1
                return

    def removeMatch(self, codePoint):
        if codePoint <= lastASCIICodePoint:
            if codePoint in self.matches:
                self.matches.remove(codePoint)
                self.codePointCount = self.codePointCount - 1
            else:
                self.removeMatchFromRanges(codePoint, self.ranges)
        else:
            if codePoint in self.unicodeMatches:
                self.unicodeMatches.remove(codePoint)
                self.codePointCount = self.codePointCount - 1
            else:
                self.removeMatchFromRanges(codePoint, self.unicodeRanges)

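    # Write dataList as a C++ initializer list, using formatter for each element and
    # wrapping the output after valuesPerLine entries.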
    def dumpMatchData(self, file, valuesPerLine, dataList, formatter):
        valuesThisLine = 0
        firstValue = True

        file.write("{")
        for elem in dataList:
            if firstValue:
                firstValue = False
            else:
                file.write(", ")
            valuesThisLine = valuesThisLine + 1
            if valuesThisLine > valuesPerLine:
                file.write("\n ")
                valuesThisLine = 1
            formatter(file, elem)
        file.write("}")

    def dump(self, file, commaAfter):
        file.write("static std::unique_ptr<CharacterClass> {}()\n{{\n".format(self.getCreateFuncName()))
        file.write(" // Name = {}, number of codePoints: {}\n".format(self.name, self.codePointCount))
        file.write(" auto characterClass = makeUnique<CharacterClass>(\n")
        file.write(" std::initializer_list<UChar32>(")
        self.dumpMatchData(file, 8, self.matches, lambda file, match: (file.write("{0:0=#4x}".format(match))))
        file.write("),\n")
        file.write(" std::initializer_list<CharacterRange>(")
        self.dumpMatchData(file, 4, self.ranges, lambda file, range: (file.write("{{{0:0=#4x}, {1:0=#4x}}}".format(range[0], range[1]))))
        file.write("),\n")
        file.write(" std::initializer_list<UChar32>(")
        self.dumpMatchData(file, 8, self.unicodeMatches, lambda file, match: (file.write("{0:0=#6x}".format(match))))
        file.write("),\n")
        file.write(" std::initializer_list<CharacterRange>(")
        self.dumpMatchData(file, 4, self.unicodeRanges, lambda file, range: (file.write("{{{0:0=#6x}, {1:0=#6x}}}".format(range[0], range[1]))))
        file.write("),\n")

        file.write(" CharacterClassWidths::{});\n".format(("Unknown", "HasBMPChars", "HasNonBMPChars", "HasBothBMPAndNonBMP")[int(self.hasNonBMPCharacters) * 2 + int(self.hasBMPCharacters)]))
        file.write(" return characterClass;\n}\n\n")

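    # Emit every createCharacterClassN() function followed by the createFunctions table
    # that maps a property index to its factory function.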
    @classmethod
    def dumpAll(cls, file):
        for propertyData in cls.allPropertyData:
            propertyData.dump(file, propertyData != cls.allPropertyData[-1])

        file.write("using CreateCharacterClass = std::unique_ptr<CharacterClass> (*)();\n")
        file.write("static CreateCharacterClass createFunctions[{}] = {{\n ".format(len(cls.allPropertyData)))
        functionsOnThisLine = 0
        for propertyData in cls.allPropertyData:
            file.write(" {},".format(propertyData.getCreateFuncName()))
            functionsOnThisLine = functionsOnThisLine + 1
            if functionsOnThisLine == 4:
                file.write("\n ")
                functionsOnThisLine = 0

        file.write("};\n\n")

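    # Build and emit a closed hash table mapping property or value names (and their
    # aliases) to character class indices. The index table starts at twice the key count
    # rounded up to a power of two; colliding keys are chained through extra slots
    # appended after the initial power-of-two region.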
    @classmethod
    def createAndDumpHashTable(self, file, propertyDict, tablePrefix):
        propertyKeys = propertyDict.keys()
        numberOfKeys = len(propertyKeys)
        hashSize = ceilingToPowerOf2(numberOfKeys * 2)
        hashMask = hashSize - 1
        hashTable = [None] * hashSize
        valueTable = []
        tableSize = hashSize

        keyValuesToHash = []
        for propertyName in propertyKeys:
            propertyData = propertyDict[propertyName]
            keyValuesToHash.append((propertyName, propertyData.getIndex()))
            for alias in propertyData.aliases:
                keyValuesToHash.append((alias, propertyData.getIndex()))

        for keyValue in keyValuesToHash:
            key = keyValue[0]
            hash = stringHash(key) % hashSize
            while hashTable[hash] is not None:
                if hashTable[hash][1] is not None:
                    hash = hashTable[hash][1]
                else:
                    hashTable[hash] = (hashTable[hash][0], tableSize)
                    hashTable.append(None)
                    hash = tableSize
                    tableSize = tableSize + 1

            hashTable[hash] = (len(valueTable), None)
            valueTable.append((key, keyValue[1]))

        file.write("static const struct HashIndex {}TableIndex[{}] = {{\n".format(tablePrefix, len(hashTable)))

        for tableIndex in hashTable:
            value = -1
            next = -1
            if tableIndex is not None:
                value = tableIndex[0]
                if tableIndex[1] is not None:
                    next = tableIndex[1]

            file.write(" {{ {}, {} }},\n".format(value, next))

        file.write("};\n\n")

        file.write("static const struct HashValue {}TableValue[{}] = {{\n".format(tablePrefix, len(valueTable)))
        for value in valueTable:
            file.write(" {{ \"{}\", {} }},\n".format(value[0], value[1]))
        file.write("};\n\n")

        file.write("static const struct HashTable {}HashTable = \n".format(tablePrefix))
        file.write(" {{ {}, {}, {}TableValue, {}TableIndex }};\n\n".format(len(valueTable), hashMask, tablePrefix, tablePrefix))


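# Parses Scripts.txt and ScriptExtensions.txt. The Unknown script is computed as the
# complement of all assigned script ranges; scripts without extension data reuse their
# base script data.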
class Scripts:
    def __init__(self):
        self.allPropertyData = []
        self.scriptsByName = {}
        self.scriptExtensionsByName = {}
        self.unknownScript = PropertyData("Unknown")
        self.unknownScript.setAliases(aliases.scriptAliasesFor("Unknown"))
        self.allPropertyData.append(self.unknownScript)
        self.scriptsParsed = False

    def parseScriptsFile(self, file):
        currentScriptName = None
        currentPropertyData = None
        # To calculate Unknown, we gather all the code points assigned to a script as ranges,
        # sort them, and then walk the list to create the inverse of the assigned ranges.
        assignedCodePointRanges = []

        for line in file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            codePoints = fields[0].strip()
            scriptName = fields[1].strip()

            if scriptName != currentScriptName:
                currentScriptName = scriptName
                currentPropertyData = PropertyData(scriptName)
                currentPropertyData.setAliases(aliases.scriptAliasesFor(scriptName))
                self.allPropertyData.append(currentPropertyData)
                self.scriptsByName[scriptName] = currentPropertyData

            dotDot = codePoints.find("..")
            if dotDot == -1:
                codePoint = int(codePoints, 16)
                currentPropertyData.addMatch(codePoint)
                assignedCodePointRanges.append((codePoint, codePoint))
            else:
                lowCodePoint = int(codePoints[:dotDot], 16)
                highCodePoint = int(codePoints[dotDot + 2:], 16)
                currentPropertyData.addRange(lowCodePoint, highCodePoint)
                assignedCodePointRanges.append((lowCodePoint, highCodePoint))

        assignedCodePointRanges.sort(key=lambda range: range[0])
        lastAssignedCodePoint = 0

        for range in assignedCodePointRanges:
            if range[0] - lastAssignedCodePoint > 1:
                if range[0] - lastAssignedCodePoint == 2:
                    self.unknownScript.addMatch(lastAssignedCodePoint + 1)
                else:
                    self.unknownScript.addRange(lastAssignedCodePoint + 1, range[0] - 1)
            lastAssignedCodePoint = range[1]

        if lastAssignedCodePoint < MaxUnicode:
            if MaxUnicode - lastAssignedCodePoint == 1:
                self.unknownScript.addMatch(MaxUnicode)
            else:
                self.unknownScript.addRange(lastAssignedCodePoint + 1, MaxUnicode)

        self.scriptsParsed = True

    def parseScriptExtensionsFile(self, file):
        currentPropertyData = None
        # To calculate Unknown, we gather all the code points assigned to a script as ranges,
        # sort them, and then walk the list to create the inverse of the assigned ranges.
        assignedCodePointRanges = []

        if not self.scriptsParsed:
            print("Error: parsing ScriptExtensions.txt before Scripts.txt")
            exit(1)

        commonScriptExtenstionPropertyData = None
        inheritedScriptExtensionPropertyData = None

        scriptName = "Common"
        if scriptName in self.scriptsByName:
            commonScriptExtenstionPropertyData = self.scriptsByName[scriptName].makeCopy()
        else:
            commonScriptExtenstionPropertyData = PropertyData(scriptName)
            commonScriptExtenstionPropertyData.setAliases(aliases.scriptAliasesFor(scriptName))
            self.allPropertyData.append(commonScriptExtenstionPropertyData)
        self.scriptExtensionsByName[scriptName] = commonScriptExtenstionPropertyData

        scriptName = "Inherited"
        if scriptName in self.scriptsByName:
            inheritedScriptExtensionPropertyData = self.scriptsByName[scriptName].makeCopy()
        else:
            inheritedScriptExtensionPropertyData = PropertyData(scriptName)
            inheritedScriptExtensionPropertyData.setAliases(aliases.scriptAliasesFor(scriptName))
            self.allPropertyData.append(inheritedScriptExtensionPropertyData)
        self.scriptExtensionsByName[scriptName] = inheritedScriptExtensionPropertyData

        for line in file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            codePoints = fields[0].strip()
            scriptAliasList = fields[1].strip()

            for scriptAlias in scriptAliasList.split(' '):
                scriptName = aliases.scriptNameForAlias(scriptAlias)
                currentPropertyData = None

                if scriptName not in self.scriptExtensionsByName:
                    currentPropertyData = self.scriptsByName[scriptName].makeCopy()
                    self.allPropertyData.append(currentPropertyData)
                    self.scriptExtensionsByName[scriptName] = currentPropertyData
                else:
                    currentPropertyData = self.scriptExtensionsByName[scriptName]

                dotDot = codePoints.find("..")
                if dotDot == -1:
                    codePoint = int(codePoints, 16)
                    currentPropertyData.addMatch(codePoint)
                    commonScriptExtenstionPropertyData.removeMatch(codePoint)
                    inheritedScriptExtensionPropertyData.removeMatch(codePoint)
                else:
                    lowCodePoint = int(codePoints[:dotDot], 16)
                    highCodePoint = int(codePoints[dotDot + 2:], 16)
                    currentPropertyData.addRange(lowCodePoint, highCodePoint)
                    for codePoint in range(lowCodePoint, highCodePoint + 1):
                        commonScriptExtenstionPropertyData.removeMatch(codePoint)
                        inheritedScriptExtensionPropertyData.removeMatch(codePoint)

        # For the scripts that don't have any additional extension code points, copy the
        # script data to the script extension with the same name.
        for scriptName, propertyData in self.scriptsByName.items():
            if scriptName not in self.scriptExtensionsByName:
                self.scriptExtensionsByName[scriptName] = propertyData

    def dump(self, file):
        file.write("// Scripts:\n")
        PropertyData.createAndDumpHashTable(file, self.scriptsByName, "script")

        file.write("// Script_Extensions:\n")
        PropertyData.createAndDumpHashTable(file, self.scriptExtensionsByName, "scriptExtension")


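# Parses UnicodeData.txt into per-category and per-category-group PropertyData, along
# with the derived Any, ASCII, Assigned and LC (Cased_Letter) classes; gaps in the file
# are attributed to Cn (Unassigned).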
class GeneralCategory:
    def __init__(self, file):
        self.file = file
        self.allPropertyData = []
        self.propertyDataByCategory = {}
        self.createSpecialPropertyData("Any", (0, MaxUnicode))
        self.createSpecialPropertyData("ASCII", (0, lastASCIICodePoint))
        self.assignedPropertyData = self.createSpecialPropertyData("Assigned")
        self.unassignedProperyData = self.findPropertyGroupFor("Cn")[1]
        self.casedLetterPropertyData = self.findPropertyGroupFor("LC")[1]
        self.lastAddedCodePoint = 0

    def createSpecialPropertyData(self, name, range=None):
        propertyData = PropertyData(name)
        self.allPropertyData.append(propertyData)
        self.propertyDataByCategory[name] = propertyData
        if range:
            propertyData.addRange(range[0], range[1])

        return propertyData

    def findPropertyGroupFor(self, categoryAlias):
        category = aliases.generalCategoryForAlias(categoryAlias)
        allCategoryAliases = aliases.generalCategoryAliasesFor(category)
        categoryGroupAlias = categoryAlias[0]
        categoryGroup = aliases.generalCategoryForAlias(categoryGroupAlias)
        allCategoryGroupAlias = aliases.generalCategoryAliasesFor(categoryGroup)
        groupPropertyData = None
        propertyData = None

        if categoryGroup not in self.propertyDataByCategory:
            groupPropertyData = PropertyData(categoryGroup)
            groupPropertyData.setAliases(allCategoryGroupAlias)
            self.allPropertyData.append(groupPropertyData)
            self.propertyDataByCategory[categoryGroup] = groupPropertyData
        else:
            groupPropertyData = self.propertyDataByCategory[categoryGroup]

        if category not in self.propertyDataByCategory:
            propertyData = PropertyData(category)
            propertyData.setAliases(allCategoryAliases)
            self.allPropertyData.append(propertyData)
            self.propertyDataByCategory[category] = propertyData
        else:
            propertyData = self.propertyDataByCategory[category]

        return (groupPropertyData, propertyData)

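    # Record a code point (or a First..Last range) under both its category and its
    # category group, first filling any gap since the previous entry with Cn, and keep
    # the Assigned and LC classes up to date.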
    def addNextCodePoints(self, categoryAlias, codePoint, highCodePoint=None):
        if codePoint - self.lastAddedCodePoint > 1:
            propertyDatas = self.findPropertyGroupFor("Cn")
            if codePoint - self.lastAddedCodePoint == 2:
                propertyDatas[0].addMatch(self.lastAddedCodePoint + 1)
                propertyDatas[1].addMatch(self.lastAddedCodePoint + 1)
            else:
                propertyDatas[0].addRange(self.lastAddedCodePoint + 1, codePoint - 1)
                propertyDatas[1].addRange(self.lastAddedCodePoint + 1, codePoint - 1)

        propertyDatas = self.findPropertyGroupFor(categoryAlias)
        if highCodePoint:
            propertyDatas[0].addRange(codePoint, highCodePoint)
            propertyDatas[1].addRange(codePoint, highCodePoint)
            if categoryAlias == "Ll" or categoryAlias == "Lt" or categoryAlias == "Lu":
                self.casedLetterPropertyData.addRange(codePoint, highCodePoint)
            self.assignedPropertyData.addRange(codePoint, highCodePoint)

            self.lastAddedCodePoint = highCodePoint
        else:
            propertyDatas[0].addMatch(codePoint)
            propertyDatas[1].addMatch(codePoint)
            if categoryAlias == "Ll" or categoryAlias == "Lt" or categoryAlias == "Lu":
                self.casedLetterPropertyData.addMatch(codePoint)
            self.assignedPropertyData.addMatch(codePoint)

            self.lastAddedCodePoint = codePoint

    def parse(self):
        lastLineFirstOfRange = None
        lastLineCodePoint = 0
        for line in self.file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            codePoint = int(fields[0].strip(), 16)
            description = fields[1].strip()
            categoryAlias = fields[2].strip()

            if lastLineFirstOfRange:
                if description[-5:-1] == "Last":
                    self.addNextCodePoints(categoryAlias, lastLineFirstOfRange, codePoint)
                    lastLineFirstOfRange = None
                    continue
                else:
                    print("Malformed First..Last pair of lines")

            if description[-6:-1] == "First":
                lastLineFirstOfRange = codePoint
                continue

            self.addNextCodePoints(categoryAlias, codePoint)

        if self.lastAddedCodePoint < MaxUnicode:
            propertyDatas = self.findPropertyGroupFor("Cn")
            if MaxUnicode - self.lastAddedCodePoint == 1:
                propertyDatas[0].addMatch(MaxUnicode)
                propertyDatas[1].addMatch(MaxUnicode)
            else:
                propertyDatas[0].addRange(self.lastAddedCodePoint + 1, MaxUnicode)
                propertyDatas[1].addRange(self.lastAddedCodePoint + 1, MaxUnicode)

    def dump(self, file):
        file.write("// General_Category:\n")
        PropertyData.createAndDumpHashTable(file, self.propertyDataByCategory, "generalCategory")


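# Parses the range-per-line property files (DerivedBinaryProperties, DerivedCoreProperties,
# DerivedNormalizationProps, PropList, emoji-data), keeping only the properties listed in
# SupportedBinaryProperties.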
class BinaryProperty:
    def __init__(self):
        self.allPropertyData = []
        self.propertyDataByProperty = {}

    def parsePropertyFile(self, file):
        currentPropertyName = None
        currentPropertyData = None

        for line in file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            codePoints = fields[0].strip()
            propertyName = fields[1].strip()

            if propertyName != currentPropertyName:
                if propertyName not in SupportedBinaryProperties:
                    continue

                currentPropertyName = propertyName
                currentPropertyData = PropertyData(propertyName)
                currentPropertyData.setAliases(aliases.globalAliasesFor(propertyName))
                self.allPropertyData.append(currentPropertyData)
                self.propertyDataByProperty[propertyName] = currentPropertyData

            dotDot = codePoints.find("..")
            if dotDot == -1:
                currentPropertyData.addMatch(int(codePoints, 16))
            else:
                currentPropertyData.addRange(int(codePoints[:dotDot], 16), int(codePoints[dotDot + 2:], 16))

    def dump(self, file):
        file.write("// binary properties:\n")
        PropertyData.createAndDumpHashTable(file, self.propertyDataByProperty, "binaryProperty")

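# Driver: expects two arguments, the UCD directory and the path of the header to generate, e.g.
#     python <this-script> /path/to/ucd YarrUnicodePropertyData.h
# (the paths shown are illustrative; pass whatever the build system provides).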
if __name__ == "__main__":
    parser = optparse.OptionParser(usage="usage: %prog <UCD-Directory> <YarrUnicodePropertyData.h>")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("<UCD-Directory> <YarrUnicodePropertyData.h>")

    UCDDirectoryPath = args[0]
    unicodeProertyDataHPath = args[1]

    verifyUCDFilesExist()

    propertyAliasesFile = openUCDFileOrExit("PropertyAliases.txt")
    propertyValueAliasesFile = openUCDFileOrExit("PropertyValueAliases.txt")
    scriptsFile = openUCDFileOrExit("Scripts.txt")
    scriptExtensionsFile = openUCDFileOrExit("ScriptExtensions.txt")
    unicodeDataFile = openUCDFileOrExit("UnicodeData.txt")
    derivedBinaryPropertiesFile = openUCDFileOrExit("DerivedBinaryProperties.txt")
    derivedCorePropertiesFile = openUCDFileOrExit("DerivedCoreProperties.txt")
    derivedNormalizationPropertiesFile = openUCDFileOrExit("DerivedNormalizationProps.txt")
    propListFile = openUCDFileOrExit("PropList.txt")
    emojiDataFile = openUCDFileOrExit("emoji-data.txt")

    aliases = Aliases()

    propertyDataHFile = openOrExit(unicodeProertyDataHPath, "w")

    propertyDataHFile.write(header)

    aliases.parsePropertyAliasesFile(propertyAliasesFile)
    aliases.parsePropertyValueAliasesFile(propertyValueAliasesFile)

    generalCategory = GeneralCategory(unicodeDataFile)
    generalCategory.parse()

    binaryProperty = BinaryProperty()
    binaryProperty.parsePropertyFile(derivedBinaryPropertiesFile)
    binaryProperty.parsePropertyFile(derivedCorePropertiesFile)
    binaryProperty.parsePropertyFile(derivedNormalizationPropertiesFile)
    binaryProperty.parsePropertyFile(propListFile)
    binaryProperty.parsePropertyFile(emojiDataFile)

    scripts = Scripts()
    scripts.parseScriptsFile(scriptsFile)
    scripts.parseScriptExtensionsFile(scriptExtensionsFile)

    PropertyData.dumpAll(propertyDataHFile)
    generalCategory.dump(propertyDataHFile)
    binaryProperty.dump(propertyDataHFile)
    scripts.dump(propertyDataHFile)

    propertyDataHFile.write(footer)

    exit(0)