gecko-dev/intl/uconv/tools/gen-big5-data.py
Henri Sivonen b00ce35888 Bug 912470 part 2 - Implement Encoding Standard-compliant big5 encoder. r=emk.
--HG--
rename : intl/uconv/ucvtw/nsBIG5DecoderData.h => intl/uconv/ucvtw/nsBIG5Data.cpp
rename : testing/web-platform/tests/encoding/gbk-encoder.html => testing/web-platform/tests/encoding/big5-encoder.html
2015-09-03 15:21:57 +03:00

254 lines
5.4 KiB
Python

#!/usr/bin/python
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Adapted from
# https://hg.mozilla.org/projects/htmlparser/file/0d906fb1ab90/generate-encoding-data.py
# indexes.json comes from
# https://encoding.spec.whatwg.org/indexes.json
# i.e.
# https://github.com/whatwg/encoding/blob/ce4e83d0df5b5efec0697fc76e66699737e033a3/indexes.json
import json
# Parse the Encoding Standard index tables. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it open for the
# rest of the run.
with open("indexes.json", "r") as indexFile:
    indexes = json.load(indexFile)
def nullToZero(codePoint):
    """Return codePoint unchanged, or 0 when it is missing.

    Any falsy value (for example the None that a JSON null is parsed into,
    or 0 itself) yields 0; every other value is passed through as is.
    """
    return codePoint if codePoint else 0
# Flat list of the "big5" index entries, in order, with every missing entry
# replaced by 0 so that the list holds only integers.
index = [nullToZero(entry) for entry in indexes["big5"]]
# Find every run of more than 4 consecutive invalid (zero) pointers that is
# followed by a valid pointer, and record it as a half-open (start, end)
# interval. Per the original author's note, the data contains four such
# major gaps. A run of zeros that reaches the very end of the index is never
# recorded, because a run is only closed by a following non-zero entry.
gaps = []
runLength = 0
runStart = 0
for pointer, value in enumerate(index):
    if value != 0:
        # A valid pointer ends the current run of zeros, if there is one.
        if runLength > 4:
            gaps.append((runStart, runStart + runLength))
        runLength = 0
        continue
    if runLength == 0:
        runStart = pointer
    runLength += 1
def invertRanges(ranges, cap):
    """Return the intervals of [0, cap) that lie between the given ranges.

    ranges -- iterable of (start, end) pairs, taken in the order given.
    cap    -- upper bound used as the end of the final interval.

    For each input pair, the interval from the end of the previous pair (or
    0) up to this pair's start is emitted, unless this pair starts at 0.
    A closing interval ending at cap is always emitted, even if it is empty.
    """
    complement = []
    cursor = 0
    for gapStart, gapEnd in ranges:
        if gapStart != 0:
            complement.append((cursor, gapStart))
        cursor = gapEnd
    complement.append((cursor, cap))
    return complement
# Number of pointers in the index, and the pointer intervals that remain
# once the large runs of invalid pointers collected in `gaps` are cut out.
cap = len(index)
ranges = invertRanges(gaps, cap)

# Now compute a compressed lookup table for astralness: find every run of
# more than 40 consecutive entries that fit in 16 bits (<= 0xFFFF) and that
# is followed by an entry above 0xFFFF, then keep only the pointer intervals
# outside those runs.
bmpRuns = []
runLength = 0
runStart = 0
for pointer, value in enumerate(index):
    if value > 0xFFFF:
        # An entry above 0xFFFF ends the current run of 16-bit entries.
        if runLength > 40:
            bmpRuns.append((runStart, runStart + runLength))
        runLength = 0
        continue
    if runLength == 0:
        runStart = pointer
    runLength += 1
astralRanges = invertRanges(bmpRuns, cap)
# Create the generated C++ source file and write its license header, the
# "generated file" notice, the include line and the opening of the
# kBig5LowBitsTable array.
classFile = open("../ucvtw/nsBIG5Data.cpp", "w")
classFile.write('''/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
* Instead, please regenerate using intl/uconv/tools/gen-big5-data.py
*/
#include "nsBIG5Data.h"
static const char16_t kBig5LowBitsTable[] = {
''')
# Emit the low 16 bits of every index entry that lies inside one of the kept
# pointer ranges, in pointer order. range() is used instead of the
# Python-2-only xrange() so the script also runs under Python 3.
for (low, high) in ranges:
    for i in range(low, high):
        classFile.write(' 0x%04X,\n' % (index[i] & 0xFFFF))
# Close the previous array and open the kBig5AstralnessTable array.
classFile.write('''};
static const uint32_t kBig5AstralnessTable[] = {
''')
# An array of bool is inefficient per
# http://stackoverflow.com/questions/4049156/1-bit-per-bool-in-array-c
# so collect one flag per pointer in the astral ranges (1 when the entry is
# above 0xFFFF) and pack the flags into 32-bit words.
bits = []
for (low, high) in astralRanges:
    for i in range(low, high):
        bits.append(1 if index[i] > 0xFFFF else 0)
# Pad with zero bits up to the next multiple of 32. (-len) % 32 is 0 when the
# length is already aligned, so no spurious all-zero word is appended in that
# case (32 - len % 32 would have added a full extra word).
bits.extend([0] * ((-len(bits)) % 32))
# Flag k of each group of 32 goes into bit k of the word (least significant
# bit first).
for wordStart in range(0, len(bits), 32):
    accu = 0
    for j in range(32):
        accu |= bits[wordStart + j] << j
    classFile.write(' 0x%08X,\n' % accu)
# Close the astralness array and open the definition of nsBIG5Data::LowBits.
classFile.write('''};
// static
char16_t
nsBIG5Data::LowBits(size_t aPointer)
{
''')
# One pair of checks per kept range: pointers below the range return 0,
# pointers inside it are read from kBig5LowBitsTable at the range's offset
# within the table.
lowBitsBranch = ''' if (aPointer < %d) {
return 0;
}
if (aPointer < %d) {
return kBig5LowBitsTable[%d + (aPointer - %d)];
}
'''
tableOffset = 0
for rangeStart, rangeEnd in ranges:
    classFile.write(lowBitsBranch % (rangeStart, rangeEnd, tableOffset, rangeStart))
    # The next range's entries follow this range's entries in the table.
    tableOffset += rangeEnd - rangeStart
# Finish nsBIG5Data::LowBits and open the definition of nsBIG5Data::IsAstral.
classFile.write(''' return 0;
}
// static
bool
nsBIG5Data::IsAstral(size_t aPointer)
{
''')
# Emitted for a range holding exactly one pointer: the generated code answers
# true for that pointer without consulting the bit table.
singlePointerBranch = ''' if (aPointer < %d) {
return false;
}
if (aPointer == %d) {
return true;
}
'''
# Emitted for every other range: look the pointer up in the packed bit table.
bitLookupBranch = ''' if (aPointer < %d) {
return false;
}
if (aPointer < %d) {
size_t index = %d + (aPointer - %d);
return kBig5AstralnessTable[index >> 5] & (1 << (index & 0x1F));
}
'''
bitOffset = 0
for rangeStart, rangeEnd in astralRanges:
    span = rangeEnd - rangeStart
    if span != 1:
        classFile.write(bitLookupBranch % (rangeStart, rangeEnd, bitOffset, rangeStart))
    else:
        classFile.write(singlePointerBranch % (rangeStart, rangeStart))
    # The bit table holds one bit for every pointer of every astral range,
    # one-pointer ranges included, so the offset advances for each range.
    bitOffset += span
# Finish nsBIG5Data::IsAstral and open nsBIG5Data::FindPointer together with
# the switch that special-cases selected non-astral code points.
classFile.write(''' return false;
}
//static
size_t
nsBIG5Data::FindPointer(char16_t aLowBits, bool aIsAstral)
{
if (!aIsAstral) {
switch (aLowBits) {
''')
# Pointer bound used further down to keep the generated linear search from
# starting inside the HKSCS part of the table.
hkscsBound = (0xA1 - 0x81) * 157
# Code points for which the generated switch returns the LAST pointer that
# maps to them.
preferLast = [
    0x2550,
    0x255E,
    0x2561,
    0x256A,
    0x5341,
    0x5345,
]
for codePoint in preferLast:
    # Python lists don't have .rindex() :-(
    # Scan from the end so the first hit is the highest matching pointer.
    # range() is used instead of the Python-2-only xrange() so the script
    # also runs under Python 3.
    for i in range(len(index) - 1, -1, -1):
        if index[i] == codePoint:
            classFile.write(''' case 0x%04X:
return %d;
''' % (codePoint, i))
            break
# Close the switch and the enclosing if block.
classFile.write(''' default:
break;
}
}''')
# Work out the kBig5LowBitsTable position that corresponds to hkscsBound, so
# the generated search loop can begin there. If no range contains
# hkscsBound, the search starts at table position 0.
searchStart = 0
tableOffset = 0
for rangeStart, rangeEnd in ranges:
    if rangeStart <= hkscsBound < rangeEnd:
        # This is the first range we don't ignore and the
        # range that contains the first non-HKSCS pointer.
        # Avoid searching HKSCS.
        searchStart = tableOffset + hkscsBound - rangeStart
        break
    tableOffset += rangeEnd - rangeStart
# Open the linear search over the low-bits table.
classFile.write('''
for (size_t i = %d; i < MOZ_ARRAY_LENGTH(kBig5LowBitsTable); ++i) {
if (kBig5LowBitsTable[i] == aLowBits) {
size_t pointer;
''' % searchStart)
# Emit an if / else-if chain that converts a kBig5LowBitsTable position `i`
# back into a pointer. Branches are written with a one-range delay: once a
# range ending above hkscsBound has been seen, each later iteration writes
# the bounded branch for the range before it, and the last range is written
# after the loop as the unconditional final block.
boundedBranch = '''if (i < %d) {
pointer = i + %d;
} else '''
tableOffset = 0
previous = (0, 0, 0)  # (range start, range end, table offset of that range)
emitting = False
for rangeStart, rangeEnd in ranges:
    if emitting:
        prevStart, prevEnd, prevOffset = previous
        classFile.write(boundedBranch % ((prevOffset + prevEnd - prevStart),
                                         (prevStart - prevOffset)))
    previous = (rangeStart, rangeEnd, tableOffset)
    emitting = emitting or rangeEnd > hkscsBound
    tableOffset += rangeEnd - rangeStart
lastStart, _, lastOffset = previous
classFile.write('''{
pointer = i + %d;
}''' % (lastStart - lastOffset))
# Accept the candidate only when its astralness matches the request, then
# close the search loop and the function; 0 is returned when nothing matches.
classFile.write('''
if (aIsAstral == IsAstral(pointer)) {
return pointer;
}
}
}
return 0;
}
''')
classFile.close()