Bug 1513934 - Import ICU patch to fix possible time zone misdetection on Windows 7. r=Waldo

--HG--
extra : amend_source : 9b0b4e865d1195b7124f74ea096d904a31b0cd71
This commit is contained in:
André Bargull 2018-12-14 05:48:46 -08:00
parent f5d8300d24
commit 7f8b902c81
17 changed files with 770 additions and 547 deletions

View File

@ -0,0 +1,42 @@
From 9a2c52d1744abaa57defc5f2fb25927ae16a3a0e Mon Sep 17 00:00:00 2001
From: Jeff Genovy <29107334+jefgen@users.noreply.github.com>
Date: Wed, 12 Dec 2018 19:42:48 -0800
Subject: [PATCH] ICU-20302 Timezone detection fails on Windows 7. Also add a
test case for Windows time zone detection failing.
---
icu4c/source/common/wintz.cpp | 6 +++---
icu4c/source/test/cintltst/putiltst.c | 11 +++++++++++
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/intl/icu/source/common/wintz.cpp b/intl/icu/source/common/wintz.cpp
index 5e9ac0d2f37..8a143d9e782 100644
--- a/intl/icu/source/common/wintz.cpp
+++ b/intl/icu/source/common/wintz.cpp
@@ -35,7 +35,7 @@
U_NAMESPACE_BEGIN
-// The value of MAX_TIMEZONE_ID_LENGTH is 128, which is defined in DYNAMIC_TIME_ZONE_INFORMATION
+// The max size of TimeZoneKeyName is 128, defined in DYNAMIC_TIME_ZONE_INFORMATION
#define MAX_TIMEZONE_ID_LENGTH 128
/**
@@ -44,7 +44,7 @@ U_NAMESPACE_BEGIN
* Note: We use the Win32 API GetDynamicTimeZoneInformation to get the current time zone info.
* This API returns a non-localized time zone name, which we can then map to an ICU time zone name.
*/
-U_CFUNC const char* U_EXPORT2
+U_INTERNAL const char* U_EXPORT2
uprv_detectWindowsTimeZone()
{
UErrorCode status = U_ZERO_ERROR;
@@ -79,7 +79,7 @@ uprv_detectWindowsTimeZone()
// convert from wchar_t* (UTF-16 on Windows) to char* (UTF-8).
u_strToUTF8(dynamicTZKeyName, UPRV_LENGTHOF(dynamicTZKeyName), nullptr,
- reinterpret_cast<const UChar*>(dynamicTZI.TimeZoneKeyName), UPRV_LENGTHOF(dynamicTZI.TimeZoneKeyName), &status);
+ reinterpret_cast<const UChar*>(dynamicTZI.TimeZoneKeyName), -1, &status);
if (U_FAILURE(status)) {
return nullptr;

View File

@ -0,0 +1,22 @@
From 3c644c62c71c890424ef5d20caa2f9dc354e02d6 Mon Sep 17 00:00:00 2001
From: Jeff Genovy <Jeff.Genovy@microsoft.com>
Date: Fri, 14 Dec 2018 00:56:51 -0800
Subject: [PATCH] ICU-20302 Fix wintz header file. (Thanks to Jungshik).
---
icu4c/source/common/wintz.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/intl/icu/source/common/wintz.h b/intl/icu/source/common/wintz.h
index f98b1779b5d..cd8565eef1e 100644
--- a/intl/icu/source/common/wintz.h
+++ b/intl/icu/source/common/wintz.h
@@ -28,7 +28,7 @@ U_CDECL_BEGIN
typedef struct _TIME_ZONE_INFORMATION TIME_ZONE_INFORMATION;
U_CDECL_END
-U_CFUNC const char* U_EXPORT2
+U_INTERNAL const char* U_EXPORT2
uprv_detectWindowsTimeZone();
#endif /* U_PLATFORM_USES_ONLY_WIN32_API */

View File

@ -1,7 +1,14 @@
commit 6cbd62e59e30f73b444be89ea71fd74275ac53a4
Author: Shane Carr <shane@unicode.org>
Date: Mon Oct 29 23:52:44 2018 -0700
commit f3fa0d604ef6527a01dab96f4bfa3c5290127337
Author: Markus Scherer <markus.icu@gmail.com>
Date: Fri Nov 9 12:54:22 2018 -0800
ICU-20246 Fixing another integer overflow in number parsing.
ICU-20250 make UnicodeSet(intprop=value) faster
- fastpath for UnicodeSet.add(new last range)
- fewer UnicodeSet memory allocations:
initial internal list array, exponential array growth,
allocate strings list/set only when first one is added
- faster CodePointTrie.getRange(): fewer calls to filter function
- revert UnicodeSet(intprop=value) from trie ranges to range starts + lookup
- cache per-int-prop range starts: fewer lookups
(cherry picked from commit 53d8c8f3d181d87a6aa925b449b51c4a2c922a51)
(cherry picked from commit 98f9170004c29388d756a8a283573164a7a26bef)

View File

@ -23,6 +23,9 @@
#include "umutex.h"
#include "uprops.h"
using icu::LocalPointer;
using icu::Normalizer2Factory;
using icu::Normalizer2Impl;
using icu::UInitOnce;
using icu::UnicodeSet;
@ -30,11 +33,13 @@ namespace {
UBool U_CALLCONV characterproperties_cleanup();
constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;
struct Inclusion {
UnicodeSet *fSet;
UInitOnce fInitOnce;
};
Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions()
Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
@ -80,35 +85,22 @@ UBool U_CALLCONV characterproperties_cleanup() {
return TRUE;
}
} // namespace
U_NAMESPACE_BEGIN
/*
Reduce excessive reallocation, and make it easier to detect initialization problems.
Usually you don't see smaller sets than this for Unicode 5.0.
*/
constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072;
void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) {
void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
// This function is invoked only via umtx_initOnce().
// This function is a friend of class UnicodeSet.
U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
if (src == UPROPS_SRC_NONE) {
errorCode = U_INTERNAL_PROGRAM_ERROR;
return;
}
UnicodeSet * &incl = gInclusions[src].fSet;
U_ASSERT(incl == nullptr);
U_ASSERT(gInclusions[src].fSet == nullptr);
incl = new UnicodeSet();
if (incl == nullptr) {
LocalPointer<UnicodeSet> incl(new UnicodeSet());
if (incl.isNull()) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
USetAdder sa = {
(USet *)incl,
(USet *)incl.getAlias(),
_set_add,
_set_addRange,
_set_addString,
@ -116,7 +108,6 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo
nullptr // don't need removeRange()
};
incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode);
switch(src) {
case UPROPS_SRC_CHAR:
uchar_addPropertyStarts(&sa, &errorCode);
@ -183,12 +174,15 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo
}
if (U_FAILURE(errorCode)) {
delete incl;
incl = nullptr;
return;
}
// Compact for caching
if (incl->isBogus()) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
// Compact for caching.
incl->compact();
gInclusions[src].fSet = incl.orphan();
ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
@ -199,15 +193,66 @@ const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorC
return nullptr;
}
Inclusion &i = gInclusions[src];
umtx_initOnce(i.fInitOnce, &CharacterProperties::initInclusion, src, errorCode);
umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
return i.fSet;
}
/**
 * Builds and caches the inclusion set for one integer-valued property.
 *
 * Walks every code point in the inclusion set of the property's source and
 * records each code point whose property value differs from the value seen
 * for the previously examined code point. The result is compacted and stored
 * in the gInclusions slot reserved for this property.
 *
 * @param prop      Property to process; asserted to be in
 *                  [UCHAR_INT_START, UCHAR_INT_LIMIT).
 * @param errorCode Set to U_MEMORY_ALLOCATION_ERROR if the set cannot be
 *                  allocated or becomes bogus; failures from
 *                  getInclusionsForSource() are passed through.
 */
void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
    // Reached only through umtx_initOnce().
    U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
    // Int-property slots follow the per-source slots in gInclusions.
    int32_t slot = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
    U_ASSERT(gInclusions[slot].fSet == nullptr);

    const UnicodeSet *sourceStarts = getInclusionsForSource(uprops_getSource(prop), errorCode);
    if (U_FAILURE(errorCode)) { return; }

    // NOTE(review): UnicodeSet(0, 0) presumably seeds the set with U+0000 — confirm.
    LocalPointer<UnicodeSet> valueStarts(new UnicodeSet(0, 0));
    if (valueStarts.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    int32_t lastValue = 0;
    const int32_t rangeCount = sourceStarts->getRangeCount();
    for (int32_t r = 0; r < rangeCount; ++r) {
        const UChar32 last = sourceStarts->getRangeEnd(r);
        UChar32 cp = sourceStarts->getRangeStart(r);
        while (cp <= last) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            const int32_t current = u_getIntPropertyValue(cp, prop);
            if (current != lastValue) {
                // The value changed here: remember this code point as a range start.
                valueStarts->add(cp);
                lastValue = current;
            }
            ++cp;
        }
    }

    if (valueStarts->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Shrink the set before it is cached, then hand ownership to the cache.
    valueStarts->compact();
    gInclusions[slot].fSet = valueStarts.orphan();
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
} // namespace
U_NAMESPACE_BEGIN
const UnicodeSet *CharacterProperties::getInclusionsForProperty(
UProperty prop, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return nullptr; }
UPropertySource src = uprops_getSource(prop);
return getInclusionsForSource(src, errorCode);
if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
Inclusion &i = gInclusions[inclIndex];
umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
return i.fSet;
} else {
UPropertySource src = uprops_getSource(prop);
return getInclusionsForSource(src, errorCode);
}
}
U_NAMESPACE_END
@ -216,7 +261,7 @@ namespace {
UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return nullptr; }
icu::LocalPointer<UnicodeSet> set(new UnicodeSet());
LocalPointer<UnicodeSet> set(new UnicodeSet());
if (set.isNull()) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return nullptr;

View File

@ -280,7 +280,7 @@ UChar32 getRange(const void *t, UChar32 start,
int32_t prevI3Block = -1;
int32_t prevBlock = -1;
UChar32 c = start;
uint32_t value;
uint32_t trieValue, value;
bool haveValue = false;
do {
int32_t i3Block;
@ -319,6 +319,7 @@ UChar32 getRange(const void *t, UChar32 start,
return c - 1;
}
} else {
trieValue = trie->nullValue;
value = nullValue;
if (pValue != nullptr) { *pValue = nullValue; }
haveValue = true;
@ -357,6 +358,7 @@ UChar32 getRange(const void *t, UChar32 start,
return c - 1;
}
} else {
trieValue = trie->nullValue;
value = nullValue;
if (pValue != nullptr) { *pValue = nullValue; }
haveValue = true;
@ -364,23 +366,32 @@ UChar32 getRange(const void *t, UChar32 start,
c = (c + dataBlockLength) & ~dataMask;
} else {
int32_t di = block + (c & dataMask);
uint32_t value2 = getValue(trie->data, valueWidth, di);
value2 = maybeFilterValue(value2, trie->nullValue, nullValue,
filter, context);
uint32_t trieValue2 = getValue(trie->data, valueWidth, di);
if (haveValue) {
if (value2 != value) {
return c - 1;
if (trieValue2 != trieValue) {
if (filter == nullptr ||
maybeFilterValue(trieValue2, trie->nullValue, nullValue,
filter, context) != value) {
return c - 1;
}
trieValue = trieValue2; // may or may not help
}
} else {
value = value2;
trieValue = trieValue2;
value = maybeFilterValue(trieValue2, trie->nullValue, nullValue,
filter, context);
if (pValue != nullptr) { *pValue = value; }
haveValue = true;
}
while ((++c & dataMask) != 0) {
if (maybeFilterValue(getValue(trie->data, valueWidth, ++di),
trie->nullValue, nullValue,
filter, context) != value) {
return c - 1;
trieValue2 = getValue(trie->data, valueWidth, ++di);
if (trieValue2 != trieValue) {
if (filter == nullptr ||
maybeFilterValue(trieValue2, trie->nullValue, nullValue,
filter, context) != value) {
return c - 1;
}
trieValue = trieValue2; // may or may not help
}
}
}

View File

@ -60,6 +60,7 @@ constexpr uint8_t I3_18 = 3;
constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8;
class AllSameBlocks;
class MixedBlocks;
class MutableCodePointTrie : public UMemory {
public:
@ -92,8 +93,10 @@ private:
void maskValues(uint32_t mask);
UChar32 findHighStart() const;
int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks);
int32_t compactData(int32_t fastILimit, uint32_t *newData, int32_t dataNullIndex);
int32_t compactIndex(int32_t fastILimit, UErrorCode &errorCode);
int32_t compactData(
int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode);
uint32_t *index = nullptr;
@ -301,41 +304,56 @@ UChar32 MutableCodePointTrie::getRange(
uint32_t nullValue = initialValue;
if (filter != nullptr) { nullValue = filter(context, nullValue); }
UChar32 c = start;
uint32_t value;
uint32_t trieValue, value;
bool haveValue = false;
int32_t i = c >> UCPTRIE_SHIFT_3;
do {
if (flags[i] == ALL_SAME) {
uint32_t value2 = maybeFilterValue(index[i], initialValue, nullValue,
filter, context);
uint32_t trieValue2 = index[i];
if (haveValue) {
if (value2 != value) {
return c - 1;
if (trieValue2 != trieValue) {
if (filter == nullptr ||
maybeFilterValue(trieValue2, initialValue, nullValue,
filter, context) != value) {
return c - 1;
}
trieValue = trieValue2; // may or may not help
}
} else {
value = value2;
trieValue = trieValue2;
value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
if (pValue != nullptr) { *pValue = value; }
haveValue = true;
}
c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK;
} else /* MIXED */ {
int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK);
uint32_t value2 = maybeFilterValue(data[di], initialValue, nullValue,
filter, context);
uint32_t trieValue2 = data[di];
if (haveValue) {
if (value2 != value) {
return c - 1;
if (trieValue2 != trieValue) {
if (filter == nullptr ||
maybeFilterValue(trieValue2, initialValue, nullValue,
filter, context) != value) {
return c - 1;
}
trieValue = trieValue2; // may or may not help
}
} else {
value = value2;
trieValue = trieValue2;
value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
if (pValue != nullptr) { *pValue = value; }
haveValue = true;
}
while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) {
if (maybeFilterValue(data[++di], initialValue, nullValue,
filter, context) != value) {
return c - 1;
trieValue2 = data[++di];
if (trieValue2 != trieValue) {
if (filter == nullptr ||
maybeFilterValue(trieValue2, initialValue, nullValue,
filter, context) != value) {
return c - 1;
}
}
trieValue = trieValue2; // may or may not help
}
}
++i;
@ -548,28 +566,8 @@ void MutableCodePointTrie::maskValues(uint32_t mask) {
}
}
inline bool
equalBlocks(const uint32_t *s, const uint32_t *t, int32_t length) {
while (length > 0 && *s == *t) {
++s;
++t;
--length;
}
return length == 0;
}
inline bool
equalBlocks(const uint16_t *s, const uint32_t *t, int32_t length) {
while (length > 0 && *s == *t) {
++s;
++t;
--length;
}
return length == 0;
}
inline bool
equalBlocks(const uint16_t *s, const uint16_t *t, int32_t length) {
template<typename UIntA, typename UIntB>
bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) {
while (length > 0 && *s == *t) {
++s;
++t;
@ -585,36 +583,6 @@ bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) {
}
/** Search for an identical block. */
int32_t findSameBlock(const uint32_t *p, int32_t pStart, int32_t length,
const uint32_t *q, int32_t qStart, int32_t blockLength) {
// Ensure that we do not even partially get past length.
length -= blockLength;
q += qStart;
while (pStart <= length) {
if (equalBlocks(p + pStart, q, blockLength)) {
return pStart;
}
++pStart;
}
return -1;
}
int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
const uint32_t *q, int32_t qStart, int32_t blockLength) {
// Ensure that we do not even partially get past length.
length -= blockLength;
q += qStart;
while (pStart <= length) {
if (equalBlocks(p + pStart, q, blockLength)) {
return pStart;
}
++pStart;
}
return -1;
}
int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
const uint16_t *q, int32_t qStart, int32_t blockLength) {
// Ensure that we do not even partially get past length.
@ -655,30 +623,9 @@ int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit,
* Look for maximum overlap of the beginning of the other block
* with the previous, adjacent block.
*/
int32_t getOverlap(const uint32_t *p, int32_t length,
const uint32_t *q, int32_t qStart, int32_t blockLength) {
int32_t overlap = blockLength - 1;
U_ASSERT(overlap <= length);
q += qStart;
while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
--overlap;
}
return overlap;
}
int32_t getOverlap(const uint16_t *p, int32_t length,
const uint32_t *q, int32_t qStart, int32_t blockLength) {
int32_t overlap = blockLength - 1;
U_ASSERT(overlap <= length);
q += qStart;
while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
--overlap;
}
return overlap;
}
int32_t getOverlap(const uint16_t *p, int32_t length,
const uint16_t *q, int32_t qStart, int32_t blockLength) {
template<typename UIntA, typename UIntB>
int32_t getOverlap(const UIntA *p, int32_t length,
const UIntB *q, int32_t qStart, int32_t blockLength) {
int32_t overlap = blockLength - 1;
U_ASSERT(overlap <= length);
q += qStart;
@ -807,6 +754,171 @@ private:
int32_t refCounts[CAPACITY];
};
// Custom hash table for mixed-value blocks to be found anywhere in the
// compacted data or index so far.
//
// Each non-empty table entry packs (partial hash code << shift) | (data index + 1).
// Lookups compare the partial hash first and then the actual block contents.
class MixedBlocks {
public:
    MixedBlocks() {}
    ~MixedBlocks() {
        uprv_free(table);
    }
    // The table is owned through a raw pointer; copying would free it twice.
    MixedBlocks(const MixedBlocks &other) = delete;
    MixedBlocks &operator=(const MixedBlocks &other) = delete;

    /**
     * Prepares an empty table for blocks of newBlockLength values inside
     * an array of at most maxLength values. An existing allocation is reused
     * when it is large enough.
     *
     * @return false if the table could not be allocated; the object then owns
     *         no table and may be re-initialized or destroyed safely.
     */
    bool init(int32_t maxLength, int32_t newBlockLength) {
        // We store actual data indexes + 1 to reserve 0 for empty entries.
        int32_t maxDataIndex = maxLength - newBlockLength + 1;
        int32_t newLength;
        if (maxDataIndex <= 0xfff) {  // 4k
            newLength = 6007;
            shift = 12;
            mask = 0xfff;
        } else if (maxDataIndex <= 0x7fff) {  // 32k
            newLength = 50021;
            shift = 15;
            mask = 0x7fff;
        } else if (maxDataIndex <= 0x1ffff) {  // 128k
            newLength = 200003;
            shift = 17;
            mask = 0x1ffff;
        } else {
            // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M
            newLength = 1500007;
            shift = 21;
            mask = 0x1fffff;
        }
        if (newLength > capacity) {
            uprv_free(table);
            // Forget the released table right away so that a failed allocation
            // cannot leave a stale capacity next to a null table pointer.
            capacity = 0;
            table = (uint32_t *)uprv_malloc(newLength * 4);
            if (table == nullptr) {
                return false;
            }
            capacity = newLength;
        }
        length = newLength;
        uprv_memset(table, 0, length * 4);

        blockLength = newBlockLength;
        return true;
    }

    /**
     * Adds entries for all block start positions that became complete when
     * data grew from prevDataLength to newDataLength values.
     * Positions before minStart are never added.
     */
    template<typename UInt>
    void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) {
        int32_t start = prevDataLength - blockLength;
        if (start >= minStart) {
            ++start;  // Skip the last block that we added last time.
        } else {
            start = minStart;  // Begin with the first full block.
        }
        for (int32_t end = newDataLength - blockLength; start <= end; ++start) {
            uint32_t hashCode = makeHashCode(data, start);
            addEntry(data, start, hashCode, start);
        }
    }

    /**
     * Looks for a registered block in data that equals the block starting at
     * blockData[blockStart].
     *
     * @return the index of the matching block in data, or -1 if there is none.
     */
    template<typename UIntA, typename UIntB>
    int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const {
        uint32_t hashCode = makeHashCode(blockData, blockStart);
        int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode);
        if (entryIndex >= 0) {
            return (table[entryIndex] & mask) - 1;
        } else {
            return -1;
        }
    }

    /**
     * Looks for a registered block in data whose values all equal blockValue.
     *
     * @return the index of the matching block in data, or -1 if there is none.
     */
    int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const {
        uint32_t hashCode = makeHashCode(blockValue);
        int32_t entryIndex = findEntry(data, blockValue, hashCode);
        if (entryIndex >= 0) {
            return (table[entryIndex] & mask) - 1;
        } else {
            return -1;
        }
    }

private:
    // Hashes the blockLength values starting at blockData[blockStart].
    // The do-while always consumes at least two values.
    template<typename UInt>
    uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const {
        int32_t blockLimit = blockStart + blockLength;
        uint32_t hashCode = blockData[blockStart++];
        do {
            hashCode = 37 * hashCode + blockData[blockStart++];
        } while (blockStart < blockLimit);
        return hashCode;
    }

    // Hashes a block of blockLength copies of blockValue,
    // using the same recurrence as the array overload.
    uint32_t makeHashCode(uint32_t blockValue) const {
        uint32_t hashCode = blockValue;
        for (int32_t i = 1; i < blockLength; ++i) {
            hashCode = 37 * hashCode + blockValue;
        }
        return hashCode;
    }

    // Registers the block at data[blockStart] under dataIndex
    // unless an equal block is already in the table.
    template<typename UInt>
    void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) {
        U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask);
        int32_t entryIndex = findEntry(data, data, blockStart, hashCode);
        if (entryIndex < 0) {
            // ~entryIndex is the empty slot where the probe sequence ended.
            table[~entryIndex] = (hashCode << shift) | (dataIndex + 1);
        }
    }

    // Returns the index of the entry whose block in data equals the block at
    // blockData[blockStart], or the bitwise complement (a negative number)
    // of the first empty slot on the probe sequence.
    template<typename UIntA, typename UIntB>
    int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart,
                      uint32_t hashCode) const {
        uint32_t shiftedHashCode = hashCode << shift;
        int32_t initialEntryIndex = (hashCode % (length - 1)) + 1;  // 1..length-1
        for (int32_t entryIndex = initialEntryIndex;;) {
            uint32_t entry = table[entryIndex];
            if (entry == 0) {
                return ~entryIndex;
            }
            // Compare the stored partial hash before the more expensive block comparison.
            if ((entry & ~mask) == shiftedHashCode) {
                int32_t dataIndex = (entry & mask) - 1;
                if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) {
                    return entryIndex;
                }
            }
            entryIndex = nextIndex(initialEntryIndex, entryIndex);
        }
    }

    // Same as the block overload, but matches a block in data
    // whose blockLength values all equal blockValue.
    int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const {
        uint32_t shiftedHashCode = hashCode << shift;
        int32_t initialEntryIndex = (hashCode % (length - 1)) + 1;  // 1..length-1
        for (int32_t entryIndex = initialEntryIndex;;) {
            uint32_t entry = table[entryIndex];
            if (entry == 0) {
                return ~entryIndex;
            }
            if ((entry & ~mask) == shiftedHashCode) {
                int32_t dataIndex = (entry & mask) - 1;
                if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) {
                    return entryIndex;
                }
            }
            entryIndex = nextIndex(initialEntryIndex, entryIndex);
        }
    }

    // Next slot on the probe sequence: advance by initialEntryIndex, wrapping at length.
    inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const {
        // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length);
        return (entryIndex + initialEntryIndex) % length;
    }

    // Hash table.
    // The length is a prime number, larger than the maximum data length.
    // The "shift" lower bits store a data index + 1.
    // The remaining upper bits store a partial hashCode of the block data values.
    uint32_t *table = nullptr;
    int32_t capacity = 0;
    int32_t length = 0;
    int32_t shift = 0;
    uint32_t mask = 0;

    int32_t blockLength = 0;
};
int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) {
#ifdef UCPTRIE_DEBUG
bool overflow = false;
@ -962,8 +1074,9 @@ void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value,
*
* It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
*/
int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
uint32_t *newData, int32_t dataNullIndex) {
int32_t MutableCodePointTrie::compactData(
int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) {
#ifdef UCPTRIE_DEBUG
int32_t countSame=0, sumOverlaps=0;
bool printData = dataLength == 29088 /* line.brk */ ||
@ -983,8 +1096,14 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
#endif
}
int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
if (!mixedBlocks.init(newDataCapacity, blockLength)) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
mixedBlocks.extend(newData, 0, 0, newDataLength);
int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
int32_t fastLength = 0;
for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) {
@ -992,12 +1111,17 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
inc = 1;
fastLength = newDataLength;
if (!mixedBlocks.init(newDataCapacity, blockLength)) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
mixedBlocks.extend(newData, 0, 0, newDataLength);
}
if (flags[i] == ALL_SAME) {
uint32_t value = index[i];
int32_t n;
// Find an earlier part of the data array of length blockLength
// that is filled with this value.
int32_t n = mixedBlocks.findAllSameBlock(newData, value);
// If we find a match, and the current block is the data null block,
// and it is not a fast block but matches the start of a fast block,
// then we need to continue looking.
@ -1005,12 +1129,10 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
// and not all of the rest of the fast block is filled with this value.
// Otherwise trie.getRange() would detect that the fast block starts at
// dataNullOffset and assume incorrectly that it is filled with the null value.
for (int32_t start = 0;
(n = findAllSameBlock(newData, start, newDataLength,
value, blockLength)) >= 0 &&
i == dataNullIndex && i >= fastILimit && n < fastLength &&
isStartOfSomeFastBlock(n, index, fastILimit);
start = n + 1) {}
while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength &&
isStartOfSomeFastBlock(n, index, fastILimit)) {
n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength);
}
if (n >= 0) {
DEBUG_DO(++countSame);
index[i] = n;
@ -1023,14 +1145,16 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
}
#endif
index[i] = newDataLength - n;
int32_t prevDataLength = newDataLength;
while (n < blockLength) {
newData[newDataLength++] = value;
++n;
}
mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
}
} else if (flags[i] == MIXED) {
const uint32_t *block = data + index[i];
int32_t n = findSameBlock(newData, 0, newDataLength, block, 0, blockLength);
int32_t n = mixedBlocks.findBlock(newData, block, 0);
if (n >= 0) {
DEBUG_DO(++countSame);
index[i] = n;
@ -1043,9 +1167,11 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
}
#endif
index[i] = newDataLength - n;
int32_t prevDataLength = newDataLength;
while (n < blockLength) {
newData[newDataLength++] = block[n++];
}
mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
}
} else /* SAME_AS */ {
uint32_t j = index[i];
@ -1061,7 +1187,8 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
return newDataLength;
}
int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &errorCode) {
int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks,
UErrorCode &errorCode) {
int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3);
if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) {
// Only the linear fast index, no multi-stage index tables.
@ -1095,6 +1222,12 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
}
}
if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength);
// Examine index-3 blocks. For each determine one of:
// - same as the index-3 null block
// - same as a fast-index block
@ -1105,6 +1238,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
// Also determine an upper limit for the index-3 table length.
int32_t index3Capacity = 0;
i3FirstNull = index3NullOffset;
bool hasLongI3Blocks = false;
// If the fast index covers the whole BMP, then
// the multi-stage index is only for supplementary code points.
// Otherwise, the multi-stage index covers all of Unicode.
@ -1129,13 +1263,13 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
} else {
index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
hasLongI3Blocks = true;
}
i3FirstNull = 0;
}
} else {
if (oredI3 <= 0xffff) {
int32_t n = findSameBlock(fastIndex, 0, fastIndexLength,
index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
int32_t n = mixedBlocks.findBlock(fastIndex, index, i);
if (n >= 0) {
flags[i] = I3_BMP;
index[i] = n;
@ -1146,6 +1280,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
} else {
flags[i] = I3_18;
index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
hasLongI3Blocks = true;
}
}
i = j;
@ -1166,6 +1301,18 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
}
uprv_memcpy(index16, fastIndex, fastIndexLength * 2);
if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
MixedBlocks longI3Blocks;
if (hasLongI3Blocks) {
if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
}
// Compact the index-3 table and write an uncompacted version of the index-2 table.
uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity
int32_t i2Length = 0;
@ -1185,8 +1332,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
} else if (f == I3_BMP) {
i3 = index[i];
} else if (f == I3_16) {
int32_t n = findSameBlock(index16, index3Start, indexLength,
index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
int32_t n = mixedBlocks.findBlock(index16, index, i);
if (n >= 0) {
i3 = n;
} else {
@ -1198,12 +1344,18 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
}
i3 = indexLength - n;
int32_t prevIndexLength = indexLength;
while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) {
index16[indexLength++] = index[i + n++];
}
mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
if (hasLongI3Blocks) {
longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
}
}
} else {
U_ASSERT(f == I3_18);
U_ASSERT(hasLongI3Blocks);
// Encode an index-3 block that contains one or more data indexes exceeding 16 bits.
int32_t j = i;
int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
@ -1236,8 +1388,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
index16[k++] = v;
index16[k - 9] = upperBits;
} while (j < jLimit);
int32_t n = findSameBlock(index16, index3Start, indexLength,
index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
int32_t n = longI3Blocks.findBlock(index16, index16, indexLength);
if (n >= 0) {
i3 = n | 0x8000;
} else {
@ -1249,6 +1400,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
}
i3 = (indexLength - n) | 0x8000;
int32_t prevIndexLength = indexLength;
if (n > 0) {
int32_t start = indexLength;
while (n < INDEX_3_18BIT_BLOCK_LENGTH) {
@ -1257,6 +1409,10 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
} else {
indexLength += INDEX_3_18BIT_BLOCK_LENGTH;
}
mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
if (hasLongI3Blocks) {
longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
}
}
}
if (index3NullOffset < 0 && i3FirstNull >= 0) {
@ -1279,16 +1435,23 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
}
// Compact the index-2 table and write the index-1 table.
static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH,
"must re-init mixedBlocks");
int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH;
int32_t i1 = fastIndexLength;
for (int32_t i = 0; i < i2Length; i += blockLength) {
if ((i2Length - i) < blockLength) {
int32_t n;
if ((i2Length - i) >= blockLength) {
// normal block
U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH);
n = mixedBlocks.findBlock(index16, index2, i);
} else {
// highStart is inside the last index-2 block. Shorten it.
blockLength = i2Length - i;
n = findSameBlock(index16, index3Start, indexLength,
index2, i, blockLength);
}
int32_t i2;
int32_t n = findSameBlock(index16, index3Start, indexLength,
index2, i, blockLength);
if (n >= 0) {
i2 = n;
} else {
@ -1299,9 +1462,11 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
n = getOverlap(index16, indexLength, index2, i, blockLength);
}
i2 = indexLength - n;
int32_t prevIndexLength = indexLength;
while (n < blockLength) {
index16[indexLength++] = index2[i + n++];
}
mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
}
// Set the index-1 table entry.
index16[i1++] = i2;
@ -1369,7 +1534,11 @@ int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorC
uprv_memcpy(newData, asciiData, sizeof(asciiData));
int32_t dataNullIndex = allSameBlocks.findMostUsed();
int32_t newDataLength = compactData(fastILimit, newData, dataNullIndex);
MixedBlocks mixedBlocks;
int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity,
dataNullIndex, mixedBlocks, errorCode);
if (U_FAILURE(errorCode)) { return 0; }
U_ASSERT(newDataLength <= newDataCapacity);
uprv_free(data);
data = newData;
@ -1394,7 +1563,7 @@ int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorC
dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET;
}
int32_t indexLength = compactIndex(fastILimit, errorCode);
int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode);
highStart = realHighStart;
return indexLength;
}

View File

@ -27,7 +27,6 @@ U_NAMESPACE_BEGIN
// Forward Declarations.
class BMPSet;
class CharacterProperties;
class ParsePosition;
class RBBIRuleScanner;
class SymbolTable;
@ -276,14 +275,23 @@ class RuleCharacterIterator;
* @stable ICU 2.0
*/
class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
private:
/**
* Enough for sets with few ranges.
* For example, White_Space has 10 ranges, list length 21.
*/
static constexpr int32_t INITIAL_CAPACITY = 25;
// fFlags constant
static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
int32_t len; // length of list used; 0 <= len <= capacity
int32_t capacity; // capacity of list
UChar32* list; // MUST be terminated with HIGH
BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
UChar32* buffer; // internal buffer, may be NULL
int32_t bufferCapacity; // capacity of buffer
int32_t patLen;
UChar32* list = stackList; // MUST be terminated with HIGH
int32_t capacity = INITIAL_CAPACITY; // capacity of list
int32_t len = 1; // length of list used; 1 <= len <= capacity
uint8_t fFlags = 0; // Bit flag (see constants above)
BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL.
UChar32* buffer = nullptr; // internal buffer, may be NULL
int32_t bufferCapacity = 0; // capacity of buffer
/**
* The pattern representation of this set. This may not be the
@ -294,15 +302,19 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
* indicating that toPattern() must generate a pattern
* representation from the inversion list.
*/
char16_t *pat;
UVector* strings; // maintained in sorted order
UnicodeSetStringSpan *stringSpan;
char16_t *pat = nullptr;
int32_t patLen = 0;
UVector* strings = nullptr; // maintained in sorted order
UnicodeSetStringSpan *stringSpan = nullptr;
/**
* Initial list array.
* Avoids some heap allocations, and list is never nullptr.
* Increases the object size a bit.
*/
UChar32 stackList[INITIAL_CAPACITY];
private:
enum { // constants
kIsBogus = 1 // This set is bogus (i.e. not valid)
};
uint8_t fFlags; // Bit flag (see constants above)
public:
/**
* Determine if this object contains a valid set.
@ -1480,8 +1492,6 @@ private:
friend class USetAccess;
int32_t getStringCount() const;
const UnicodeString* getString(int32_t index) const;
//----------------------------------------------------------------
@ -1528,13 +1538,18 @@ private:
// Implementation: Utility methods
//----------------------------------------------------------------
void ensureCapacity(int32_t newLen, UErrorCode& ec);
static int32_t nextCapacity(int32_t minCapacity);
void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
bool ensureCapacity(int32_t newLen);
bool ensureBufferCapacity(int32_t newLen);
void swapBuffers(void);
UBool allocateStrings(UErrorCode &status);
UBool hasStrings() const;
int32_t stringsSize() const;
UBool stringsContains(const UnicodeString &s) const;
UnicodeString& _toPattern(UnicodeString& result,
UBool escapeUnprintable) const;
@ -1614,7 +1629,6 @@ private:
UnicodeString& rebuiltPat,
UErrorCode& ec);
friend class CharacterProperties;
static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
/**
@ -1646,7 +1660,10 @@ private:
/**
* Set the new pattern to cache.
*/
void setPattern(const UnicodeString& newPat);
/**
 * Convenience overload: caches the pattern held in a UnicodeString by
 * forwarding its character buffer and length to the pointer/length overload.
 * @param newPat the pattern text to cache
 */
void setPattern(const UnicodeString& newPat) {
    const char16_t *patChars = newPat.getBuffer();
    setPattern(patChars, newPat.length());
}
void setPattern(const char16_t *newPat, int32_t newPatLen);
/**
* Release existing cached pattern.
*/

View File

@ -14,6 +14,7 @@
#include "unicode/parsepos.h"
#include "unicode/symtable.h"
#include "unicode/uniset.h"
#include "unicode/ustring.h"
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "ruleiter.h"
@ -53,11 +54,8 @@
// LOW <= all valid values. ZERO for codepoints
#define UNICODESET_LOW 0x000000
// initial storage. Must be >= 0
#define START_EXTRA 16
// extra amount for growth. Must be >= 0
#define GROW_EXTRA START_EXTRA
/** Max list [0, 1, 2, ..., max code point, HIGH] */
constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
U_NAMESPACE_BEGIN
@ -137,6 +135,18 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
return a.compare(b);
}
/**
 * Tells whether this set currently holds at least one string element.
 * A null strings vector is treated the same as an empty one.
 * @return TRUE if the strings vector exists and is not empty
 */
UBool UnicodeSet::hasStrings() const {
    if (strings == nullptr) {
        return FALSE;
    }
    return !strings->isEmpty();
}
/**
 * Returns the number of string elements in this set.
 * @return the size of the strings vector, or 0 if it has not been allocated
 */
int32_t UnicodeSet::stringsSize() const {
    if (strings != nullptr) {
        return strings->size();
    }
    return 0;
}
/**
 * Null-safe membership test against the strings vector.
 * @param s the string to look for
 * @return TRUE if the strings vector exists and reports that it contains s
 */
UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
    if (strings == nullptr) {
        return FALSE;
    }
    // Normalize to TRUE/FALSE exactly as a boolean expression would.
    return strings->contains((void*) &s) ? TRUE : FALSE;
}
//----------------------------------------------------------------
// Constructors &c
//----------------------------------------------------------------
@ -144,24 +154,8 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
/**
* Constructs an empty set.
*/
UnicodeSet::UnicodeSet() :
len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
fFlags(0)
{
UErrorCode status = U_ZERO_ERROR;
allocateStrings(status);
if (U_FAILURE(status)) {
setToBogus(); // If memory allocation failed, set to bogus state.
return;
}
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
if(list!=NULL){
list[0] = UNICODESET_HIGH;
} else { // If memory allocation failed, set to bogus state.
setToBogus();
return;
}
UnicodeSet::UnicodeSet() {
list[0] = UNICODESET_HIGH;
_dbgct(this);
}
@ -172,89 +166,39 @@ UnicodeSet::UnicodeSet() :
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
*/
UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
fFlags(0)
{
UErrorCode status = U_ZERO_ERROR;
allocateStrings(status);
if (U_FAILURE(status)) {
setToBogus(); // If memory allocation failed, set to bogus state.
return;
}
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
if(list!=NULL){
list[0] = UNICODESET_HIGH;
complement(start, end);
} else { // If memory allocation failed, set to bogus state.
setToBogus();
return;
}
UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
list[0] = UNICODESET_HIGH;
add(start, end);
_dbgct(this);
}
/**
* Constructs a set that is identical to the given UnicodeSet.
*/
UnicodeSet::UnicodeSet(const UnicodeSet& o) :
UnicodeFilter(o),
len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
bmpSet(0),
buffer(0), bufferCapacity(0),
patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
fFlags(0)
{
UErrorCode status = U_ZERO_ERROR;
allocateStrings(status);
if (U_FAILURE(status)) {
setToBogus(); // If memory allocation failed, set to bogus state.
return;
}
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
if(list!=NULL){
*this = o;
} else { // If memory allocation failed, set to bogus state.
setToBogus();
return;
}
UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
*this = o;
_dbgct(this);
}
// Copy-construct as thawed.
UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
UnicodeFilter(o),
len(0), capacity(o.len + GROW_EXTRA), list(0),
bmpSet(0),
buffer(0), bufferCapacity(0),
patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
fFlags(0)
{
UErrorCode status = U_ZERO_ERROR;
allocateStrings(status);
if (U_FAILURE(status)) {
setToBogus(); // If memory allocation failed, set to bogus state.
return;
}
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
if(list!=NULL){
UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
if (ensureCapacity(o.len)) {
// *this = o except for bmpSet and stringSpan
len = o.len;
uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
if (strings != NULL && o.strings != NULL) {
strings->assign(*o.strings, cloneUnicodeString, status);
} else { // Invalid strings.
setToBogus();
return;
if (o.hasStrings()) {
UErrorCode status = U_ZERO_ERROR;
if (!allocateStrings(status) ||
(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
setToBogus();
return;
}
}
if (o.pat) {
setPattern(UnicodeString(o.pat, o.patLen));
setPattern(o.pat, o.patLen);
}
} else { // If memory allocation failed, set to bogus state.
setToBogus();
return;
_dbgct(this);
}
_dbgct(this);
}
/**
@ -262,9 +206,11 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
*/
UnicodeSet::~UnicodeSet() {
_dbgdt(this); // first!
uprv_free(list);
if (list != stackList) {
uprv_free(list);
}
delete bmpSet;
if (buffer) {
if (buffer != stackList) {
uprv_free(buffer);
}
delete strings;
@ -290,32 +236,30 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
setToBogus();
return *this;
}
UErrorCode ec = U_ZERO_ERROR;
ensureCapacity(o.len, ec);
if (U_FAILURE(ec)) {
if (!ensureCapacity(o.len)) {
// ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
return *this;
}
len = o.len;
uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
if (o.bmpSet == NULL || asThawed) {
bmpSet = NULL;
} else {
if (o.bmpSet != nullptr && !asThawed) {
bmpSet = new BMPSet(*o.bmpSet, list, len);
if (bmpSet == NULL) { // Check for memory allocation error.
setToBogus();
return *this;
}
}
if (strings != NULL && o.strings != NULL) {
strings->assign(*o.strings, cloneUnicodeString, ec);
} else { // Invalid strings.
setToBogus();
return *this;
if (o.hasStrings()) {
UErrorCode status = U_ZERO_ERROR;
if ((strings == nullptr && !allocateStrings(status)) ||
(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
setToBogus();
return *this;
}
} else if (hasStrings()) {
strings->removeAllElements();
}
if (o.stringSpan == NULL || asThawed) {
stringSpan = NULL;
} else {
if (o.stringSpan != nullptr && !asThawed) {
stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
if (stringSpan == NULL) { // Check for memory allocation error.
setToBogus();
@ -324,7 +268,7 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
}
releasePattern();
if (o.pat) {
setPattern(UnicodeString(o.pat, o.patLen));
setPattern(o.pat, o.patLen);
}
return *this;
}
@ -357,7 +301,8 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const {
for (int32_t i = 0; i < len; ++i) {
if (list[i] != o.list[i]) return FALSE;
}
if (*strings != *o.strings) return FALSE;
if (hasStrings() != o.hasStrings()) { return FALSE; }
if (hasStrings() && *strings != *o.strings) return FALSE;
return TRUE;
}
@ -393,7 +338,7 @@ int32_t UnicodeSet::size(void) const {
for (int32_t i = 0; i < count; ++i) {
n += getRangeEnd(i) - getRangeStart(i) + 1;
}
return n + strings->size();
return n + stringsSize();
}
/**
@ -402,7 +347,7 @@ int32_t UnicodeSet::size(void) const {
* @return <tt>true</tt> if this set contains no elements.
*/
UBool UnicodeSet::isEmpty(void) const {
return len == 1 && strings->size() == 0;
return len == 1 && !hasStrings();
}
/**
@ -502,7 +447,7 @@ UBool UnicodeSet::contains(const UnicodeString& s) const {
if (s.length() == 0) return FALSE;
int32_t cp = getSingleCP(s);
if (cp < 0) {
return strings->contains((void*) &s);
return stringsContains(s);
} else {
return contains((UChar32) cp);
}
@ -524,8 +469,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
return FALSE;
}
}
if (!strings->containsAll(*c.strings)) return FALSE;
return TRUE;
return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
}
/**
@ -571,8 +515,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
return FALSE;
}
}
if (!strings->containsNone(*c.strings)) return FALSE;
return TRUE;
return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
}
/**
@ -613,7 +556,7 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
return TRUE;
}
}
if (strings->size() != 0) {
if (hasStrings()) {
for (i=0; i<strings->size(); ++i) {
const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
//if (s.length() == 0) {
@ -648,7 +591,7 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
return U_MISMATCH;
}
} else {
if (strings->size() != 0) { // try strings first
if (hasStrings()) { // try strings first
// might separate forward and backward loops later
// for now they are combined
@ -849,7 +792,39 @@ UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
*/
UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
if (pinCodePoint(start) < pinCodePoint(end)) {
UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
UChar32 limit = end + 1;
// Fast path for adding a new range after the last one.
// Odd list length: [..., lastStart, lastLimit, HIGH]
if ((len & 1) != 0) {
// If the list is empty, set lastLimit low enough to not be adjacent to 0.
UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
if (lastLimit <= start && !isFrozen() && !isBogus()) {
if (lastLimit == start) {
// Extend the last range.
list[len - 2] = limit;
if (limit == UNICODESET_HIGH) {
--len;
}
} else {
list[len - 1] = start;
if (limit < UNICODESET_HIGH) {
if (ensureCapacity(len + 2)) {
list[len++] = limit;
list[len++] = UNICODESET_HIGH;
}
} else { // limit == UNICODESET_HIGH
if (ensureCapacity(len + 1)) {
list[len++] = UNICODESET_HIGH;
}
}
}
releasePattern();
return *this;
}
}
// This is slow. Could be much faster using findCodePoint(start)
// and modifying the list, dealing with adjacent & overlapping ranges.
UChar32 range[3] = { start, limit, UNICODESET_HIGH };
add(range, 2, 0);
} else if (start == end) {
add(start);
@ -918,9 +893,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
list[i] = c;
// if we touched the HIGH mark, then add a new one
if (c == (UNICODESET_HIGH - 1)) {
UErrorCode status = U_ZERO_ERROR;
ensureCapacity(len+1, status);
if (U_FAILURE(status)) {
if (!ensureCapacity(len+1)) {
// ensureCapacity will mark the object as Bogus if OOM failure happens.
return *this;
}
@ -964,21 +937,13 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
// ^
// list[i]
UErrorCode status = U_ZERO_ERROR;
ensureCapacity(len+2, status);
if (U_FAILURE(status)) {
if (!ensureCapacity(len+2)) {
// ensureCapacity will mark the object as Bogus if OOM failure happens.
return *this;
}
//for (int32_t k=len-1; k>=i; --k) {
// list[k+2] = list[k];
//}
UChar32* src = list + len;
UChar32* dst = src + 2;
UChar32* srclimit = list + i;
while (src > srclimit) *(--dst) = *(--src);
UChar32 *p = list + i;
uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
list[i] = c;
list[i+1] = c+1;
len += 2;
@ -1014,7 +979,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
if (s.length() == 0 || isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (!strings->contains((void*) &s)) {
if (!stringsContains(s)) {
_add(s);
releasePattern();
}
@ -1033,12 +998,16 @@ void UnicodeSet::_add(const UnicodeString& s) {
if (isFrozen() || isBogus()) {
return;
}
UErrorCode ec = U_ZERO_ERROR;
if (strings == nullptr && !allocateStrings(ec)) {
setToBogus();
return;
}
UnicodeString* t = new UnicodeString(s);
if (t == NULL) { // Check for memory allocation error.
setToBogus();
return;
}
UErrorCode ec = U_ZERO_ERROR;
strings->sortedInsert(t, compareUnicodeString, ec);
if (U_FAILURE(ec)) {
setToBogus();
@ -1121,7 +1090,10 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
}
UnicodeSet& UnicodeSet::removeAllStrings() {
strings->removeAllElements();
if (!isFrozen() && hasStrings()) {
strings->removeAllElements();
releasePattern();
}
return *this;
}
@ -1217,8 +1189,9 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
if (s.length() == 0 || isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
strings->removeElement((void*) &s);
releasePattern();
if (strings != nullptr && strings->removeElement((void*) &s)) {
releasePattern();
}
} else {
remove((UChar32)cp, (UChar32)cp);
}
@ -1260,24 +1233,17 @@ UnicodeSet& UnicodeSet::complement(void) {
if (isFrozen() || isBogus()) {
return *this;
}
UErrorCode status = U_ZERO_ERROR;
if (list[0] == UNICODESET_LOW) {
ensureBufferCapacity(len-1, status);
if (U_FAILURE(status)) {
return *this;
}
uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32));
uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
--len;
} else {
ensureBufferCapacity(len+1, status);
if (U_FAILURE(status)) {
if (!ensureCapacity(len+1)) {
return *this;
}
uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32));
buffer[0] = UNICODESET_LOW;
uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
list[0] = UNICODESET_LOW;
++len;
}
swapBuffers();
releasePattern();
return *this;
}
@ -1294,7 +1260,7 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
if (s.length() == 0 || isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (strings->contains((void*) &s)) {
if (stringsContains(s)) {
strings->removeElement((void*) &s);
} else {
_add(s);
@ -1325,7 +1291,7 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
if ( c.strings!=NULL ) {
for (int32_t i=0; i<c.strings->size(); ++i) {
const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
if (!strings->contains((void*) s)) {
if (!stringsContains(*s)) {
_add(*s);
}
}
@ -1347,7 +1313,13 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
return *this;
}
retain(c.list, c.len, 0);
strings->retainAll(*c.strings);
if (hasStrings()) {
if (!c.hasStrings()) {
strings->removeAllElements();
} else {
strings->retainAll(*c.strings);
}
}
return *this;
}
@ -1365,7 +1337,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
return *this;
}
retain(c.list, c.len, 2);
strings->removeAll(*c.strings);
if (hasStrings() && c.hasStrings()) {
strings->removeAll(*c.strings);
}
return *this;
}
@ -1383,10 +1357,12 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
}
exclusiveOr(c.list, c.len, 0);
for (int32_t i=0; i<c.strings->size(); ++i) {
void* e = c.strings->elementAt(i);
if (!strings->removeElement(e)) {
_add(*(const UnicodeString*)e);
if (c.strings != nullptr) {
for (int32_t i=0; i<c.strings->size(); ++i) {
void* e = c.strings->elementAt(i);
if (strings == nullptr || !strings->removeElement(e)) {
_add(*(const UnicodeString*)e);
}
}
}
return *this;
@ -1400,18 +1376,14 @@ UnicodeSet& UnicodeSet::clear(void) {
if (isFrozen()) {
return *this;
}
if (list != NULL) {
list[0] = UNICODESET_HIGH;
}
list[0] = UNICODESET_HIGH;
len = 1;
releasePattern();
if (strings != NULL) {
strings->removeAllElements();
}
if (list != NULL && strings != NULL) {
// Remove bogus
fFlags = 0;
}
// Remove bogus
fFlags = 0;
return *this;
}
@ -1445,10 +1417,6 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
return list[index*2 + 1] - 1;
}
int32_t UnicodeSet::getStringCount() const {
return strings->size();
}
/**
 * Returns the string element stored at the given position of the strings vector.
 * NOTE(review): strings is dereferenced without a null check; presumably
 * callers only ask for indexes below the string count -- confirm.
 * @param index position within the strings vector
 * @return the element at that position, viewed as a UnicodeString
 */
const UnicodeString* UnicodeSet::getString(int32_t index) const {
    void *element = strings->elementAt(index);
    return static_cast<const UnicodeString*>(element);
}
@ -1462,22 +1430,32 @@ UnicodeSet& UnicodeSet::compact() {
return *this;
}
// Delete buffer first to defragment memory less.
if (buffer != NULL) {
if (buffer != stackList) {
uprv_free(buffer);
buffer = NULL;
bufferCapacity = 0;
}
if (len < capacity) {
// Make the capacity equal to len or 1.
// We don't want to realloc of 0 size.
int32_t newCapacity = len + (len == 0);
UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
if (list == stackList) {
// pass
} else if (len <= INITIAL_CAPACITY) {
uprv_memcpy(stackList, list, len * sizeof(UChar32));
uprv_free(list);
list = stackList;
capacity = INITIAL_CAPACITY;
} else if ((len + 7) < capacity) {
// If we have more than a little unused capacity, shrink it to len.
UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);
if (temp) {
list = temp;
capacity = newCapacity;
capacity = len;
}
// else what the heck happened?! We allocated less memory!
// Oh well. We'll keep our original array.
}
if (strings != nullptr && strings->isEmpty()) {
delete strings;
strings = nullptr;
}
return *this;
}
@ -1488,10 +1466,8 @@ UnicodeSet& UnicodeSet::compact() {
/**
* Deserialize constructor.
*/
UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
: len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
fFlags(0) {
UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
UErrorCode &ec) {
if(U_FAILURE(ec)) {
setToBogus();
@ -1506,24 +1482,15 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se
return;
}
allocateStrings(ec);
if (U_FAILURE(ec)) {
setToBogus();
return;
}
// bmp?
int32_t headerSize = ((data[0]&0x8000)) ?2:1;
int32_t bmpLength = (headerSize==1)?data[0]:data[1];
len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
#ifdef DEBUG_SERIALIZE
printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
#endif
capacity = len+1;
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
if(!list || U_FAILURE(ec)) {
setToBogus();
if(!ensureCapacity(newLength + 1)) { // +1 for HIGH
return;
}
// copy bmp
@ -1535,15 +1502,18 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se
#endif
}
// copy smp
for(i=bmpLength;i<len;i++) {
for(i=bmpLength;i<newLength;i++) {
list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
#ifdef DEBUG_SERIALIZE
printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
#endif
}
// terminator
list[len++]=UNICODESET_HIGH;
U_ASSERT(i == newLength);
if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
list[i++] = UNICODESET_HIGH;
}
len = i;
}
@ -1664,33 +1634,65 @@ UBool UnicodeSet::allocateStrings(UErrorCode &status) {
return TRUE;
}
void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
if (newLen <= capacity) {
return;
int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
// Grow exponentially to reduce the frequency of allocations.
if (minCapacity < INITIAL_CAPACITY) {
return minCapacity + INITIAL_CAPACITY;
} else if (minCapacity <= 2500) {
return 5 * minCapacity;
} else {
int32_t newCapacity = 2 * minCapacity;
if (newCapacity > MAX_LENGTH) {
newCapacity = MAX_LENGTH;
}
return newCapacity;
}
UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
if (temp == NULL) {
ec = U_MEMORY_ALLOCATION_ERROR;
setToBogus(); // set the object to bogus state if an OOM failure occurred.
return;
}
list = temp;
capacity = newLen + GROW_EXTRA;
// else we keep the original contents on the memory failure.
}
void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
if (buffer != NULL && newLen <= bufferCapacity)
return;
UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
/**
 * Makes sure the inversion list can hold at least newLen entries.
 * Requests above MAX_LENGTH are clamped to MAX_LENGTH. When the list has to
 * grow, a new array sized by nextCapacity() is allocated, the len entries
 * currently in use are copied over, and the previous array is released
 * unless it is the embedded stackList.
 *
 * @param newLen required number of list entries
 * @return true if the capacity is sufficient afterwards; false if the
 *         allocation failed, in which case the set has been marked bogus
 */
bool UnicodeSet::ensureCapacity(int32_t newLen) {
    const int32_t required = newLen > MAX_LENGTH ? MAX_LENGTH : newLen;
    if (required <= capacity) {
        return true;  // already large enough
    }
    const int32_t grownCapacity = nextCapacity(required);
    UChar32 *grown = static_cast<UChar32 *>(uprv_malloc(grownCapacity * sizeof(UChar32)));
    if (grown == nullptr) {
        // Out of memory: flag the object as bogus and report failure.
        setToBogus();
        return false;
    }
    // Only the entries in use need to be carried over.
    uprv_memcpy(grown, list, len * sizeof(UChar32));
    UChar32 *previous = list;
    list = grown;
    capacity = grownCapacity;
    // The embedded array is part of the object and must never be freed.
    if (previous != stackList) {
        uprv_free(previous);
    }
    return true;
}
bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
if (newLen > MAX_LENGTH) {
newLen = MAX_LENGTH;
}
if (newLen <= bufferCapacity) {
return true;
}
int32_t newCapacity = nextCapacity(newLen);
UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
if (temp == NULL) {
ec = U_MEMORY_ALLOCATION_ERROR;
setToBogus();
return;
return false;
}
// The buffer has no contents to be copied.
// It is always filled from scratch after this call.
if (buffer != stackList) {
uprv_free(buffer);
}
buffer = temp;
bufferCapacity = newLen + GROW_EXTRA;
// else we keep the original contents on the memory failure.
bufferCapacity = newCapacity;
return true;
}
/**
@ -1727,9 +1729,7 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola
if (isFrozen() || isBogus()) {
return;
}
UErrorCode status = U_ZERO_ERROR;
ensureBufferCapacity(len + otherLen, status);
if (U_FAILURE(status)) {
if (!ensureBufferCapacity(len + otherLen)) {
return;
}
@ -1777,9 +1777,7 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
if (isFrozen() || isBogus() || other==NULL) {
return;
}
UErrorCode status = U_ZERO_ERROR;
ensureBufferCapacity(len + otherLen, status);
if (U_FAILURE(status)) {
if (!ensureBufferCapacity(len + otherLen)) {
return;
}
@ -1890,9 +1888,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
if (isFrozen() || isBogus()) {
return;
}
UErrorCode status = U_ZERO_ERROR;
ensureBufferCapacity(len + otherLen, status);
if (U_FAILURE(status)) {
if (!ensureBufferCapacity(len + otherLen)) {
return;
}
@ -2138,12 +2134,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
}
}
for (int32_t i = 0; i<strings->size(); ++i) {
result.append(OPEN_BRACE);
_appendToPat(result,
*(const UnicodeString*) strings->elementAt(i),
escapeUnprintable);
result.append(CLOSE_BRACE);
if (strings != nullptr) {
for (int32_t i = 0; i<strings->size(); ++i) {
result.append(OPEN_BRACE);
_appendToPat(result,
*(const UnicodeString*) strings->elementAt(i),
escapeUnprintable);
result.append(CLOSE_BRACE);
}
}
return result.append(SET_CLOSE);
}
@ -2162,13 +2160,12 @@ void UnicodeSet::releasePattern() {
/**
* Set the new pattern to cache.
*/
void UnicodeSet::setPattern(const UnicodeString& newPat) {
void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
releasePattern();
int32_t newPatLen = newPat.length();
pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
if (pat) {
patLen = newPatLen;
newPat.extractBetween(0, patLen, pat);
u_memcpy(pat, newPat, patLen);
pat[patLen] = 0;
}
// else we don't care if malloc failed. This was just a nice cache.
@ -2177,30 +2174,15 @@ void UnicodeSet::setPattern(const UnicodeString& newPat) {
UnicodeFunctor *UnicodeSet::freeze() {
if(!isFrozen() && !isBogus()) {
// Do most of what compact() does before freezing because
// compact() will not work when the set is frozen.
// Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
// Delete buffer first to defragment memory less.
if (buffer != NULL) {
uprv_free(buffer);
buffer = NULL;
}
if (capacity > (len + GROW_EXTRA)) {
// Make the capacity equal to len or 1.
// We don't want to realloc of 0 size.
capacity = len + (len == 0);
list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
if (list == NULL) { // Check for memory allocation error.
setToBogus();
return this;
}
}
compact();
// Optimize contains() and span() and similar functions.
if (!strings->isEmpty()) {
if (hasStrings()) {
stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
if (stringSpan == nullptr) {
setToBogus();
return this;
} else if (!stringSpan->needsStringSpanUTF16()) {
// All strings are irrelevant for span() etc. because
// all of each string's code points are contained in this set.
// Do not check needsStringSpanUTF8() because UTF-8 has at most as
@ -2233,7 +2215,7 @@ int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanC
}
if(stringSpan!=NULL) {
return stringSpan->span(s, length, spanCondition);
} else if(!strings->isEmpty()) {
} else if(hasStrings()) {
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
@ -2270,7 +2252,7 @@ int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition s
}
if(stringSpan!=NULL) {
return stringSpan->spanBack(s, length, spanCondition);
} else if(!strings->isEmpty()) {
} else if(hasStrings()) {
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
@ -2308,7 +2290,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp
}
if(stringSpan!=NULL) {
return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
} else if(!strings->isEmpty()) {
} else if(hasStrings()) {
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
@ -2346,7 +2328,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio
}
if(stringSpan!=NULL) {
return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
} else if(!strings->isEmpty()) {
} else if(hasStrings()) {
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
UnicodeSetStringSpan::BACK_UTF8_CONTAINED;

View File

@ -31,10 +31,6 @@
#include "util.h"
#include "uvector.h"
// initial storage. Must be >= 0
// *** same as in uniset.cpp ! ***
#define START_EXTRA 16
U_NAMESPACE_BEGIN
// TODO memory debugging provided inside uniset.cpp
@ -49,42 +45,16 @@ U_NAMESPACE_BEGIN
UnicodeSet::UnicodeSet(const UnicodeString& pattern,
uint32_t options,
const SymbolTable* symbols,
UErrorCode& status) :
len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
fFlags(0)
{
if(U_SUCCESS(status)){
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
/* test for NULL */
if(list == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}else{
allocateStrings(status);
applyPattern(pattern, options, symbols, status);
}
}
UErrorCode& status) {
applyPattern(pattern, options, symbols, status);
_dbgct(this);
}
UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
uint32_t options,
const SymbolTable* symbols,
UErrorCode& status) :
len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
fFlags(0)
{
if(U_SUCCESS(status)){
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
/* test for NULL */
if(list == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}else{
allocateStrings(status);
applyPattern(pattern, pos, options, symbols, status);
}
}
UErrorCode& status) {
applyPattern(pattern, pos, options, symbols, status);
_dbgct(this);
}
@ -199,7 +169,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
// start with input set to guarantee inclusion
// USET_CASE: remove strings because the strings will actually be reduced (folded);
// therefore, start with no strings and add only those needed
if (attribute & USET_CASE_INSENSITIVE) {
if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) {
foldSet.strings->removeAllElements();
}
@ -234,7 +204,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
}
}
}
if (strings != NULL && strings->size() > 0) {
if (hasStrings()) {
if (attribute & USET_CASE_INSENSITIVE) {
for (int32_t j=0; j<strings->size(); ++j) {
str = *(const UnicodeString *) strings->elementAt(j);

View File

@ -47,10 +47,6 @@
U_NAMESPACE_USE
// initial storage. Must be >= 0
// *** same as in uniset.cpp ! ***
#define START_EXTRA 16
// Define UChar constants using hex for EBCDIC compatibility
// Used #define to reduce private static exports and memory access time.
#define SET_OPEN ((UChar)0x005B) /*[*/
@ -185,21 +181,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
* @param pattern a string specifying what characters are in the set
*/
UnicodeSet::UnicodeSet(const UnicodeString& pattern,
UErrorCode& status) :
len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
fFlags(0)
{
if(U_SUCCESS(status)){
list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
/* test for NULL */
if(list == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}else{
allocateStrings(status);
applyPattern(pattern, status);
}
}
UErrorCode& status) {
applyPattern(pattern, status);
_dbgct(this);
}
@ -713,6 +696,11 @@ static UBool numericValueFilter(UChar32 ch, void* context) {
return u_getNumericValue(ch) == *(double*)context;
}
/**
 * Filter callback: checks a code point's general category against a bit mask.
 *
 * @param ch      the code point to test
 * @param context pointer to an int32_t holding the mask of accepted categories
 * @return nonzero if U_GET_GC_MASK(ch) shares at least one bit with the mask
 */
static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
    const int32_t mask = *static_cast<int32_t*>(context);
    return (U_GET_GC_MASK(ch) & mask) != 0;
}
static UBool versionFilter(UChar32 ch, void* context) {
static const UVersionInfo none = { 0, 0, 0, 0 };
UVersionInfo v;
@ -721,6 +709,16 @@ static UBool versionFilter(UChar32 ch, void* context) {
return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
}
/** Context handed to intPropertyFilter: a property and the value it must have. */
struct IntPropertyContext {
    UProperty prop;  // property queried for each code point
    int32_t value;   // value the property is compared against
};
/**
 * Filter callback: tests whether a code point has a given integer property value.
 *
 * @param ch      the code point to test
 * @param context pointer to an IntPropertyContext naming the property and value
 * @return nonzero if u_getIntPropertyValue(ch, prop) equals the context's value
 */
static UBool intPropertyFilter(UChar32 ch, void* context) {
    const IntPropertyContext* ctx = static_cast<const IntPropertyContext*>(context);
    const int32_t actual = u_getIntPropertyValue(ch, ctx->prop);
    return actual == ctx->value;
}
/**
 * Filter callback: forwards to uscript_hasScript() for the script in the context.
 *
 * @param ch      the code point to test
 * @param context pointer to the UScriptCode to look for
 * @return the result of uscript_hasScript(ch, script)
 */
static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
    const UScriptCode script = *static_cast<UScriptCode*>(context);
    return uscript_hasScript(ch, script);
}
@ -781,43 +779,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
namespace {

/**
 * Value filter: yields 1 when bit number `value` is set in the mask that
 * `context` points to, and 0 for every other value.
 */
uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) {
    const uint32_t mask = *static_cast<const uint32_t *>(context);
    return (U_MASK(value) & mask) != 0 ? 1 : 0;
}

/**
 * Value filter: yields 1 when `value` equals the uint32_t that `context`
 * points to, and 0 for every other value.
 */
uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) {
    const uint32_t wanted = *static_cast<const uint32_t *>(context);
    if (value == wanted) {
        return 1;
    }
    return 0;
}

}  // namespace
void UnicodeSet::applyIntPropertyValue(const UCPMap *map,
UCPMapValueFilter *filter, const void *context,
UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return; }
clear();
UChar32 start = 0, end;
uint32_t value;
while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
filter, context, &value)) >= 0) {
if (value != 0) {
add(start, end);
}
start = end + 1;
}
if (isBogus()) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
}
}
namespace {
static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
/* Note: we use ' ' in compiler code page */
int32_t j = 0;
@ -845,11 +806,10 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
UnicodeSet&
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
if (U_FAILURE(ec)) { return *this; }
// All of the following check isFrozen() before modifying this set.
if (U_FAILURE(ec) || isFrozen()) { return *this; }
if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec);
applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec);
const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
} else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
UScriptCode script = (UScriptCode)value;
@ -866,14 +826,11 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec)
clear();
}
} else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
const UCPMap *map = u_getIntPropertyMap(prop, &ec);
applyIntPropertyValue(map, intValueFilter, &value, ec);
const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
IntPropertyContext c = {prop, value};
applyFilter(intPropertyFilter, &c, inclusions, ec);
} else {
// This code used to always call getInclusions(property source)
// which sets an error for an unsupported property.
ec = U_ILLEGAL_ARGUMENT_ERROR;
// Otherwise we would just clear() this set because
// getIntPropertyValue(c, prop) returns 0 for all code points.
}
return *this;
}

View File

@ -462,7 +462,6 @@ class UnicodeSet;
/**
 * Holder for static character-property helpers. The default constructor is
 * deleted and every member is static, so the class is used only as a scope.
 */
class CharacterProperties {
public:
// Not instantiable: only static members are declared.
CharacterProperties() = delete;
// NOTE(review): presumably builds the inclusion data for one property source;
// confirm against the implementation, which is not visible here.
static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode);
// Returns the inclusion set for `prop`; UnicodeSet::applyIntPropertyValue()
// passes the result to applyFilter() as its `inclusions` argument.
static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode);
};

View File

@ -249,7 +249,7 @@ class USetAccess /* not : public UObject because all methods are static */ {
public:
/* Try to have the compiler inline these*/
inline static int32_t getStringCount(const UnicodeSet& set) {
return set.getStringCount();
return set.stringsSize();
}
inline static const UnicodeString* getString(const UnicodeSet& set,
int32_t i) {

View File

@ -116,7 +116,7 @@ void UnicodeSetIterator::reset() {
stringCount = 0;
} else {
endRange = set->getRangeCount() - 1;
stringCount = set->strings->size();
stringCount = set->stringsSize();
}
range = 0;
endElement = -1;

View File

@ -35,7 +35,7 @@
U_NAMESPACE_BEGIN
// The value of MAX_TIMEZONE_ID_LENGTH is 128, which is defined in DYNAMIC_TIME_ZONE_INFORMATION
// The max size of TimeZoneKeyName is 128, defined in DYNAMIC_TIME_ZONE_INFORMATION
#define MAX_TIMEZONE_ID_LENGTH 128
/**
@ -44,7 +44,7 @@ U_NAMESPACE_BEGIN
* Note: We use the Win32 API GetDynamicTimeZoneInformation to get the current time zone info.
* This API returns a non-localized time zone name, which we can then map to an ICU time zone name.
*/
U_CFUNC const char* U_EXPORT2
U_INTERNAL const char* U_EXPORT2
uprv_detectWindowsTimeZone()
{
UErrorCode status = U_ZERO_ERROR;
@ -79,7 +79,7 @@ uprv_detectWindowsTimeZone()
// convert from wchar_t* (UTF-16 on Windows) to char* (UTF-8).
u_strToUTF8(dynamicTZKeyName, UPRV_LENGTHOF(dynamicTZKeyName), nullptr,
reinterpret_cast<const UChar*>(dynamicTZI.TimeZoneKeyName), UPRV_LENGTHOF(dynamicTZI.TimeZoneKeyName), &status);
reinterpret_cast<const UChar*>(dynamicTZI.TimeZoneKeyName), -1, &status);
if (U_FAILURE(status)) {
return nullptr;

View File

@ -28,7 +28,7 @@ U_CDECL_BEGIN
typedef struct _TIME_ZONE_INFORMATION TIME_ZONE_INFORMATION;
U_CDECL_END
U_CFUNC const char* U_EXPORT2
U_INTERNAL const char* U_EXPORT2
uprv_detectWindowsTimeZone();
#endif /* U_PLATFORM_USES_ONLY_WIN32_API */

View File

@ -1,5 +1,5 @@
commit 6a8e28db3cbff837570f93881e6e4f7ff4d5fb25
commit 6e82c7c389888603f0de84ffe5c60f43f11ee844
Author: Yoshito Umaoka <yoshito_umaoka@us.ibm.com>
Date: Tue Oct 30 08:52:31 2018 -0400
Date: Wed Nov 7 19:23:35 2018 -0500
ICU-20245: tzdata2018g updates. Also added tzdata2018f release files missed previously.
ICU-20260 Fix CR/LF issue

View File

@ -87,6 +87,8 @@ for patch in \
bug-1172609-timezone-recreateDefault.diff \
bug-1198952-workaround-make-3.82-bug.diff \
bug-1504656-relativetimeformat-plural-other-fallback.diff \
bug-1513934-timezone-detection-win7-part1.diff \
bug-1513934-timezone-detection-win7-part2.diff \
; do
echo "Applying local patch $patch"
patch -d ${icu_dir}/../../ -p1 --no-backup-if-mismatch < ${icu_dir}/../icu-patches/$patch