Fix conversion function bug in 'utf_helper' when UTF-8 data does not end with '\0', and refactor EcmaString

Signed-off-by: zhaozhibo <zhaozhibo3@huawei.com>
This commit is contained in:
zhaozhibo 2022-01-13 12:02:17 +08:00
parent 9d9de75ab3
commit bd53e23a1a
8 changed files with 28 additions and 58 deletions

View File

@ -69,14 +69,14 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end)
while (*ptr < end) {
uint16_t c = **ptr;
size_t size = 1;
if (**ptr > INT8_MAX) {
if (c > INT8_MAX) {
size = 0;
uint16_t utf8Bit = INT8_MAX + 1; // equal 0b1000'0000
while (utf8Bit > 0 && (c & utf8Bit) == utf8Bit) {
++size;
utf8Bit >>= 1UL;
}
if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, 1, 0) <= 0) {
if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) {
return true;
}
}

View File

@ -35,7 +35,7 @@ bool IsValidUTF8(const std::vector<uint8_t> &data)
uint32_t length = data.size();
switch (length) {
case UtfLength::ONE:
if (data.at(0) > BIT_MASK_1) {
if (data.at(0) >= BIT_MASK_1) {
return false;
}
break;
@ -216,48 +216,14 @@ std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com
return {pair, UtfLength::FOUR};
}
size_t Utf8ToUtf16Size(const uint8_t *utf8)
size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
{
size_t res = 0;
while (*utf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(utf8);
res += pair > 0xffff ? UtfLength::TWO : UtfLength::ONE; // NOLINT(readability-magic-numbers)
utf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
}
return res;
return utf::MUtf8ToUtf16Size(utf8, utf8Len);
}
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start)
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
size_t start)
{
ASSERT(utf16Out != nullptr);
size_t outPos = 0;
while (*utf8In != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(utf8In);
auto [pHi, pLo] = utf::SplitUtf16Pair(pair);
utf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
if (start > 0) {
start -= nbytes;
continue;
}
if (pHi != 0) {
if (outPos >= utf16Len - 1) { // check for place for two uint16
break;
}
outPos++;
*utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
}
if (outPos >= utf16Len) {
break;
}
outPos++;
*utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
if (outPos >= utf16Len) {
break;
}
}
return outPos;
return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
}
} // namespace panda::ecmascript::base::utf_helper

View File

@ -69,9 +69,10 @@ size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_
std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false);
size_t Utf8ToUtf16Size(const uint8_t *utf8);
size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len);
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start);
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
size_t start);
static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
{

View File

@ -63,12 +63,12 @@ inline EcmaString *EcmaString::CreateFromUtf8(const uint8_t *utf8Data, uint32_t
UNREACHABLE();
}
} else {
auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data);
auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len);
string = AllocStringObject(utf16Len, false, vm);
ASSERT(string != nullptr);
[[maybe_unused]] auto len =
base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf16Len, 0);
base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf8Len, utf16Len, 0);
ASSERT(len == utf16Len);
}

View File

@ -321,7 +321,7 @@ bool EcmaString::StringsAreEqualUtf8(const EcmaString *str1, const uint8_t *utf8
Span<const uint8_t> data2(utf8Data, utf8Len);
return EcmaString::StringsAreEquals(data1, data2);
}
return IsUtf8EqualsUtf16(utf8Data, str1->GetDataUtf16(), str1->GetLength());
return IsUtf8EqualsUtf16(utf8Data, utf8Len, str1->GetDataUtf16(), str1->GetLength());
}
/* static */
@ -331,7 +331,7 @@ bool EcmaString::StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *ut
if (str1->GetLength() != utf16Len) {
result = false;
} else if (!str1->IsUtf16()) {
result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), utf16Data, utf16Len);
result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), str1->GetLength(), utf16Data, utf16Len);
} else {
Span<const uint16_t> data1(str1->GetDataUtf16(), str1->GetLength());
Span<const uint16_t> data2(utf16Data, utf16Len);
@ -422,15 +422,16 @@ uint32_t EcmaString::ComputeHashcode() const
}
/* static */
uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress)
uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress)
{
uint32_t hash;
if (canBeCompress) {
hash = ComputeHashForUtf8(utf8Data);
} else {
auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data);
auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len);
CVector<uint16_t> tmpBuffer(utf16Len);
[[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf16Len, 0);
[[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len,
utf16Len, 0);
ASSERT(len == utf16Len);
hash = ComputeHashForData(tmpBuffer.data(), utf16Len);
}
@ -444,12 +445,13 @@ uint32_t EcmaString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t le
}
/* static */
bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len)
bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data,
uint32_t utf16Len)
{
// length is one more than compared utf16Data, don't need convert all utf8Data to utf16Data
uint32_t utf8ConvertLength = utf16Len + 1;
CVector<uint16_t> tmpBuffer(utf8ConvertLength);
auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8ConvertLength, 0);
auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, utf8ConvertLength, 0);
if (len != utf16Len) {
return false;
}

View File

@ -171,7 +171,7 @@ public:
}
return length;
}
return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, maxLength, start);
return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, len, maxLength, start);
}
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
@ -245,7 +245,7 @@ public:
* Compares strings by bytes, It doesn't check canonical unicode equivalence.
*/
static bool StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len);
static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress);
static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress);
static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length);
static void SetCompressedStringsEnabled(bool val)
@ -303,7 +303,8 @@ private:
* str1 should have the same length as utf16_data.
* Converts utf8Data to utf16 and compare it with given utf16_data.
*/
static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len);
static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data,
uint32_t utf16Len);
template<typename T>
/**

View File

@ -26,7 +26,7 @@ EcmaStringTable::EcmaStringTable(const EcmaVM *vm) : vm_(vm) {}
EcmaString *EcmaStringTable::GetString(const uint8_t *utf8Data, uint32_t utf8Len, bool canBeCompress) const
{
uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, canBeCompress);
uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, utf8Len, canBeCompress);
for (auto it = table_.find(hashCode); it != table_.end(); it++) {
auto foundedString = it->second;
if (EcmaString::StringsAreEqualUtf8(foundedString, utf8Data, utf8Len, canBeCompress)) {

View File

@ -1647,7 +1647,7 @@ HWTEST_F_L0(EcmaStringTest, ComputeHashcodeUtf8)
for (uint32_t i = 0; i < lengthEcmaStrU8; i++) {
hashExpect = hashExpect * 31 + arrayU8[i];
}
EXPECT_EQ(EcmaString::ComputeHashcodeUtf8(&arrayU8[0], lengthEcmaStrU8), static_cast<int32_t>(hashExpect));
EXPECT_EQ(EcmaString::ComputeHashcodeUtf8(&arrayU8[0], lengthEcmaStrU8, true), static_cast<int32_t>(hashExpect));
}
/*