diff --git a/ecmascript/base/number_helper.cpp b/ecmascript/base/number_helper.cpp index a6c3410bab..140b475b02 100644 --- a/ecmascript/base/number_helper.cpp +++ b/ecmascript/base/number_helper.cpp @@ -69,14 +69,14 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end) while (*ptr < end) { uint16_t c = **ptr; size_t size = 1; - if (**ptr > INT8_MAX) { + if (c > INT8_MAX) { size = 0; uint16_t utf8Bit = INT8_MAX + 1; // equal 0b1000'0000 while (utf8Bit > 0 && (c & utf8Bit) == utf8Bit) { ++size; utf8Bit >>= 1UL; } - if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, 1, 0) <= 0) { + if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) { return true; } } diff --git a/ecmascript/base/utf_helper.cpp b/ecmascript/base/utf_helper.cpp index 5cb168e06f..88602385d1 100644 --- a/ecmascript/base/utf_helper.cpp +++ b/ecmascript/base/utf_helper.cpp @@ -35,7 +35,7 @@ bool IsValidUTF8(const std::vector &data) uint32_t length = data.size(); switch (length) { case UtfLength::ONE: - if (data.at(0) > BIT_MASK_1) { + if (data.at(0) >= BIT_MASK_1) { return false; } break; @@ -216,48 +216,14 @@ std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com return {pair, UtfLength::FOUR}; } -size_t Utf8ToUtf16Size(const uint8_t *utf8) +size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len) { - size_t res = 0; - while (*utf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(utf8); - res += pair > 0xffff ? UtfLength::TWO : UtfLength::ONE; // NOLINT(readability-magic-numbers) - utf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - } - return res; + return utf::MUtf8ToUtf16Size(utf8, utf8Len); } -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start) +size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, + size_t start) { - ASSERT(utf16Out != nullptr); - size_t outPos = 0; - while (*utf8In != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(utf8In); - auto [pHi, pLo] = utf::SplitUtf16Pair(pair); - - utf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if (start > 0) { - start -= nbytes; - continue; - } - - if (pHi != 0) { - if (outPos >= utf16Len - 1) { // check for place for two uint16 - break; - } - outPos++; - *utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - } - if (outPos >= utf16Len) { - break; - } - outPos++; - *utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if (outPos >= utf16Len) { - break; - } - } - - return outPos; + return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start); } } // namespace panda::ecmascript::base::utf_helper diff --git a/ecmascript/base/utf_helper.h b/ecmascript/base/utf_helper.h index 29abed4908..64df9d09ce 100644 --- a/ecmascript/base/utf_helper.h +++ b/ecmascript/base/utf_helper.h @@ -69,9 +69,10 @@ size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_ std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false); -size_t Utf8ToUtf16Size(const uint8_t *utf8); +size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len); -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start); +size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, + size_t start); static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) { diff --git a/ecmascript/ecma_string-inl.h b/ecmascript/ecma_string-inl.h index 5cffa1c1b1..88e917a248 100644 --- a/ecmascript/ecma_string-inl.h +++ b/ecmascript/ecma_string-inl.h @@ -63,12 +63,12 @@ inline EcmaString *EcmaString::CreateFromUtf8(const uint8_t *utf8Data, uint32_t UNREACHABLE(); } } else { - auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data); + auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len); string = AllocStringObject(utf16Len, false, vm); ASSERT(string != nullptr); [[maybe_unused]] auto len = - base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf16Len, 0); + base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf8Len, utf16Len, 0); ASSERT(len == utf16Len); } diff --git a/ecmascript/ecma_string.cpp b/ecmascript/ecma_string.cpp index 7e9fa5fbba..d3d6504c3a 100644 --- a/ecmascript/ecma_string.cpp +++ b/ecmascript/ecma_string.cpp @@ -321,7 +321,7 @@ bool EcmaString::StringsAreEqualUtf8(const EcmaString *str1, const uint8_t *utf8 Span data2(utf8Data, utf8Len); return EcmaString::StringsAreEquals(data1, data2); } - return IsUtf8EqualsUtf16(utf8Data, str1->GetDataUtf16(), str1->GetLength()); + return IsUtf8EqualsUtf16(utf8Data, utf8Len, str1->GetDataUtf16(), str1->GetLength()); } /* static */ @@ -331,7 +331,7 @@ bool EcmaString::StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *ut if (str1->GetLength() != utf16Len) { result = false; } else if (!str1->IsUtf16()) { - result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), utf16Data, utf16Len); + result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), str1->GetLength(), utf16Data, utf16Len); } else { Span data1(str1->GetDataUtf16(), str1->GetLength()); Span data2(utf16Data, utf16Len); @@ -422,15 +422,16 @@ uint32_t EcmaString::ComputeHashcode() const } /* static */ -uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress) +uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress) { uint32_t hash; if (canBeCompress) { hash = ComputeHashForUtf8(utf8Data); } else { - auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data); + auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len); CVector tmpBuffer(utf16Len); - [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf16Len, 0); + [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, + utf16Len, 0); ASSERT(len == utf16Len); hash = ComputeHashForData(tmpBuffer.data(), utf16Len); } @@ -444,12 +445,13 @@ uint32_t EcmaString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t le } /* static */ -bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len) +bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data, + uint32_t utf16Len) { // length is one more than compared utf16Data, don't need convert all utf8Data to utf16Data uint32_t utf8ConvertLength = utf16Len + 1; CVector tmpBuffer(utf8ConvertLength); - auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8ConvertLength, 0); + auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, utf8ConvertLength, 0); if (len != utf16Len) { return false; } diff --git a/ecmascript/ecma_string.h b/ecmascript/ecma_string.h index 453b2d1250..e7f6d7ef14 100644 --- a/ecmascript/ecma_string.h +++ b/ecmascript/ecma_string.h @@ -171,7 +171,7 @@ public: } return length; } - return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, maxLength, start); + return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, len, maxLength, start); } // NOLINTNEXTLINE(modernize-avoid-c-arrays) @@ -245,7 +245,7 @@ public: * Compares strings by bytes, It doesn't check canonical unicode equivalence. */ static bool StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len); - static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress); + static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress); static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length); static void SetCompressedStringsEnabled(bool val) @@ -303,7 +303,8 @@ private: * str1 should have the same length as utf16_data. * Converts utf8Data to utf16 and compare it with given utf16_data. */ - static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len); + static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data, + uint32_t utf16Len); template /** diff --git a/ecmascript/ecma_string_table.cpp b/ecmascript/ecma_string_table.cpp index e74381da57..33629dad26 100644 --- a/ecmascript/ecma_string_table.cpp +++ b/ecmascript/ecma_string_table.cpp @@ -26,7 +26,7 @@ EcmaStringTable::EcmaStringTable(const EcmaVM *vm) : vm_(vm) {} EcmaString *EcmaStringTable::GetString(const uint8_t *utf8Data, uint32_t utf8Len, bool canBeCompress) const { - uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, canBeCompress); + uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, utf8Len, canBeCompress); for (auto it = table_.find(hashCode); it != table_.end(); it++) { auto foundedString = it->second; if (EcmaString::StringsAreEqualUtf8(foundedString, utf8Data, utf8Len, canBeCompress)) { diff --git a/ecmascript/tests/ecma_string_test.cpp b/ecmascript/tests/ecma_string_test.cpp index f3e39b5b2e..4bb105911d 100644 --- a/ecmascript/tests/ecma_string_test.cpp +++ b/ecmascript/tests/ecma_string_test.cpp @@ -1647,7 +1647,7 @@ HWTEST_F_L0(EcmaStringTest, ComputeHashcodeUtf8) for (uint32_t i = 0; i < lengthEcmaStrU8; i++) { hashExpect = hashExpect * 31 + arrayU8[i]; } - EXPECT_EQ(EcmaString::ComputeHashcodeUtf8(&arrayU8[0], lengthEcmaStrU8), static_cast(hashExpect)); + EXPECT_EQ(EcmaString::ComputeHashcodeUtf8(&arrayU8[0], lengthEcmaStrU8, true), static_cast(hashExpect)); } /*