Fix a bug in the 'utf_helper' conversion functions when the UTF-8 input is not '\0'-terminated, and rework EcmaString to pass explicit UTF-8 lengths

Signed-off-by: zhaozhibo <zhaozhibo3@huawei.com>
2024-11-27 04:00:37 +00:00 · 2022-01-13 12:02:17 +08:00 · 2022-01-13 12:02:17 +08:00 · bd53e23a1a
commit bd53e23a1a
parent 9d9de75ab3
8 changed files with 28 additions and 58 deletions
--- a/ecmascript/base/number_helper.cpp
+++ b/ecmascript/base/number_helper.cpp
@ -69,14 +69,14 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end)
    while (*ptr < end) {
        uint16_t c = **ptr;
        size_t size = 1;
-        if (**ptr > INT8_MAX) {
+        if (c > INT8_MAX) {
            size = 0;
            uint16_t utf8Bit = INT8_MAX + 1;  // equal 0b1000'0000
            while (utf8Bit > 0 && (c & utf8Bit) == utf8Bit) {
                ++size;
                utf8Bit >>= 1UL;
            }
-            if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, 1, 0) <= 0) {
+            if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) {
                return true;
            }
        }
--- a/ecmascript/base/utf_helper.cpp
+++ b/ecmascript/base/utf_helper.cpp
@ -35,7 +35,7 @@ bool IsValidUTF8(const std::vector<uint8_t> &data)
    uint32_t length = data.size();
    switch (length) {
        case UtfLength::ONE:
-            if (data.at(0) > BIT_MASK_1) {
+            if (data.at(0) >= BIT_MASK_1) {
                return false;
            }
            break;
@ -216,48 +216,14 @@ std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com
    return {pair, UtfLength::FOUR};
 }

-size_t Utf8ToUtf16Size(const uint8_t *utf8)
+size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
 {
-    size_t res = 0;
-    while (*utf8 != '\0') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(utf8);
-        res += pair > 0xffff ? UtfLength::TWO : UtfLength::ONE;  // NOLINT(readability-magic-numbers)
-        utf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-    }
-    return res;
+    return utf::MUtf8ToUtf16Size(utf8, utf8Len);
 }

-size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start)
+size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
+                                size_t start)
 {
-    ASSERT(utf16Out != nullptr);
-    size_t outPos = 0;
-    while (*utf8In != '\0') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(utf8In);
-        auto [pHi, pLo] = utf::SplitUtf16Pair(pair);
-
-        utf8In += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        if (start > 0) {
-            start -= nbytes;
-            continue;
-        }
-
-        if (pHi != 0) {
-            if (outPos >= utf16Len - 1) {  // check for place for two uint16
-                break;
-            }
-            outPos++;
-            *utf16Out++ = pHi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        }
-        if (outPos >= utf16Len) {
-            break;
-        }
-        outPos++;
-        *utf16Out++ = pLo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        if (outPos >= utf16Len) {
-            break;
-        }
-    }
-
-    return outPos;
+    return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
 }
 }  // namespace panda::ecmascript::base::utf_helper
--- a/ecmascript/base/utf_helper.h
+++ b/ecmascript/base/utf_helper.h
@ -69,9 +69,10 @@ size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_

 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false);

-size_t Utf8ToUtf16Size(const uint8_t *utf8);
+size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len);

-size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start);
+size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
+                                size_t start);

 static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
 {
--- a/ecmascript/ecma_string-inl.h
+++ b/ecmascript/ecma_string-inl.h
@ -63,12 +63,12 @@ inline EcmaString *EcmaString::CreateFromUtf8(const uint8_t *utf8Data, uint32_t
            UNREACHABLE();
        }
    } else {
-        auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data);
+        auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len);
        string = AllocStringObject(utf16Len, false, vm);
        ASSERT(string != nullptr);

        [[maybe_unused]] auto len =
-            base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf16Len, 0);
+            base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf8Len, utf16Len, 0);
        ASSERT(len == utf16Len);
    }

--- a/ecmascript/ecma_string.cpp
+++ b/ecmascript/ecma_string.cpp
@ -321,7 +321,7 @@ bool EcmaString::StringsAreEqualUtf8(const EcmaString *str1, const uint8_t *utf8
        Span<const uint8_t> data2(utf8Data, utf8Len);
        return EcmaString::StringsAreEquals(data1, data2);
    }
-    return IsUtf8EqualsUtf16(utf8Data, str1->GetDataUtf16(), str1->GetLength());
+    return IsUtf8EqualsUtf16(utf8Data, utf8Len, str1->GetDataUtf16(), str1->GetLength());
 }

 /* static */
@ -331,7 +331,7 @@ bool EcmaString::StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *ut
    if (str1->GetLength() != utf16Len) {
        result = false;
    } else if (!str1->IsUtf16()) {
-        result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), utf16Data, utf16Len);
+        result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), str1->GetLength(), utf16Data, utf16Len);
    } else {
        Span<const uint16_t> data1(str1->GetDataUtf16(), str1->GetLength());
        Span<const uint16_t> data2(utf16Data, utf16Len);
@ -422,15 +422,16 @@ uint32_t EcmaString::ComputeHashcode() const
 }

 /* static */
-uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress)
+uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress)
 {
    uint32_t hash;
    if (canBeCompress) {
        hash = ComputeHashForUtf8(utf8Data);
    } else {
-        auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data);
+        auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len);
        CVector<uint16_t> tmpBuffer(utf16Len);
-        [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf16Len, 0);
+        [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len,
+                                                                               utf16Len, 0);
        ASSERT(len == utf16Len);
        hash = ComputeHashForData(tmpBuffer.data(), utf16Len);
    }
@ -444,12 +445,13 @@ uint32_t EcmaString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t le
 }

 /* static */
-bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len)
+bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data,
+                                   uint32_t utf16Len)
 {
    // length is one more than compared utf16Data, don't need convert all utf8Data to utf16Data
    uint32_t utf8ConvertLength = utf16Len + 1;
    CVector<uint16_t> tmpBuffer(utf8ConvertLength);
-    auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8ConvertLength, 0);
+    auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, utf8ConvertLength, 0);
    if (len != utf16Len) {
        return false;
    }
--- a/ecmascript/ecma_string.h
+++ b/ecmascript/ecma_string.h
@ -171,7 +171,7 @@ public:
            }
            return length;
        }
-        return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, maxLength, start);
+        return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, len, maxLength, start);
    }

    // NOLINTNEXTLINE(modernize-avoid-c-arrays)
@ -245,7 +245,7 @@ public:
     * Compares strings by bytes, It doesn't check canonical unicode equivalence.
     */
    static bool StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len);
-    static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress);
+    static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress);
    static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length);

    static void SetCompressedStringsEnabled(bool val)
@ -303,7 +303,8 @@ private:
     * str1 should have the same length as utf16_data.
     * Converts utf8Data to utf16 and compare it with given utf16_data.
     */
-    static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len);
+    static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data,
+                                  uint32_t utf16Len);

    template<typename T>
    /**
--- a/ecmascript/ecma_string_table.cpp
+++ b/ecmascript/ecma_string_table.cpp
@ -26,7 +26,7 @@ EcmaStringTable::EcmaStringTable(const EcmaVM *vm) : vm_(vm) {}

 EcmaString *EcmaStringTable::GetString(const uint8_t *utf8Data, uint32_t utf8Len, bool canBeCompress) const
 {
-    uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, canBeCompress);
+    uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, utf8Len, canBeCompress);
    for (auto it = table_.find(hashCode); it != table_.end(); it++) {
        auto foundedString = it->second;
        if (EcmaString::StringsAreEqualUtf8(foundedString, utf8Data, utf8Len, canBeCompress)) {
--- a/ecmascript/tests/ecma_string_test.cpp
+++ b/ecmascript/tests/ecma_string_test.cpp
@ -1647,7 +1647,7 @@ HWTEST_F_L0(EcmaStringTest, ComputeHashcodeUtf8)
    for (uint32_t i = 0; i < lengthEcmaStrU8; i++) {
        hashExpect = hashExpect * 31 + arrayU8[i];
    }
-    EXPECT_EQ(EcmaString::ComputeHashcodeUtf8(&arrayU8[0], lengthEcmaStrU8), static_cast<int32_t>(hashExpect));
+    EXPECT_EQ(EcmaString::ComputeHashcodeUtf8(&arrayU8[0], lengthEcmaStrU8, true), static_cast<int32_t>(hashExpect));
 }

 /*