add bound check for ConvertRegionUtf8ToUtf16

Issue: https://gitee.com/openharmony/arkcompiler_ets_runtime/issues/IAR8ZY Signed-off-by: ZhouGuangyuan <zhouguangyuan1@huawei.com> Change-Id: Ia74e4e7667af6dbc4b06afadcf1b407fca2a7dc9
2024-11-23 10:09:54 +00:00 · 2024-09-11 12:33:06 +08:00 · 2024-09-11 12:33:06 +08:00 · cb362d2f90
commit cb362d2f90
parent fa4a5954f7
7 changed files with 60 additions and 77 deletions
--- a/ecmascript/base/number_helper.cpp
+++ b/ecmascript/base/number_helper.cpp
@ -96,7 +96,7 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end)
                ++size;
                utf8Bit >>= 1UL;
            }
-            if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) {
+            if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1) <= 0) {
                return true;
            }
        }
--- a/ecmascript/base/tests/utf_helper_test.cpp
+++ b/ecmascript/base/tests/utf_helper_test.cpp
@ -557,7 +557,6 @@ HWTEST_F_L0(UtfHelperTest, DebuggerConvertRegionUtf16ToUtf8)
 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16)
 {
    size_t utf16Len = 100;
-    size_t start = 0;
    uint8_t utf8Value[10] = {
        0x7F, // 1-length UTF16 encoding
        0xDF, 0xBF, // 1-length UTF16 encoding
@ -565,25 +564,13 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16)
        0xF4, 0x8F, 0xBF, 0xBF}; // 2-length UTF16 encoding
    const uint8_t *utf8ValuePtr = utf8Value;
    uint16_t *utf16Out = (uint16_t*)malloc(utf16Len);
-    size_t outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len, start);
+    size_t outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len);
    // 1 + 1 + 1 + 2 = 5s
    EXPECT_EQ(outPos, 5U);
    // 1 + 2 = 3
-    start = 3;
-    outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len, start);
+    utf8ValuePtr = utf8Value + 3;
+    outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value) - 3, utf16Len);
    EXPECT_EQ(outPos, 3U);
-
-    // When "start" is in the middle of a symbol sequence
-    start = 2;
-    outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len, start);
-    EXPECT_EQ(outPos, 0U);
-    start = 4;
-    outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len, start);
-    EXPECT_EQ(outPos, 0U);
-    start = 7;
-    outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len, start);
-    EXPECT_EQ(outPos, 0U);
-    free(utf16Out);
 }

 /*
@ -852,7 +839,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_001) {
    std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F}; // "Hello"
    std::vector<uint16_t> utf16(10);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
 }
@ -867,7 +854,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_002) {
    std::vector<uint16_t> expected_utf16 = {0x4F60, 0x597D, 0xFF0C, 0x4E16, 0x754C, 0xFF01}; // "你好，世界！"
    std::vector<uint16_t> utf16(10);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
 }
@ -882,7 +869,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_003) {
    std::vector<uint16_t> expected_utf16 = {}; // Empty
    std::vector<uint16_t> utf16(10);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
 }
@ -897,7 +884,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_004) {
    std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002C, 0x20, 0x4F60};
    std::vector<uint16_t> utf16(10);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), 10, utf16.size(), 0); // Only process the first 9 bytes
+                                                utf16.data(), 10, utf16.size()); // Only process the first 10 bytes
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
 }
@ -912,7 +899,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_005) {
    std::vector<uint16_t> expected_utf16 = {0x4F60, 0x597D}; // "你好"
    std::vector<uint16_t> utf16(2); // Limit buffer length
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
 }
@ -927,7 +914,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_006) {
    std::vector<uint16_t> expected_utf16 = {}; // Expected empty output, handling erroneous data
    std::vector<uint16_t> utf16(10);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_NE(utf16, expected_utf16);
 }
@ -942,7 +929,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_007) {
    std::vector<uint16_t> expected_utf16 = {0x0041, 0x0042, 0x0043}; // ASCII characters: A, B, C
    std::vector<uint16_t> utf16(10);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
 }
@ -957,7 +944,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_008) {
    std::vector<uint16_t> expected_utf16 = {0x00A2, 0x00FC}; // Unicode .
    std::vector<uint16_t> utf16(10);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
 }
@ -972,7 +959,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_009) {
    std::vector<uint16_t> expected_utf16 = {0x20AC}; // Unicode .
    std::vector<uint16_t> utf16(10);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
 }
@ -987,7 +974,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_010) {
    std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates
    std::vector<uint16_t> utf16(10);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
 }
@ -1003,7 +990,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_011) {
        0x0000, 0x0057, 0x006F, 0x0072, 0x006C, 0x0064}; // Including NULL characters
    std::vector<uint16_t> utf16(15);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8Nul.data()),
-        utf16.data(), utf8Nul.size(), utf16.size(), 0);
+                                                utf16.data(), utf8Nul.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
 }
@ -1018,7 +1005,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_012) {
    std::vector<uint16_t> expected_utf16 = {};
    std::vector<uint16_t> utf16(10);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_NE(utf16, expected_utf16);
 }
@ -1033,7 +1020,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_013) {
    std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates
    std::vector<uint16_t> utf16(0);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(converted, 0);
 }
@ -1047,7 +1034,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_014) {
    std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates
    std::vector<uint16_t> utf16(1);
    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
-        utf16.data(), utf8.size(), utf16.size(), 0);
+                                                utf16.data(), utf8.size(), utf16.size());
    utf16.resize(converted);
    EXPECT_EQ(converted, 0);
 }
--- a/ecmascript/base/utf_helper.cpp
+++ b/ecmascript/base/utf_helper.cpp
@ -350,11 +350,31 @@ std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com
    return {pair, UtfLength::FOUR};
 }

+// Returns utf8Len minus any trailing bytes that start a multi-byte sequence too long to fit before the buffer end.
+static inline size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len)
+{
+    size_t trimSize = 0;
+    if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) {
+        // The last byte is a lead byte that needs at least 1 continuation byte, but none follows: drop the last byte.
+        trimSize = 1;
+    }
+    if (utf8Len >= CONST_2 && utf8[utf8Len - CONST_2] >= 0xE0) {
+        // The second-to-last byte is a lead byte that needs at least 2 continuation bytes: drop the last two bytes.
+        trimSize = CONST_2;
+    }
+    if (utf8Len >= CONST_3 && utf8[utf8Len - CONST_3] >= 0xF0) {
+        // The third-to-last byte is a lead byte that needs 3 continuation bytes: drop the last three bytes.
+        trimSize = CONST_3;
+    }
+    return utf8Len - trimSize;
+}
+
 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
 {
+    size_t safeUtf8Len = FixUtf8Len(utf8, utf8Len);
    size_t in_pos = 0;
    size_t res = 0;
-    while (in_pos < utf8Len) {
+    while (in_pos < safeUtf8Len) {
        uint8_t src = utf8[in_pos];
        switch (src & 0xF0) {
            case 0xF0: {
@ -386,40 +406,21 @@ size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
                do {
                    in_pos++;
                    res++;
-                } while (in_pos < utf8Len && utf8[in_pos] < 0x80);
+                } while (in_pos < safeUtf8Len && utf8[in_pos] < 0x80);
                break;
        }
    }
+    // Each byte left unprocessed after the loop counts as one UTF-16 code unit.
+    res += utf8Len - in_pos;
    return res;
 }

-size_t ConvertUtf8ToUtf16Int(const uint8_t data, size_t max_bytes)
-{
-    if ((data & MASK1) == 0 || max_bytes < CONST_4) {
-        return 1;
-    }
-    if ((data & MASK2) == 0) {
-        return CONST_2;
-    }
-    if ((data & MASK3) == 0) {
-        return CONST_3;
-    }
-    return CONST_4;
-}
-
-size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
-                                size_t start)
+size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len)
 {
+    size_t safeUtf8Len = FixUtf8Len(utf8In, utf8Len);
    size_t in_pos = 0;
    size_t out_pos = 0;
-
-    while (in_pos < utf8Len && start > 0) {
-        auto nbytes = ConvertUtf8ToUtf16Int(utf8In[in_pos], utf8Len - in_pos);
-        in_pos += nbytes;
-        start -= nbytes;
-    }
-
-    while (in_pos < utf8Len && out_pos < utf16Len) {
+    while (in_pos < safeUtf8Len && out_pos < utf16Len) {
        uint8_t src = utf8In[in_pos];
        switch (src & 0xF0) {
            case 0xF0: {
@ -460,10 +461,14 @@ size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_
            default:
                do {
                    utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
-                } while (in_pos < utf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
+                } while (in_pos < safeUtf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
                break;
        }
    }
+    // Each byte left unprocessed after the loop is copied as one UTF-16 code unit, while output space remains.
+    while (in_pos < utf8Len && out_pos < utf16Len) {
+        utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
+    }
    return out_pos;
 }

--- a/ecmascript/base/utf_helper.h
+++ b/ecmascript/base/utf_helper.h
@ -122,8 +122,7 @@ std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com

 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len);

-size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
-                                size_t start);
+size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len);

 size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len);

--- a/ecmascript/ecma_string-inl.h
+++ b/ecmascript/ecma_string-inl.h
@ -58,7 +58,7 @@ inline EcmaString *EcmaString::CreateFromUtf8(const EcmaVM *vm, const uint8_t *u
        ASSERT(string != nullptr);

        [[maybe_unused]] auto len =
-            base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf8Len, utf16Len, 0);
+            base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf8Len, utf16Len);
        ASSERT(len == utf16Len);
    }

--- a/ecmascript/ecma_string.cpp
+++ b/ecmascript/ecma_string.cpp
@ -821,7 +821,7 @@ uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len
        auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len);
        CVector<uint16_t> tmpBuffer(utf16Len);
        [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len,
-                                                                               utf16Len, 0);
+                                                                               utf16Len);
        ASSERT(len == utf16Len);
        uint32_t hash = ComputeHashForData(tmpBuffer.data(), utf16Len, 0);
        return MixHashcode(hash, NOT_INTEGER);
--- a/ecmascript/ecma_string.h
+++ b/ecmascript/ecma_string.h
@ -435,9 +435,9 @@ private:
        CVector<uint8_t> tmpBuf;
        const uint8_t *data = EcmaString::GetUtf8DataFlat(this, tmpBuf);
        if (length > bufLength) {
-            return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, bufLength, bufLength, 0);
+            return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, bufLength, bufLength);
        }
-        return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, length, bufLength, 0);
+        return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, length, bufLength);
    }

    // It allows user to copy into buffer even if maxLength < length
@ -510,31 +510,23 @@ private:

    inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength) const
    {
-        return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength);
-    }
-
-    uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength) const
-    {
+        uint32_t length = GetLength();
        if (length > maxLength) {
            return 0;
        }
-        uint32_t len = GetLength();
-        if (start + length > len) {
-            return 0;
-        }
        if (IsUtf16()) {
            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
            CVector<uint16_t> tmpBuf;
-            const uint16_t *data = EcmaString::GetUtf16DataFlat(this, tmpBuf);
-            if (memcpy_s(buf, maxLength * sizeof(uint16_t), data + start, length * sizeof(uint16_t)) != EOK) {
+            const uint16_t *data = GetUtf16DataFlat(this, tmpBuf);
+            if (memcpy_s(buf, maxLength * sizeof(uint16_t), data, length * sizeof(uint16_t)) != EOK) {
                LOG_FULL(FATAL) << "memcpy_s failed";
                UNREACHABLE();
            }
            return length;
        }
        CVector<uint8_t> tmpBuf;
-        const uint8_t *data = EcmaString::GetUtf8DataFlat(this, tmpBuf);
-        return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, len, maxLength, start);
+        const uint8_t *data = GetUtf8DataFlat(this, tmpBuf);
+        return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, length, maxLength);
    }

    std::u16string ToU16String(uint32_t len = 0);