!8597 rewrite IsUtf8EqualsUtf16 function

Merge pull request !8597 from 马昌友/master
2024-10-07 08:03:29 +00:00 · 2024-08-20 19:37:06 +00:00 · 2024-08-20 19:37:06 +00:00 · c06fef1949
commit c06fef1949
parent 5ea79840e2 e4cdc3f18e
7 changed files with 653 additions and 23 deletions
--- a/ecmascript/base/tests/utf_helper_test.cpp
+++ b/ecmascript/base/tests/utf_helper_test.cpp
@ -673,4 +673,353 @@ HWTEST_F_L0(UtfHelperTest, ConvertUtf8ToUnicodeChar)
    unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr13, UtfLength::FOUR);
    EXPECT_EQ(unicodeRes, invalidValue);
 }
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: Five ASCII bytes must be counted as five UTF-16 code units
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_001) {
+    const std::string utf8 = "Hello";
+    // Utf8ToUtf16Size only reports a length, so the count is asserted directly
+    // instead of being routed through a zero-filled scratch vector.
+    const size_t expectedUnits = 5; // 'H' 'e' 'l' 'l' 'o'
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: Six 3-byte CJK characters must be counted as six UTF-16 code units
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_002) {
+    const std::string utf8 = "你好，世界！";
+    // Each character here is a 3-byte sequence and maps to a single UTF-16 code unit.
+    const size_t expectedUnits = 6;
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: An empty input must be counted as zero UTF-16 code units
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_003) {
+    const std::string utf8 = "";
+    const size_t expectedUnits = 0; // nothing to count
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: Mixed ASCII and CJK input is counted per character
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_004) {
+    const std::string utf8 = "Hello, 你好";
+    // "Hello, " contributes 7 units and "你好" contributes 2, for the whole string.
+    const size_t expectedUnits = 9;
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: The reported size is the full length of the input; the function takes
+* no destination buffer, so no buffer limit can shorten it
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_005) {
+    const std::string utf8 = "你好，世界！";
+    const size_t expectedUnits = 6; // all six characters, one code unit each
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: A 4-byte lead followed by bytes that are not valid continuation bytes
+* is still counted as one 4-byte sequence
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_006) {
+    const std::string utf8 = "\xF0\x28\x8C\x28";
+    // Continuation bytes are not validated; the decoded value is above 0xFFFF,
+    // so the sequence is reported as a surrogate pair.
+    const size_t expectedUnits = 2;
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: Three single-byte characters must be counted as three UTF-16 code units
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_007) {
+    const std::string utf8 = "ABC"; // All are single byte characters
+    const size_t expectedUnits = 3; // 'A' 'B' 'C'
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: Two 2-byte UTF-8 sequences must be counted as two UTF-16 code units
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_008) {
+    const std::string utf8 = "\xC2\xA2\xC3\xBC"; // They are ¢ and ü, respectively
+    const size_t expectedUnits = 2; // one code unit per 2-byte sequence
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: One 3-byte UTF-8 sequence must be counted as one UTF-16 code unit
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_009) {
+    const std::string utf8 = "\xE2\x82\xAC"; // euro: €
+    const size_t expectedUnits = 1;
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: One 4-byte UTF-8 sequence above U+FFFF must be counted as a surrogate pair
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_010) {
+    const std::string utf8 = "\xF0\x9F\x98\x8E"; // Emoji 😎
+    const size_t expectedUnits = 2; // high surrogate + low surrogate
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: An embedded zero byte is counted like any other single-byte character
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_011) {
+    // The length-taking constructor keeps the embedded NUL; constructing from the bare
+    // literal would stop at the first zero byte.
+    const std::string utf8Nul("Hello\0World", 11);
+    const size_t expectedUnits = 11; // "Hello" + NUL + "World"
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8Nul.data()), utf8Nul.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: Utf8ToUtf16Size
+* @tc.desc: Two consecutive overlong 2-byte sequences are each counted as one code unit
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_012) {
+    const std::string utf8 = "\xC0\x80\xC0\x80"; // Continuous illegal sequence
+    // 0xC0 is handled as a 2-byte lead without validation, so each pair counts once.
+    const size_t expectedUnits = 2;
+    const size_t units = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+    EXPECT_EQ(units, expectedUnits);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: ASCII input is widened byte by byte into UTF-16 code units
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_001) {
+    const std::string input = "Hello";
+    const std::vector<uint16_t> want = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F}; // 'H' 'e' 'l' 'l' 'o'
+    std::vector<uint16_t> out(10);
+    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
+    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
+    // Trim the scratch buffer to what was actually produced before comparing.
+    out.resize(written);
+    EXPECT_EQ(out, want);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: 3-byte CJK sequences are each decoded to one UTF-16 code unit
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_002) {
+    const std::string input = "你好，世界！";
+    const std::vector<uint16_t> want = {0x4F60, 0x597D, 0xFF0C, 0x4E16, 0x754C, 0xFF01}; // "你好，世界！"
+    std::vector<uint16_t> out(10);
+    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
+    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
+    // Trim the scratch buffer to what was actually produced before comparing.
+    out.resize(written);
+    EXPECT_EQ(out, want);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: Converting an empty input produces no code units
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_003) {
+    const std::string input = "";
+    const std::vector<uint16_t> want = {}; // nothing should be written
+    std::vector<uint16_t> out(10);
+    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
+    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
+    // Trim the scratch buffer to what was actually produced before comparing.
+    out.resize(written);
+    EXPECT_EQ(out, want);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: Only the leading part of the input is converted when utf8Len is shorter
+* than the string
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_004) {
+    const std::string utf8 = "Hello, 你好";
+    // "Hello, " occupies 7 bytes and "你" occupies 3, so the first 10 bytes end exactly after "你".
+    const size_t prefixBytes = 10;
+    const std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002C, 0x20, 0x4F60};
+    std::vector<uint16_t> utf16(10);
+    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
+        utf16.data(), prefixBytes, utf16.size(), 0);
+    utf16.resize(converted);
+    EXPECT_EQ(utf16, expected_utf16);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: Conversion stops once the destination buffer is full
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_005) {
+    const std::string input = "你好，世界！";
+    const std::vector<uint16_t> want = {0x4F60, 0x597D}; // "你好"
+    std::vector<uint16_t> out(2); // room for two code units only
+    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
+    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
+    // Trim the scratch buffer to what was actually produced before comparing.
+    out.resize(written);
+    EXPECT_EQ(out, want);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: Malformed UTF-8 (a 4-byte lead followed by invalid continuation bytes)
+* is not rejected; some output is still produced
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_006) {
+    std::string utf8 = "\xF0\x28\x8C\x28";
+    // The assertion below only requires that the result is NOT empty; the exact
+    // code units produced for malformed input are deliberately left unpinned.
+    const std::vector<uint16_t> emptyOutput = {};
+    std::vector<uint16_t> utf16(10);
+    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
+        utf16.data(), utf8.size(), utf16.size(), 0);
+    utf16.resize(converted);
+    EXPECT_NE(utf16, emptyOutput);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: Three single-byte characters become three identical-valued code units
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_007) {
+    const std::string input = "ABC"; // single-byte characters only
+    const std::vector<uint16_t> want = {0x0041, 0x0042, 0x0043}; // 'A' 'B' 'C'
+    std::vector<uint16_t> out(10);
+    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
+    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
+    // Trim the scratch buffer to what was actually produced before comparing.
+    out.resize(written);
+    EXPECT_EQ(out, want);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: 2-byte UTF-8 sequences are each decoded to one UTF-16 code unit
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_008) {
+    const std::string input = "\xC2\xA2\xC3\xBC"; // ¢ followed by ü
+    const std::vector<uint16_t> want = {0x00A2, 0x00FC}; // U+00A2, U+00FC
+    std::vector<uint16_t> out(10);
+    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
+    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
+    // Trim the scratch buffer to what was actually produced before comparing.
+    out.resize(written);
+    EXPECT_EQ(out, want);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: A 3-byte UTF-8 sequence is decoded to one UTF-16 code unit
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_009) {
+    const std::string input = "\xE2\x82\xAC"; // euro sign €
+    const std::vector<uint16_t> want = {0x20AC}; // U+20AC
+    std::vector<uint16_t> out(10);
+    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
+    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
+    // Trim the scratch buffer to what was actually produced before comparing.
+    out.resize(written);
+    EXPECT_EQ(out, want);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: A 4-byte UTF-8 sequence is decoded to a UTF-16 surrogate pair
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_010) {
+    const std::string input = "\xF0\x9F\x98\x8E"; // Emoji 😎
+    const std::vector<uint16_t> want = {0xD83D, 0xDE0E}; // high surrogate, low surrogate
+    std::vector<uint16_t> out(10);
+    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
+    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
+    // Trim the scratch buffer to what was actually produced before comparing.
+    out.resize(written);
+    EXPECT_EQ(out, want);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: An embedded zero byte is converted like any other single-byte character
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_011) {
+    // The length-taking constructor keeps the embedded NUL; constructing from the bare
+    // literal would stop at the first zero byte.
+    const std::string utf8Nul("Hello\0World", 11);
+    std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F,
+        0x0000, 0x0057, 0x006F, 0x0072, 0x006C, 0x0064}; // Including NULL characters
+    std::vector<uint16_t> utf16(15);
+    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8Nul.data()),
+        utf16.data(), utf8Nul.size(), utf16.size(), 0);
+    utf16.resize(converted);
+    EXPECT_EQ(utf16, expected_utf16);
+}
+
+/*
+* @tc.name: ConvertRegionUtf8ToUtf16
+* @tc.desc: Consecutive overlong 2-byte sequences are not rejected; some output is
+* still produced
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_012) {
+    std::string utf8 = "\xC0\x80\xC0\x80"; // Continuous illegal sequence
+    // The assertion below only requires that the result is NOT empty; the exact
+    // code units produced for malformed input are deliberately left unpinned.
+    const std::vector<uint16_t> emptyOutput = {};
+    std::vector<uint16_t> utf16(10);
+    size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
+        utf16.data(), utf8.size(), utf16.size(), 0);
+    utf16.resize(converted);
+    EXPECT_NE(utf16, emptyOutput);
+}
 } // namespace panda:test
--- a/ecmascript/base/utf_helper.cpp
+++ b/ecmascript/base/utf_helper.cpp
@ -24,6 +24,7 @@ static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10
    ((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET)

 namespace panda::ecmascript::base::utf_helper {
+
 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
 {
    ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
@ -346,13 +347,118 @@ std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com

 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
 {
-    return utf::MUtf8ToUtf16Size(utf8, utf8Len);
+    size_t in_pos = 0;
+    size_t res = 0;
+    while (in_pos < utf8Len) {
+        uint8_t src = utf8[in_pos];
+        switch (src & 0xF0) {
+            case 0xF0: {
+                const uint8_t c2 = utf8[++in_pos];
+                const uint8_t c3 = utf8[++in_pos];
+                const uint8_t c4 = utf8[++in_pos];
+                uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
+                    ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
+                if (codePoint >= SURROGATE_RAIR_START) {
+                    res += CONST_2;
+                } else {
+                    res++;
+                }
+                in_pos++;
+                break;
+            }
+            case 0xE0: {
+                in_pos += CONST_3;
+                res++;
+                break;
+            }
+            case 0xD0:
+            case 0xC0: {
+                in_pos += CONST_2;
+                res++;
+                break;
+            }
+            default:
+                do {
+                    in_pos++;
+                    res++;
+                } while (in_pos < utf8Len && utf8[in_pos] < 0x80);
+                break;
+        }
+    }
+    return res;
+}
+
+// Returns how many input bytes the UTF-8 sequence starting with lead byte `data` occupies,
+// given that max_bytes bytes remain in the buffer. The result is always between 1 and 4.
+// ASCII bytes and stray continuation bytes (10xxxxxx) count as one byte. A lead byte whose
+// sequence would not fit in max_bytes is also stepped over as a single byte, so the caller
+// never advances past the end of its buffer.
+size_t ConvertUtf8ToUtf16Int(const uint8_t data, size_t max_bytes)
+{
+    constexpr uint8_t CONTINUATION_MASK = 0xC0;
+    constexpr uint8_t CONTINUATION_TAG = 0x80;
+    if ((data & MASK1) == 0 || (data & CONTINUATION_MASK) == CONTINUATION_TAG) {
+        return 1;
+    }
+    // MASK2 clear: 110xxxxx (2 bytes); MASK3 clear: 1110xxxx (3 bytes); otherwise 4 bytes.
+    size_t nbytes = CONST_4;
+    if ((data & MASK2) == 0) {
+        nbytes = CONST_2;
+    } else if ((data & MASK3) == 0) {
+        nbytes = CONST_3;
+    }
+    // Compare against the length this lead byte actually needs, not a blanket 4.
+    return nbytes <= max_bytes ? nbytes : 1;
 }

 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
                                size_t start)
 {
-    return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
+    size_t in_pos = 0;
+    size_t out_pos = 0;
+
+    while (in_pos < utf8Len && start > 0) {
+        auto nbytes = ConvertUtf8ToUtf16Int(utf8In[in_pos], utf8Len - in_pos);
+        in_pos += nbytes;
+        start -= nbytes;
+    }
+
+    while (in_pos < utf8Len && out_pos < utf16Len) {
+        uint8_t src = utf8In[in_pos];
+        switch (src & 0xF0) {
+            case 0xF0: {
+                const uint8_t c2 = utf8In[++in_pos];
+                const uint8_t c3 = utf8In[++in_pos];
+                const uint8_t c4 = utf8In[++in_pos];
+                uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
+                    ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
+                if (codePoint >= SURROGATE_RAIR_START) {
+                    if (out_pos >= utf16Len - 1) {
+                        return out_pos - 1;
+                    }
+                    codePoint -= SURROGATE_RAIR_START;
+                    utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
+                    utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
+                } else {
+                    utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
+                }
+                in_pos++;
+                break;
+            }
+            case 0xE0: {
+                const uint8_t c2 = utf8In[++in_pos];
+                const uint8_t c3 = utf8In[++in_pos];
+                utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
+                    ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS));
+                in_pos++;
+                break;
+            }
+            case 0xD0:
+            case 0xC0: {
+                const uint8_t c2 = utf8In[++in_pos];
+                utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS));
+                in_pos++;
+                break;
+            }
+            default:
+                do {
+                    utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
+                } while (in_pos < utf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
+                break;
+        }
+    }
+    return out_pos;
 }

 size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)
--- a/ecmascript/base/utf_helper.h
+++ b/ecmascript/base/utf_helper.h
@ -23,6 +23,24 @@
 #include "ecmascript/common.h"

 namespace panda::ecmascript::base::utf_helper {
+
+static constexpr size_t CONST_2 = 2;  // length of a 2-byte UTF-8 sequence; code units in a surrogate pair
+static constexpr size_t CONST_3 = 3;  // length of a 3-byte UTF-8 sequence
+static constexpr size_t CONST_4 = 4;  // length of a 4-byte UTF-8 sequence
+static constexpr size_t MASK1 = 0x80;  // bit 7 of a lead byte; clear means a single-byte character
+static constexpr size_t MASK2 = 0x20;  // bit 5 of a lead byte; clear is treated as a 2-byte sequence
+static constexpr size_t MASK3 = 0x10;  // bit 4 of a lead byte; clear is treated as a 3-byte sequence
+static constexpr size_t LOW_3BITS = 0x7;  // payload bits taken from the lead byte of a 4-byte sequence
+static constexpr size_t LOW_4BITS = 0xF;  // payload bits taken from the lead byte of a 3-byte sequence
+static constexpr size_t LOW_5BITS = 0x1F;  // payload bits taken from the lead byte of a 2-byte sequence
+static constexpr size_t LOW_6BITS = 0x3F;  // payload bits taken from each continuation byte
+static constexpr size_t L_SURROGATE_START = 0xDC00;  // OR-ed onto the low 10 bits to form the second pair unit
+static constexpr size_t H_SURROGATE_START = 0xD800;  // OR-ed onto the high bits to form the first pair unit
+static constexpr size_t SURROGATE_RAIR_START = 0x10000;  // first value needing a pair; NOTE(review): "RAIR" is presumably a typo for "PAIR"
+static constexpr size_t OFFSET_18POS = 18;  // left shift applied to the lead-byte payload of a 4-byte sequence
+static constexpr size_t OFFSET_12POS = 12;  // left shift applied to the payload that sits 12 bits up
+static constexpr size_t OFFSET_10POS = 10;  // right shift that extracts the high-surrogate bits
+static constexpr size_t OFFSET_6POS = 6;  // left shift applied to the payload that sits 6 bits up
 static constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
 static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
 static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
--- a/ecmascript/ecma_string.cpp
+++ b/ecmascript/ecma_string.cpp
@ -19,6 +19,18 @@

 namespace panda::ecmascript {

+constexpr size_t LOW_3BITS = 0x7;  // payload bits taken from the lead byte of a 4-byte UTF-8 sequence
+constexpr size_t LOW_4BITS = 0xF;  // payload bits taken from the lead byte of a 3-byte UTF-8 sequence
+constexpr size_t LOW_5BITS = 0x1F;  // payload bits taken from the lead byte of a 2-byte UTF-8 sequence
+constexpr size_t LOW_6BITS = 0x3F;  // payload bits taken from each continuation byte
+constexpr size_t L_SURROGATE_START = 0xDC00;  // OR-ed onto the low 10 bits to form the second pair unit
+constexpr size_t H_SURROGATE_START = 0xD800;  // OR-ed onto the high bits to form the first pair unit
+constexpr size_t SURROGATE_RAIR_START = 0x10000;  // first value compared as a pair; NOTE(review): "RAIR" is presumably a typo for "PAIR"
+constexpr size_t OFFSET_18POS = 18;  // left shift applied to the lead-byte payload of a 4-byte sequence
+constexpr size_t OFFSET_12POS = 12;  // left shift applied to the payload that sits 12 bits up
+constexpr size_t OFFSET_10POS = 10;  // right shift that extracts the high-surrogate bits
+constexpr size_t OFFSET_6POS = 6;  // left shift applied to the payload that sits 6 bits up
+
 EcmaString *EcmaString::Concat(const EcmaVM *vm,
    const JSHandle<EcmaString> &left, const JSHandle<EcmaString> &right, MemSpaceType type)
 {
@ -831,31 +843,66 @@ uint32_t EcmaString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t le
 }

 /* static */
-bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data,
-                                   uint32_t utf16Len)
+// Returns true when the utf8Len bytes at utf8Data decode to exactly the utf16Len code units
+// at utf16Data. The UTF-8 input is decoded on the fly and compared unit by unit; continuation
+// bytes are not validated. Both inputs must be consumed completely for the result to be true.
+// A multi-byte lead whose continuation bytes would lie beyond utf8Len makes the comparison
+// fail without reading past the end of the buffer.
+bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len,
+                                   const uint16_t *utf16Data, uint32_t utf16Len)
 {
-    size_t utf8Pos = 0;
-    size_t utf16Pos = 0;
-    while (utf8Pos < utf8Len) {
-        auto [pair, nbytes] = utf::ConvertMUtf8ToUtf16Pair(utf8Data, utf8Len - utf8Pos);
-        auto [pHigh, pLow] = utf::SplitUtf16Pair(pair);
-        utf8Data += nbytes;
-        utf8Pos += nbytes;
-        if (pHigh != 0) {
-            ASSERT(utf16Len > 0);
-            if (utf16Pos >= utf16Len - 1 || *utf16Data != pHigh) {
-                return false;
+    constexpr size_t TWO_BYTE_SEQ = 2;
+    constexpr size_t THREE_BYTE_SEQ = 3;
+    constexpr size_t FOUR_BYTE_SEQ = 4;
+    const uint8_t *utf8End = utf8Data + utf8Len;
+    const uint16_t *utf16End = utf16Data + utf16Len;
+    while (utf8Data < utf8End && utf16Data < utf16End) {
+        uint8_t src = *utf8Data;
+        // Bytes available from the lead byte onwards; checked before any continuation byte is read.
+        const size_t remaining = static_cast<size_t>(utf8End - utf8Data);
+        switch (src & 0xF0) {
+            case 0xF0: {
+                if (remaining < FOUR_BYTE_SEQ) {
+                    return false;
+                }
+                const uint8_t c2 = *(++utf8Data);
+                const uint8_t c3 = *(++utf8Data);
+                const uint8_t c4 = *(++utf8Data);
+                uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
+                                     ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
+                if (codePoint >= SURROGATE_RAIR_START) {
+                    // A surrogate pair needs two UTF-16 units to compare against.
+                    if (utf16Data >= utf16End - 1) {
+                        return false;
+                    }
+                    codePoint -= SURROGATE_RAIR_START;
+                    if (*utf16Data++ != static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START) ||
+                        *utf16Data++ != static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START)) {
+                        return false;
+                    }
+                } else {
+                    if (*utf16Data++ != static_cast<uint16_t>(codePoint)) {
+                        return false;
+                    }
+                }
+                utf8Data++;
+                break;
             }
-            ++utf16Pos;
-            ++utf16Data;
+            case 0xE0: {
+                if (remaining < THREE_BYTE_SEQ) {
+                    return false;
+                }
+                const uint8_t c2 = *(++utf8Data);
+                const uint8_t c3 = *(++utf8Data);
+                if (*utf16Data++ != static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
+                    ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS))) {
+                    return false;
+                }
+                utf8Data++;
+                break;
+            }
+            case 0xD0:
+            case 0xC0: {
+                if (remaining < TWO_BYTE_SEQ) {
+                    return false;
+                }
+                const uint8_t c2 = *(++utf8Data);
+                if (*utf16Data++ != static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS))) {
+                    return false;
+                }
+                utf8Data++;
+                break;
+            }
+            default:
+                // Compare the current byte, then the rest of a run of ASCII bytes, one unit each.
+                do {
+                    if (*utf16Data++ != static_cast<uint16_t>(*utf8Data++)) {
+                        return false;
+                    }
+                } while (utf8Data < utf8End && utf16Data < utf16End && *utf8Data < 0x80);
+                break;
         }
-        if (utf16Pos >= utf16Len || *utf16Data != pLow) {
-            return false;
-        }
-        ++utf16Pos;
-        ++utf16Data;
     }
-    return true;
+    // Equal only if neither side has anything left over.
+    return utf8Data == utf8End && utf16Data == utf16End;
 }

 bool EcmaString::ToElementIndex(uint32_t *index)
--- a/ecmascript/ecma_string.h
+++ b/ecmascript/ecma_string.h
@ -34,6 +34,9 @@
 #include "unicode/locid.h"

 namespace panda {
+namespace test {
+    class EcmaStringEqualsTest;  // forward declaration; named by a friend declaration later in this header
+}
 namespace ecmascript {
 template<typename T>
 class JSHandle;
@ -105,6 +108,7 @@ private:
    friend class SlicedString;
    friend class FlatStringInfo;
    friend class NameDictionary;
+    friend class panda::test::EcmaStringEqualsTest;

    static constexpr int SMALL_STRING_SIZE = 128;

--- a/ecmascript/tests/BUILD.gn
+++ b/ecmascript/tests/BUILD.gn
@ -60,6 +60,7 @@ host_unittest_action("EcmaVm_002_Test") {
    # "ecma_string_test.cpp",
    "ecma_context_test.cpp",
    "ecma_string_accessor_test.cpp",
+    "ecma_string_equals_test.cpp",
  ]

  configs = [
--- a/ecmascript/tests/ecma_string_equals_test.cpp
+++ b/ecmascript/tests/ecma_string_equals_test.cpp
@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2024 Huawei Device Co., Ltd.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ecmascript/ecma_string-inl.h"
+#include "ecmascript/object_factory.h"
+#include "ecmascript/tests/ecma_test_common.h"
+
+using namespace panda::ecmascript;
+
+namespace panda::test {
+class EcmaStringEqualsTest : public BaseTestWithScope<false> {
+public:
+    // Thin forwarder to EcmaString::IsUtf8EqualsUtf16 so that test bodies can reach it
+    // through this fixture; arguments and result are passed through unchanged.
+    static bool IsUtf8EqualsUtf16UT(const uint8_t *utf8Data, size_t utf8Len,
+                                    const uint16_t *utf16Data, uint32_t utf16Len)
+    {
+        const bool equal = EcmaString::IsUtf8EqualsUtf16(utf8Data, utf8Len, utf16Data, utf16Len);
+        return equal;
+    }
+};
+
+/*
+* @tc.name: IsUtf8EqualsUtf16
+* @tc.desc: Checks the comparison of a UTF-8 byte array with a UTF-16 code unit array
+* for matching inputs, length mismatches, truncated input and empty input
+* @tc.type: FUNC
+*/
+HWTEST_F_L0(EcmaStringEqualsTest, IsUtf8EqualsUtf16)
+{
+    {   // Case 1: ASCII only (the literal's trailing NUL is excluded by the length argument)
+        const uint8_t narrow[] = "hello";
+        const uint16_t wide[] = {'h', 'e', 'l', 'l', 'o'};
+        EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 5, wide, 5));
+    }
+    {   // Case 2: two 2-byte sequences, "éè"
+        const uint8_t narrow[] = {0xC3, 0xA9, 0xC3, 0xA8};
+        const uint16_t wide[] = {0x00E9, 0x00E8};
+        EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 4, wide, 2));
+    }
+    {   // Case 3: two 3-byte sequences, "中文"
+        const uint8_t narrow[] = {0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87};
+        const uint16_t wide[] = {0x4E2D, 0x6587};
+        EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 6, wide, 2));
+    }
+    {   // Case 4: one 4-byte sequence (😁) against its surrogate pair
+        const uint8_t narrow[] = {0xF0, 0x9F, 0x98, 0x81};
+        const uint16_t wide[] = {0xD83D, 0xDE01};
+        EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 4, wide, 2));
+    }
+    {   // Case 5: both sides empty
+        const uint8_t *narrow = nullptr;
+        const uint16_t *wide = nullptr;
+        EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 0, wide, 0));
+    }
+    {   // Case 6: UTF-16 side has extra trailing units
+        const uint8_t narrow[] = "test";
+        const uint16_t wide[] = {'t', 'e', 's', 't', '!', '!'};
+        EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 4, wide, 6));
+    }
+    {   // Case 7: UTF-8 side has an extra trailing character (😁😁 against 😁)
+        const uint8_t narrow[] = {0xF0, 0x9F, 0x98, 0x81, 0xF0, 0x9F, 0x98, 0x81};
+        const uint16_t wide[] = {0xD83D, 0xDE01};
+        EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 8, wide, 2));
+    }
+    {   // Case 8: 4-byte sequence (U+1F4A9) against a lone high surrogate
+        const uint8_t narrow[] = {0xF0, 0x9F, 0x92, 0xA9};
+        const uint16_t wide[] = {0xD83D}; // low surrogate missing
+        EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 4, wide, 1));
+    }
+    {   // Case 9: only two of the three bytes of "あ" are supplied
+        const uint8_t narrow[] = {0xE3, 0x81};
+        const uint16_t wide[] = {0x3042}; // complete "あ"
+        EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 2, wide, 1));
+    }
+    {   // Case 10: "££" in UTF-8 against a single "£" in UTF-16
+        const uint8_t narrow[] = {0xC2, 0xA3, 0xC2, 0xA3};
+        const uint16_t wide[] = {0x00A3};
+        EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 4, wide, 1));
+    }
+    {   // Case 11: noncharacter U+FFFE on both sides
+        const uint8_t narrow[] = {0xEF, 0xBF, 0xBE};
+        const uint16_t wide[] = {0xFFFE};
+        EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 3, wide, 1));
+    }
+    {   // Case 12: empty UTF-8 against non-empty UTF-16
+        const uint8_t *narrow = nullptr;
+        const uint16_t wide[] = {0x0061}; // "a"
+        EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 0, wide, 1));
+    }
+    {   // Case 13: non-empty UTF-8 against empty UTF-16
+        const uint8_t narrow[] = {0x61}; // "a"
+        const uint16_t *wide = nullptr;
+        EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(narrow, 1, wide, 0));
+    }
+}
+}