mirror of
https://gitee.com/openharmony/arkcompiler_ets_runtime
synced 2024-10-07 08:03:29 +00:00
!8597 rewrite IsUtf8EqualsUtf16 function
Merge pull request !8597 from 马昌友/master
This commit is contained in:
commit
c06fef1949
@ -673,4 +673,353 @@ HWTEST_F_L0(UtfHelperTest, ConvertUtf8ToUnicodeChar)
|
||||
unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr13, UtfLength::FOUR);
|
||||
EXPECT_EQ(unicodeRes, invalidValue);
|
||||
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test single byte characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_001)
{
    // Five ASCII bytes: each byte is counted as one UTF-16 code unit.
    const std::string utf8 = "Hello";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    // Utf8ToUtf16Size only reports a count, so assert on the count directly.
    EXPECT_EQ(converted, 5U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test includes Chinese characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_002)
{
    // Six characters, each encoded as a 3-byte UTF-8 sequence: one code unit per character.
    const std::string utf8 = "你好,世界!";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    // Utf8ToUtf16Size only reports a count, so assert on the count directly.
    EXPECT_EQ(converted, 6U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test empty string
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_003)
{
    // Empty input needs no UTF-16 code units.
    const std::string utf8 = "";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 0U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test section conversion
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_004)
{
    // Mixed input: seven ASCII bytes ("Hello, ") followed by two 3-byte characters.
    const std::string utf8 = "Hello, 你好";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    // 7 ASCII units + 2 units for the two 3-byte characters.
    EXPECT_EQ(converted, 9U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test buffer length limit
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_005)
{
    // Utf8ToUtf16Size takes no output buffer, so no buffer limit can apply here:
    // the full input is always measured (six 3-byte characters -> six code units).
    const std::string utf8 = "你好,世界!";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 6U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test for incorrect UTF-8 data
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_006)
{
    // Ill-formed input: a 4-byte lead (0xF0) followed by bytes that are not all continuation bytes.
    const std::string utf8 = "\xF0\x28\x8C\x28";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    // Pins the count currently reported for this ill-formed sequence.
    EXPECT_EQ(converted, 2U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test single byte UTF-8 characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_007)
{
    // Three single-byte (ASCII) characters: A, B, C.
    const std::string utf8 = "ABC";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 3U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Testing Double Byte UTF-8 Characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_008)
{
    // Two 2-byte sequences: ¢ (C2 A2) and ü (C3 BC).
    const std::string utf8 = "\xC2\xA2\xC3\xBC";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    // One code unit per 2-byte sequence.
    EXPECT_EQ(converted, 2U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test three byte UTF-8 characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_009)
{
    // One 3-byte sequence: the euro sign € (E2 82 AC).
    const std::string utf8 = "\xE2\x82\xAC";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 1U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test four byte UTF-8 characters and surrogate pairs
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_010)
{
    // One 4-byte sequence: the emoji 😎 (F0 9F 98 8E).
    const std::string utf8 = "\xF0\x9F\x98\x8E";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    // The code point needs a surrogate pair, i.e. two code units.
    EXPECT_EQ(converted, 2U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test UTF-8 data containing zero bytes
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_011)
{
    // Build the input piecewise: a "Hello\0World" literal passed to std::string would stop at the
    // embedded NUL, so the zero byte is appended explicitly.
    const std::string utf8Nul = std::string("Hello") + '\0' + "World";
    const size_t converted =
        Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8Nul.data()), utf8Nul.size());
    // 5 + 1 (NUL) + 5 single-byte characters.
    EXPECT_EQ(converted, 11U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: Utf8ToUtf16Size
|
||||
* @tc.desc: Test continuous illegal sequences
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_012)
{
    // Two consecutive C0 80 sequences (not well-formed UTF-8).
    const std::string utf8 = "\xC0\x80\xC0\x80";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t *>(utf8.data()), utf8.size());
    // Each C0-led pair is counted as one 2-byte sequence.
    EXPECT_EQ(converted, 2U);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test single byte characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_001)
{
    // ASCII input: every byte becomes one code unit with the same value.
    const std::string input = "Hello";
    const std::vector<uint16_t> wanted = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F};
    std::vector<uint16_t> output(10);
    const auto *bytes = reinterpret_cast<const uint8_t *>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, output.data(), input.size(), output.size(), 0);
    // Trim the scratch buffer to the number of units actually produced.
    output.resize(written);
    EXPECT_EQ(output, wanted);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test includes Chinese characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_002)
{
    // Six 3-byte characters, one code unit each.
    const std::string input = "你好,世界!";
    const std::vector<uint16_t> wanted = {0x4F60, 0x597D, 0xFF0C, 0x4E16, 0x754C, 0xFF01};
    std::vector<uint16_t> output(10);
    const auto *bytes = reinterpret_cast<const uint8_t *>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, output.data(), input.size(), output.size(), 0);
    // Trim the scratch buffer to the number of units actually produced.
    output.resize(written);
    EXPECT_EQ(output, wanted);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test empty string
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_003)
{
    // Empty input must produce no output.
    const std::string input = "";
    const std::vector<uint16_t> wanted = {};
    std::vector<uint16_t> output(10);
    const auto *bytes = reinterpret_cast<const uint8_t *>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, output.data(), input.size(), output.size(), 0);
    output.resize(written);
    EXPECT_EQ(output, wanted);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test section conversion
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_004)
{
    const std::string utf8 = "Hello, 你好";
    // Convert only a prefix of the input: 7 ASCII bytes ("Hello, ") plus the 3 bytes of the
    // first 3-byte character, i.e. 10 bytes in total; the last character is left out.
    constexpr size_t prefixBytes = 10;
    const std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002C, 0x20, 0x4F60};
    std::vector<uint16_t> utf16(10);
    const size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t *>(utf8.data()),
                                                      utf16.data(), prefixBytes, utf16.size(), 0);
    // Trim the scratch buffer to the number of units actually produced.
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test buffer length limit
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_005)
{
    // The output buffer holds only two code units, so only the first two characters fit.
    const std::string input = "你好,世界!";
    const std::vector<uint16_t> wanted = {0x4F60, 0x597D};
    std::vector<uint16_t> output(2);
    const auto *bytes = reinterpret_cast<const uint8_t *>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, output.data(), input.size(), output.size(), 0);
    output.resize(written);
    EXPECT_EQ(output, wanted);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test for incorrect UTF-8 data
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_006)
{
    // Ill-formed input: a 4-byte lead followed by bytes that are not all continuation bytes.
    const std::string input = "\xF0\x28\x8C\x28";
    const std::vector<uint16_t> emptyResult = {};
    std::vector<uint16_t> output(10);
    const auto *bytes = reinterpret_cast<const uint8_t *>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, output.data(), input.size(), output.size(), 0);
    output.resize(written);
    // The test only requires that some output is produced for this input.
    EXPECT_NE(output, emptyResult);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test single byte UTF-8 characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_007)
{
    // Three single-byte characters: A, B, C.
    const std::string input = "ABC";
    const std::vector<uint16_t> wanted = {0x0041, 0x0042, 0x0043};
    std::vector<uint16_t> output(10);
    const auto *bytes = reinterpret_cast<const uint8_t *>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, output.data(), input.size(), output.size(), 0);
    output.resize(written);
    EXPECT_EQ(output, wanted);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Testing Double Byte UTF-8 Characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_008)
{
    // Two 2-byte sequences: ¢ (U+00A2) and ü (U+00FC).
    const std::string input = "\xC2\xA2\xC3\xBC";
    const std::vector<uint16_t> wanted = {0x00A2, 0x00FC};
    std::vector<uint16_t> output(10);
    const auto *bytes = reinterpret_cast<const uint8_t *>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, output.data(), input.size(), output.size(), 0);
    output.resize(written);
    EXPECT_EQ(output, wanted);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test three byte UTF-8 characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_009)
{
    // One 3-byte sequence: the euro sign € (U+20AC).
    const std::string input = "\xE2\x82\xAC";
    const std::vector<uint16_t> wanted = {0x20AC};
    std::vector<uint16_t> output(10);
    const auto *bytes = reinterpret_cast<const uint8_t *>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, output.data(), input.size(), output.size(), 0);
    output.resize(written);
    EXPECT_EQ(output, wanted);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test four byte UTF-8 characters and surrogate pairs
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_010)
{
    // One 4-byte sequence (emoji 😎) that must come out as a high/low surrogate pair.
    const std::string input = "\xF0\x9F\x98\x8E";
    const std::vector<uint16_t> wanted = {0xD83D, 0xDE0E};
    std::vector<uint16_t> output(10);
    const auto *bytes = reinterpret_cast<const uint8_t *>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, output.data(), input.size(), output.size(), 0);
    output.resize(written);
    EXPECT_EQ(output, wanted);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test UTF-8 data containing zero bytes
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_011)
{
    // Build the input piecewise: a "Hello\0World" literal passed to std::string would stop at the
    // embedded NUL, so the zero byte is appended explicitly.
    const std::string utf8Nul = std::string("Hello") + '\0' + "World";
    // The NUL byte must be converted like any other single-byte character.
    const std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F,
                                                  0x0000, 0x0057, 0x006F, 0x0072, 0x006C, 0x0064};
    std::vector<uint16_t> utf16(15);
    const size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t *>(utf8Nul.data()),
                                                      utf16.data(), utf8Nul.size(), utf16.size(), 0);
    // Trim the scratch buffer to the number of units actually produced.
    utf16.resize(converted);
    EXPECT_EQ(utf16, expected_utf16);
}
|
||||
|
||||
/*
|
||||
* @tc.name: ConvertRegionUtf8ToUtf16
|
||||
* @tc.desc: Test continuous illegal sequences
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_012)
{
    // Two consecutive C0 80 sequences (not well-formed UTF-8).
    const std::string input = "\xC0\x80\xC0\x80";
    const std::vector<uint16_t> emptyResult = {};
    std::vector<uint16_t> output(10);
    const auto *bytes = reinterpret_cast<const uint8_t *>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, output.data(), input.size(), output.size(), 0);
    output.resize(written);
    // The test only requires that some output is produced for this input.
    EXPECT_NE(output, emptyResult);
}
|
||||
} // namespace panda::test
|
||||
|
@ -24,6 +24,7 @@ static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10
|
||||
((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET)
|
||||
|
||||
namespace panda::ecmascript::base::utf_helper {
|
||||
|
||||
uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
|
||||
{
|
||||
ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
|
||||
@ -346,13 +347,118 @@ std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com
|
||||
|
||||
size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
|
||||
{
|
||||
return utf::MUtf8ToUtf16Size(utf8, utf8Len);
|
||||
size_t in_pos = 0;
|
||||
size_t res = 0;
|
||||
while (in_pos < utf8Len) {
|
||||
uint8_t src = utf8[in_pos];
|
||||
switch (src & 0xF0) {
|
||||
case 0xF0: {
|
||||
const uint8_t c2 = utf8[++in_pos];
|
||||
const uint8_t c3 = utf8[++in_pos];
|
||||
const uint8_t c4 = utf8[++in_pos];
|
||||
uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
|
||||
((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
|
||||
if (codePoint >= SURROGATE_RAIR_START) {
|
||||
res += CONST_2;
|
||||
} else {
|
||||
res++;
|
||||
}
|
||||
in_pos++;
|
||||
break;
|
||||
}
|
||||
case 0xE0: {
|
||||
in_pos += CONST_3;
|
||||
res++;
|
||||
break;
|
||||
}
|
||||
case 0xD0:
|
||||
case 0xC0: {
|
||||
in_pos += CONST_2;
|
||||
res++;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
do {
|
||||
in_pos++;
|
||||
res++;
|
||||
} while (in_pos < utf8Len && utf8[in_pos] < 0x80);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
size_t ConvertUtf8ToUtf16Int(const uint8_t data, size_t max_bytes)
|
||||
{
|
||||
if ((data & MASK1) == 0 || max_bytes < CONST_4) {
|
||||
return 1;
|
||||
}
|
||||
if ((data & MASK2) == 0) {
|
||||
return CONST_2;
|
||||
}
|
||||
if ((data & MASK3) == 0) {
|
||||
return CONST_3;
|
||||
}
|
||||
return CONST_4;
|
||||
}
|
||||
|
||||
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
|
||||
size_t start)
|
||||
{
|
||||
return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
|
||||
size_t in_pos = 0;
|
||||
size_t out_pos = 0;
|
||||
|
||||
while (in_pos < utf8Len && start > 0) {
|
||||
auto nbytes = ConvertUtf8ToUtf16Int(utf8In[in_pos], utf8Len - in_pos);
|
||||
in_pos += nbytes;
|
||||
start -= nbytes;
|
||||
}
|
||||
|
||||
while (in_pos < utf8Len && out_pos < utf16Len) {
|
||||
uint8_t src = utf8In[in_pos];
|
||||
switch (src & 0xF0) {
|
||||
case 0xF0: {
|
||||
const uint8_t c2 = utf8In[++in_pos];
|
||||
const uint8_t c3 = utf8In[++in_pos];
|
||||
const uint8_t c4 = utf8In[++in_pos];
|
||||
uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
|
||||
((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
|
||||
if (codePoint >= SURROGATE_RAIR_START) {
|
||||
if (out_pos >= utf16Len - 1) {
|
||||
return out_pos - 1;
|
||||
}
|
||||
codePoint -= SURROGATE_RAIR_START;
|
||||
utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
|
||||
utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
|
||||
} else {
|
||||
utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
|
||||
}
|
||||
in_pos++;
|
||||
break;
|
||||
}
|
||||
case 0xE0: {
|
||||
const uint8_t c2 = utf8In[++in_pos];
|
||||
const uint8_t c3 = utf8In[++in_pos];
|
||||
utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
|
||||
((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS));
|
||||
in_pos++;
|
||||
break;
|
||||
}
|
||||
case 0xD0:
|
||||
case 0xC0: {
|
||||
const uint8_t c2 = utf8In[++in_pos];
|
||||
utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS));
|
||||
in_pos++;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
do {
|
||||
utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
|
||||
} while (in_pos < utf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return out_pos;
|
||||
}
|
||||
|
||||
size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)
|
||||
|
@ -23,6 +23,24 @@
|
||||
#include "ecmascript/common.h"
|
||||
|
||||
namespace panda::ecmascript::base::utf_helper {
|
||||
|
||||
// Bit masks used to classify a UTF-8 lead byte.
static constexpr size_t MASK1 = 0x80;
static constexpr size_t MASK2 = 0x20;
static constexpr size_t MASK3 = 0x10;

// Small integer constants (sequence lengths and offsets in bytes / code units).
static constexpr size_t CONST_2 = 2;
static constexpr size_t CONST_3 = 3;
static constexpr size_t CONST_4 = 4;

// Masks selecting the payload bits of UTF-8 lead and continuation bytes.
static constexpr size_t LOW_3BITS = 0x7;
static constexpr size_t LOW_4BITS = 0xF;
static constexpr size_t LOW_5BITS = 0x1F;
static constexpr size_t LOW_6BITS = 0x3F;

// Shift distances used when assembling a code point or splitting it into surrogates.
static constexpr size_t OFFSET_6POS = 6;
static constexpr size_t OFFSET_10POS = 10;
static constexpr size_t OFFSET_12POS = 12;
static constexpr size_t OFFSET_18POS = 18;

// UTF-16 surrogate bases and the first code point that needs a surrogate pair.
static constexpr size_t H_SURROGATE_START = 0xD800;
static constexpr size_t L_SURROGATE_START = 0xDC00;
static constexpr size_t SURROGATE_RAIR_START = 0x10000;
|
||||
static constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
|
||||
static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
|
||||
static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
|
||||
|
@ -19,6 +19,18 @@
|
||||
|
||||
namespace panda::ecmascript {
|
||||
|
||||
// Masks selecting the payload bits of UTF-8 lead and continuation bytes.
constexpr size_t LOW_3BITS = 0x7;
constexpr size_t LOW_4BITS = 0xF;
constexpr size_t LOW_5BITS = 0x1F;
constexpr size_t LOW_6BITS = 0x3F;

// Shift distances used when assembling a code point or splitting it into surrogates.
constexpr size_t OFFSET_6POS = 6;
constexpr size_t OFFSET_10POS = 10;
constexpr size_t OFFSET_12POS = 12;
constexpr size_t OFFSET_18POS = 18;

// UTF-16 surrogate bases and the first code point that needs a surrogate pair.
constexpr size_t H_SURROGATE_START = 0xD800;
constexpr size_t L_SURROGATE_START = 0xDC00;
constexpr size_t SURROGATE_RAIR_START = 0x10000;
|
||||
|
||||
EcmaString *EcmaString::Concat(const EcmaVM *vm,
|
||||
const JSHandle<EcmaString> &left, const JSHandle<EcmaString> &right, MemSpaceType type)
|
||||
{
|
||||
@ -831,31 +843,66 @@ uint32_t EcmaString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t le
|
||||
}
|
||||
|
||||
/* static */
|
||||
bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data,
|
||||
uint32_t utf16Len)
|
||||
bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len,
|
||||
const uint16_t *utf16Data, uint32_t utf16Len)
|
||||
{
|
||||
size_t utf8Pos = 0;
|
||||
size_t utf16Pos = 0;
|
||||
while (utf8Pos < utf8Len) {
|
||||
auto [pair, nbytes] = utf::ConvertMUtf8ToUtf16Pair(utf8Data, utf8Len - utf8Pos);
|
||||
auto [pHigh, pLow] = utf::SplitUtf16Pair(pair);
|
||||
utf8Data += nbytes;
|
||||
utf8Pos += nbytes;
|
||||
if (pHigh != 0) {
|
||||
ASSERT(utf16Len > 0);
|
||||
if (utf16Pos >= utf16Len - 1 || *utf16Data != pHigh) {
|
||||
const uint8_t *utf8End = utf8Data + utf8Len;
|
||||
const uint16_t *utf16End = utf16Data + utf16Len;
|
||||
while (utf8Data < utf8End && utf16Data < utf16End) {
|
||||
uint8_t src = *utf8Data;
|
||||
switch (src & 0xF0) {
|
||||
case 0xF0: {
|
||||
const uint8_t c2 = *(++utf8Data);
|
||||
const uint8_t c3 = *(++utf8Data);
|
||||
const uint8_t c4 = *(++utf8Data);
|
||||
uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
|
||||
((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
|
||||
if (codePoint >= SURROGATE_RAIR_START) {
|
||||
if (utf16Data >= utf16End - 1) {
|
||||
return false;
|
||||
}
|
||||
++utf16Pos;
|
||||
++utf16Data;
|
||||
}
|
||||
if (utf16Pos >= utf16Len || *utf16Data != pLow) {
|
||||
codePoint -= SURROGATE_RAIR_START;
|
||||
if (*utf16Data++ != static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START) ||
|
||||
*utf16Data++ != static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START)) {
|
||||
return false;
|
||||
}
|
||||
++utf16Pos;
|
||||
++utf16Data;
|
||||
} else {
|
||||
if (*utf16Data++ != static_cast<uint16_t>(codePoint)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
utf8Data++;
|
||||
break;
|
||||
}
|
||||
case 0xE0: {
|
||||
const uint8_t c2 = *(++utf8Data);
|
||||
const uint8_t c3 = *(++utf8Data);
|
||||
if (*utf16Data++ != static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
|
||||
((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS))) {
|
||||
return false;
|
||||
}
|
||||
utf8Data++;
|
||||
break;
|
||||
}
|
||||
case 0xD0:
|
||||
case 0xC0: {
|
||||
const uint8_t c2 = *(++utf8Data);
|
||||
if (*utf16Data++ != static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS))) {
|
||||
return false;
|
||||
}
|
||||
utf8Data++;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
do {
|
||||
if (*utf16Data++ != static_cast<uint16_t>(*utf8Data++)) {
|
||||
return false;
|
||||
}
|
||||
} while (utf8Data < utf8End && utf16Data < utf16End && *utf8Data < 0x80);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return utf8Data == utf8End && utf16Data == utf16End;
|
||||
}
|
||||
|
||||
bool EcmaString::ToElementIndex(uint32_t *index)
|
||||
|
@ -34,6 +34,9 @@
|
||||
#include "unicode/locid.h"
|
||||
|
||||
namespace panda {
|
||||
namespace test {
|
||||
class EcmaStringEqualsTest;
|
||||
}
|
||||
namespace ecmascript {
|
||||
template<typename T>
|
||||
class JSHandle;
|
||||
@ -105,6 +108,7 @@ private:
|
||||
friend class SlicedString;
|
||||
friend class FlatStringInfo;
|
||||
friend class NameDictionary;
|
||||
friend class panda::test::EcmaStringEqualsTest;
|
||||
|
||||
static constexpr int SMALL_STRING_SIZE = 128;
|
||||
|
||||
|
@ -60,6 +60,7 @@ host_unittest_action("EcmaVm_002_Test") {
|
||||
# "ecma_string_test.cpp",
|
||||
"ecma_context_test.cpp",
|
||||
"ecma_string_accessor_test.cpp",
|
||||
"ecma_string_equals_test.cpp",
|
||||
]
|
||||
|
||||
configs = [
|
||||
|
105
ecmascript/tests/ecma_string_equals_test.cpp
Normal file
105
ecmascript/tests/ecma_string_equals_test.cpp
Normal file
@ -0,0 +1,105 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Huawei Device Co., Ltd.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "ecmascript/ecma_string-inl.h"
|
||||
#include "ecmascript/object_factory.h"
|
||||
#include "ecmascript/tests/ecma_test_common.h"
|
||||
|
||||
using namespace panda::ecmascript;
|
||||
|
||||
namespace panda::test {
|
||||
// Test fixture that EcmaString declares as a friend; it forwards to
// EcmaString::IsUtf8EqualsUtf16 so the test bodies can reach it.
class EcmaStringEqualsTest : public BaseTestWithScope<false> {
public:
    // Thin pass-through: same arguments, same result as EcmaString::IsUtf8EqualsUtf16.
    static bool IsUtf8EqualsUtf16UT(const uint8_t *utf8Data, size_t utf8Len,
                                    const uint16_t *utf16Data, uint32_t utf16Len)
    {
        const bool isSame = EcmaString::IsUtf8EqualsUtf16(utf8Data, utf8Len, utf16Data, utf16Len);
        return isSame;
    }
};
|
||||
|
||||
/*
|
||||
* @tc.name: IsUtf8EqualsUtf16
|
||||
* @tc.desc: Test a function that compares whether an array of UTF8 characters
|
||||
* is equal to an array of UTF16 characters
|
||||
* @tc.type: FUNC
|
||||
*/
|
||||
HWTEST_F_L0(EcmaStringEqualsTest, IsUtf8EqualsUtf16)
{
    // Shorthand for the fixture's forwarding helper.
    const auto isEqual = &EcmaStringEqualsTest::IsUtf8EqualsUtf16UT;

    // Case 1: plain ASCII text.
    const uint8_t asciiUtf8[] = "hello";
    const uint16_t asciiUtf16[] = {'h', 'e', 'l', 'l', 'o'};
    EXPECT_TRUE(isEqual(asciiUtf8, 5, asciiUtf16, 5));

    // Case 2: 2-byte UTF-8 sequences ("éè").
    const uint8_t twoByteUtf8[] = {0xC3, 0xA9, 0xC3, 0xA8};
    const uint16_t twoByteUtf16[] = {0x00E9, 0x00E8};
    EXPECT_TRUE(isEqual(twoByteUtf8, 4, twoByteUtf16, 2));

    // Case 3: 3-byte UTF-8 sequences ("中文").
    const uint8_t threeByteUtf8[] = {0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87};
    const uint16_t threeByteUtf16[] = {0x4E2D, 0x6587};
    EXPECT_TRUE(isEqual(threeByteUtf8, 6, threeByteUtf16, 2));

    // Case 4: 4-byte UTF-8 sequence (😁) against its surrogate pair.
    const uint8_t fourByteUtf8[] = {0xF0, 0x9F, 0x98, 0x81};
    const uint16_t fourByteUtf16[] = {0xD83D, 0xDE01};
    EXPECT_TRUE(isEqual(fourByteUtf8, 4, fourByteUtf16, 2));

    // Case 5: both sides empty.
    const uint8_t *noUtf8 = nullptr;
    const uint16_t *noUtf16 = nullptr;
    EXPECT_TRUE(isEqual(noUtf8, 0, noUtf16, 0));

    // Case 6: UTF-8 side shorter than the UTF-16 side.
    const uint8_t shortUtf8[] = "test";
    const uint16_t longerUtf16[] = {'t', 'e', 's', 't', '!', '!'};
    EXPECT_FALSE(isEqual(shortUtf8, 4, longerUtf16, 6));

    // Case 7: UTF-8 side longer than the UTF-16 side (😁😁 vs 😁).
    const uint8_t doubleEmojiUtf8[] = {0xF0, 0x9F, 0x98, 0x81, 0xF0, 0x9F, 0x98, 0x81};
    const uint16_t singleEmojiUtf16[] = {0xD83D, 0xDE01};
    EXPECT_FALSE(isEqual(doubleEmojiUtf8, 8, singleEmojiUtf16, 2));

    // Case 8: UTF-16 side holds only the high surrogate.
    const uint8_t emojiUtf8[] = {0xF0, 0x9F, 0x92, 0xA9};
    const uint16_t loneHighSurrogate[] = {0xD83D};
    EXPECT_FALSE(isEqual(emojiUtf8, 4, loneHighSurrogate, 1));

    // Case 9: UTF-8 side is a truncated 3-byte sequence (first two bytes of "あ").
    const uint8_t truncatedUtf8[] = {0xE3, 0x81};
    const uint16_t fullCharUtf16[] = {0x3042};
    EXPECT_FALSE(isEqual(truncatedUtf8, 2, fullCharUtf16, 1));

    // Case 10: two characters in UTF-8 ("££") against a single UTF-16 unit.
    const uint8_t twoPoundsUtf8[] = {0xC2, 0xA3, 0xC2, 0xA3};
    const uint16_t onePoundUtf16[] = {0x00A3};
    EXPECT_FALSE(isEqual(twoPoundsUtf8, 4, onePoundUtf16, 1));

    // Case 11: the noncharacter U+FFFE on both sides.
    const uint8_t nonCharUtf8[] = {0xEF, 0xBF, 0xBE};
    const uint16_t nonCharUtf16[] = {0xFFFE};
    EXPECT_TRUE(isEqual(nonCharUtf8, 3, nonCharUtf16, 1));

    // Case 12: empty UTF-8 against non-empty UTF-16 ("a").
    const uint8_t *emptyUtf8 = nullptr;
    const uint16_t letterUtf16[] = {0x0061};
    EXPECT_FALSE(isEqual(emptyUtf8, 0, letterUtf16, 1));

    // Case 13: non-empty UTF-8 ("a") against empty UTF-16.
    const uint8_t letterUtf8[] = {0x61};
    const uint16_t *emptyUtf16 = nullptr;
    EXPECT_FALSE(isEqual(letterUtf8, 1, emptyUtf16, 0));
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user