!8597 rewrite IsUtf8EqualsUtf16 function

Merge pull request !8597 from 马昌友/master
This commit is contained in:
openharmony_ci 2024-08-20 19:37:06 +00:00 committed by Gitee
commit c06fef1949
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
7 changed files with 653 additions and 23 deletions

View File

@ -673,4 +673,353 @@ HWTEST_F_L0(UtfHelperTest, ConvertUtf8ToUnicodeChar)
unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr13, UtfLength::FOUR);
EXPECT_EQ(unicodeRes, invalidValue);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test single byte characters
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_001) {
    // "Hello" is five single-byte characters, so five UTF-16 code units are expected.
    const std::string utf8 = "Hello";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    // Assert on the returned size directly; the function writes no output buffer.
    EXPECT_EQ(converted, 5U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test includes Chinese characters
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_002) {
    // Six characters, each encoded as a three-byte UTF-8 sequence: one UTF-16 unit apiece.
    const std::string utf8 = "你好,世界!";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 6U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test empty string
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_003) {
    // A zero-length input must report zero UTF-16 code units.
    const std::string utf8 = "";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 0U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test mixed ASCII and Chinese characters
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_004) {
    // The whole string is measured: seven single-byte characters ("Hello, ")
    // plus two three-byte characters give nine UTF-16 code units.
    const std::string utf8 = "Hello, 你好";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 9U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test that the reported size covers the whole input (no output buffer is involved)
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_005) {
    // Utf8ToUtf16Size takes only the input pointer and length, so no buffer
    // limit can apply: all six characters are counted.
    const std::string utf8 = "你好,世界!";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 6U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test for incorrect UTF-8 data
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_006) {
    // A 0xF0 lead byte followed by bytes that are not valid continuation bytes.
    // The test pins the currently expected result of two UTF-16 code units.
    const std::string utf8 = "\xF0\x28\x8C\x28";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 2U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test single byte UTF-8 characters
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_007) {
    // 'A', 'B', 'C' are all single-byte characters: three UTF-16 code units.
    const std::string utf8 = "ABC";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 3U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Testing Double Byte UTF-8 Characters
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_008) {
    // Two two-byte sequences (the cent sign and u-umlaut): one UTF-16 unit each.
    const std::string utf8 = "\xC2\xA2\xC3\xBC";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 2U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test three byte UTF-8 characters
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_009) {
    // The euro sign is a single three-byte sequence: one UTF-16 code unit.
    const std::string utf8 = "\xE2\x82\xAC";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 1U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test four byte UTF-8 characters and surrogate pairs
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_010) {
    // One four-byte sequence (an emoji above U+FFFF) needs a surrogate pair: two units.
    const std::string utf8 = "\xF0\x9F\x98\x8E";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 2U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test UTF-8 data containing zero bytes
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_011) {
    // Build the 11-byte input with an explicit length: constructing std::string
    // from the bare literal would stop at the embedded NUL and yield only "Hello".
    const std::string utf8Nul("Hello\0World", 11);
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8Nul.data()), utf8Nul.size());
    // The NUL byte counts as one code unit like any other single-byte character.
    EXPECT_EQ(converted, 11U);
}
/*
 * @tc.name: Utf8ToUtf16Size
 * @tc.desc: Test continuous illegal sequences
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_012) {
    // Two consecutive overlong encodings (0xC0 0x80). Each is still counted as
    // one two-byte sequence, so two UTF-16 code units are expected.
    const std::string utf8 = "\xC0\x80\xC0\x80";
    const size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
    EXPECT_EQ(converted, 2U);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Single-byte (ASCII) input is widened unit by unit
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_001) {
    const std::string input = "Hello";
    const std::vector<uint16_t> want = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F};  // 'H' 'e' 'l' 'l' 'o'
    std::vector<uint16_t> out(10);
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
    // Trim the scratch buffer to what was actually produced before comparing.
    out.resize(written);
    EXPECT_EQ(out, want);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Three-byte Chinese characters each become one UTF-16 unit
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_002) {
    const std::string input = "你好,世界!";
    const std::vector<uint16_t> want = {0x4F60, 0x597D, 0xFF0C, 0x4E16, 0x754C, 0xFF01};
    std::vector<uint16_t> out(10);
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
    // Trim the scratch buffer to what was actually produced before comparing.
    out.resize(written);
    EXPECT_EQ(out, want);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Empty input produces no output units
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_003) {
    const std::string input = "";
    const std::vector<uint16_t> want = {};
    std::vector<uint16_t> out(10);
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
    // Trim the scratch buffer to what was actually produced before comparing.
    out.resize(written);
    EXPECT_EQ(out, want);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Only a leading region of the input is converted
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_004) {
    const std::string input = "Hello, 你好";
    // Expected output covers "Hello, " and the first Chinese character only.
    const std::vector<uint16_t> want = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002C, 0x20, 0x4F60};
    std::vector<uint16_t> out(10);
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    // The input length is capped at 10 bytes: seven single-byte characters
    // plus the three bytes of the first Chinese character.
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), 10, out.size(), 0);
    out.resize(written);
    EXPECT_EQ(out, want);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Conversion stops when the output buffer is full
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_005) {
    const std::string input = "你好,世界!";
    const std::vector<uint16_t> want = {0x4F60, 0x597D};  // only the first two characters fit
    std::vector<uint16_t> out(2);  // room for two units only
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
    out.resize(written);
    EXPECT_EQ(out, want);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Malformed UTF-8 data still yields some output
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_006) {
    const std::string input = "\xF0\x28\x8C\x28";
    const std::vector<uint16_t> empty = {};
    std::vector<uint16_t> out(10);
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
    out.resize(written);
    // Only asserts that the result is non-empty; the exact units are not pinned here.
    EXPECT_NE(out, empty);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Single-byte UTF-8 characters
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_007) {
    const std::string input = "ABC";
    const std::vector<uint16_t> want = {0x0041, 0x0042, 0x0043};  // 'A', 'B', 'C'
    std::vector<uint16_t> out(10);
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
    // Trim the scratch buffer to what was actually produced before comparing.
    out.resize(written);
    EXPECT_EQ(out, want);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Two-byte UTF-8 characters
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_008) {
    const std::string input = "\xC2\xA2\xC3\xBC";  // cent sign followed by u-umlaut
    const std::vector<uint16_t> want = {0x00A2, 0x00FC};
    std::vector<uint16_t> out(10);
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
    // Trim the scratch buffer to what was actually produced before comparing.
    out.resize(written);
    EXPECT_EQ(out, want);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Three-byte UTF-8 characters
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_009) {
    const std::string input = "\xE2\x82\xAC";  // euro sign
    const std::vector<uint16_t> want = {0x20AC};
    std::vector<uint16_t> out(10);
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
    // Trim the scratch buffer to what was actually produced before comparing.
    out.resize(written);
    EXPECT_EQ(out, want);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Four-byte UTF-8 characters become a surrogate pair
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_010) {
    const std::string input = "\xF0\x9F\x98\x8E";  // one emoji above U+FFFF
    const std::vector<uint16_t> want = {0xD83D, 0xDE0E};  // high surrogate, low surrogate
    std::vector<uint16_t> out(10);
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
    // Trim the scratch buffer to what was actually produced before comparing.
    out.resize(written);
    EXPECT_EQ(out, want);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: UTF-8 data with an embedded zero byte
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_011) {
    // Constructing from the literal stops at its embedded NUL, so `head` is "Hello";
    // the NUL and "World" are appended explicitly to form the 11-byte input.
    const std::string head = "Hello\0World";
    const std::string withNul = head + '\0' + "World";
    const std::vector<uint16_t> want = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F,
                                        0x0000, 0x0057, 0x006F, 0x0072, 0x006C, 0x0064};
    std::vector<uint16_t> out(15);
    const auto *bytes = reinterpret_cast<const uint8_t*>(withNul.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), withNul.size(), out.size(), 0);
    out.resize(written);
    EXPECT_EQ(out, want);
}
/*
 * @tc.name: ConvertRegionUtf8ToUtf16
 * @tc.desc: Consecutive illegal (overlong) sequences still yield some output
 * @tc.type: FUNC
 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_012) {
    const std::string input = "\xC0\x80\xC0\x80";
    const std::vector<uint16_t> empty = {};
    std::vector<uint16_t> out(10);
    const auto *bytes = reinterpret_cast<const uint8_t*>(input.data());
    const size_t written = ConvertRegionUtf8ToUtf16(bytes, out.data(), input.size(), out.size(), 0);
    out.resize(written);
    // Only asserts that the result is non-empty; the exact units are not pinned here.
    EXPECT_NE(out, empty);
}
}  // namespace panda::test

View File

@ -24,6 +24,7 @@ static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10
((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET)
namespace panda::ecmascript::base::utf_helper {
uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
{
ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
@ -346,13 +347,118 @@ std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com
size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
{
return utf::MUtf8ToUtf16Size(utf8, utf8Len);
size_t in_pos = 0;
size_t res = 0;
while (in_pos < utf8Len) {
uint8_t src = utf8[in_pos];
switch (src & 0xF0) {
case 0xF0: {
const uint8_t c2 = utf8[++in_pos];
const uint8_t c3 = utf8[++in_pos];
const uint8_t c4 = utf8[++in_pos];
uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
if (codePoint >= SURROGATE_RAIR_START) {
res += CONST_2;
} else {
res++;
}
in_pos++;
break;
}
case 0xE0: {
in_pos += CONST_3;
res++;
break;
}
case 0xD0:
case 0xC0: {
in_pos += CONST_2;
res++;
break;
}
default:
do {
in_pos++;
res++;
} while (in_pos < utf8Len && utf8[in_pos] < 0x80);
break;
}
}
return res;
}
/**
 * Returns how many bytes to advance for the sequence introduced by `data`.
 *
 * One is returned when bit 0x80 of `data` is clear, and also whenever fewer
 * than four bytes remain. Otherwise bits 0x20 and 0x10 select two, three or
 * four bytes.
 *
 * @param data      the byte at the current position
 * @param max_bytes number of bytes remaining from the current position
 * @return number of bytes to advance (1 to 4)
 */
size_t ConvertUtf8ToUtf16Int(const uint8_t data, size_t max_bytes)
{
    const bool highBitClear = (data & MASK1) == 0;
    const bool shortTail = max_bytes < CONST_4;
    if (highBitClear || shortTail) {
        return 1;
    }
    if ((data & MASK2) == 0) {
        return CONST_2;
    }
    return ((data & MASK3) == 0) ? CONST_3 : CONST_4;
}
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
size_t start)
{
return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
size_t in_pos = 0;
size_t out_pos = 0;
while (in_pos < utf8Len && start > 0) {
auto nbytes = ConvertUtf8ToUtf16Int(utf8In[in_pos], utf8Len - in_pos);
in_pos += nbytes;
start -= nbytes;
}
while (in_pos < utf8Len && out_pos < utf16Len) {
uint8_t src = utf8In[in_pos];
switch (src & 0xF0) {
case 0xF0: {
const uint8_t c2 = utf8In[++in_pos];
const uint8_t c3 = utf8In[++in_pos];
const uint8_t c4 = utf8In[++in_pos];
uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
if (codePoint >= SURROGATE_RAIR_START) {
if (out_pos >= utf16Len - 1) {
return out_pos - 1;
}
codePoint -= SURROGATE_RAIR_START;
utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
} else {
utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
}
in_pos++;
break;
}
case 0xE0: {
const uint8_t c2 = utf8In[++in_pos];
const uint8_t c3 = utf8In[++in_pos];
utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS));
in_pos++;
break;
}
case 0xD0:
case 0xC0: {
const uint8_t c2 = utf8In[++in_pos];
utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS));
in_pos++;
break;
}
default:
do {
utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
} while (in_pos < utf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
break;
}
}
return out_pos;
}
size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)

View File

@ -23,6 +23,24 @@
#include "ecmascript/common.h"
namespace panda::ecmascript::base::utf_helper {
// Byte counts of multi-byte UTF-8 sequences; also used as UTF-16 unit counts
// and position increments by the UTF-8 to UTF-16 helpers.
static constexpr size_t CONST_2 = 2;
static constexpr size_t CONST_3 = 3;
static constexpr size_t CONST_4 = 4;
// Bits tested on a lead byte by ConvertUtf8ToUtf16Int to pick the sequence length:
// 0x80 clear -> one byte, 0x20 clear -> two bytes, 0x10 clear -> three bytes, else four.
static constexpr size_t MASK1 = 0x80;
static constexpr size_t MASK2 = 0x20;
static constexpr size_t MASK3 = 0x10;
// Payload masks: low 3/4/5 bits of a four-/three-/two-byte lead byte,
// and low 6 bits of each following byte.
static constexpr size_t LOW_3BITS = 0x7;
static constexpr size_t LOW_4BITS = 0xF;
static constexpr size_t LOW_5BITS = 0x1F;
static constexpr size_t LOW_6BITS = 0x3F;
// Base values OR-ed into the low and high halves of a surrogate pair.
static constexpr size_t L_SURROGATE_START = 0xDC00;
static constexpr size_t H_SURROGATE_START = 0xD800;
// Decoded values at or above this threshold are emitted as a surrogate pair.
// NOTE(review): "RAIR" looks like a typo for "PAIR"; the name is kept because it is used elsewhere.
static constexpr size_t SURROGATE_RAIR_START = 0x10000;
// Shift amounts used when assembling a decoded value from its bytes (18/12/6)
// and when splitting it into surrogate halves (10).
static constexpr size_t OFFSET_18POS = 18;
static constexpr size_t OFFSET_12POS = 12;
static constexpr size_t OFFSET_10POS = 10;
static constexpr size_t OFFSET_6POS = 6;
static constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;

View File

@ -19,6 +19,18 @@
namespace panda::ecmascript {
// Constants used by EcmaString::IsUtf8EqualsUtf16 to decode UTF-8 sequences.
// NOTE(review): these have the same names and values as the constants added to
// namespace base::utf_helper in utf_helper.h - consider sharing one definition.
// Payload masks: low 3/4/5 bits of a four-/three-/two-byte lead byte,
// and low 6 bits of each following byte.
constexpr size_t LOW_3BITS = 0x7;
constexpr size_t LOW_4BITS = 0xF;
constexpr size_t LOW_5BITS = 0x1F;
constexpr size_t LOW_6BITS = 0x3F;
// Base values OR-ed into the low and high halves of a surrogate pair.
constexpr size_t L_SURROGATE_START = 0xDC00;
constexpr size_t H_SURROGATE_START = 0xD800;
// Decoded values at or above this threshold are compared as a surrogate pair.
constexpr size_t SURROGATE_RAIR_START = 0x10000;
// Shift amounts used when assembling a decoded value (18/12/6) and when
// splitting it into surrogate halves (10).
constexpr size_t OFFSET_18POS = 18;
constexpr size_t OFFSET_12POS = 12;
constexpr size_t OFFSET_10POS = 10;
constexpr size_t OFFSET_6POS = 6;
EcmaString *EcmaString::Concat(const EcmaVM *vm,
const JSHandle<EcmaString> &left, const JSHandle<EcmaString> &right, MemSpaceType type)
{
@ -831,31 +843,66 @@ uint32_t EcmaString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t le
}
/* static */
bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data,
uint32_t utf16Len)
bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len,
const uint16_t *utf16Data, uint32_t utf16Len)
{
size_t utf8Pos = 0;
size_t utf16Pos = 0;
while (utf8Pos < utf8Len) {
auto [pair, nbytes] = utf::ConvertMUtf8ToUtf16Pair(utf8Data, utf8Len - utf8Pos);
auto [pHigh, pLow] = utf::SplitUtf16Pair(pair);
utf8Data += nbytes;
utf8Pos += nbytes;
if (pHigh != 0) {
ASSERT(utf16Len > 0);
if (utf16Pos >= utf16Len - 1 || *utf16Data != pHigh) {
return false;
const uint8_t *utf8End = utf8Data + utf8Len;
const uint16_t *utf16End = utf16Data + utf16Len;
while (utf8Data < utf8End && utf16Data < utf16End) {
uint8_t src = *utf8Data;
switch (src & 0xF0) {
case 0xF0: {
const uint8_t c2 = *(++utf8Data);
const uint8_t c3 = *(++utf8Data);
const uint8_t c4 = *(++utf8Data);
uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
if (codePoint >= SURROGATE_RAIR_START) {
if (utf16Data >= utf16End - 1) {
return false;
}
codePoint -= SURROGATE_RAIR_START;
if (*utf16Data++ != static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START) ||
*utf16Data++ != static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START)) {
return false;
}
} else {
if (*utf16Data++ != static_cast<uint16_t>(codePoint)) {
return false;
}
}
utf8Data++;
break;
}
++utf16Pos;
++utf16Data;
case 0xE0: {
const uint8_t c2 = *(++utf8Data);
const uint8_t c3 = *(++utf8Data);
if (*utf16Data++ != static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS))) {
return false;
}
utf8Data++;
break;
}
case 0xD0:
case 0xC0: {
const uint8_t c2 = *(++utf8Data);
if (*utf16Data++ != static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS))) {
return false;
}
utf8Data++;
break;
}
default:
do {
if (*utf16Data++ != static_cast<uint16_t>(*utf8Data++)) {
return false;
}
} while (utf8Data < utf8End && utf16Data < utf16End && *utf8Data < 0x80);
break;
}
if (utf16Pos >= utf16Len || *utf16Data != pLow) {
return false;
}
++utf16Pos;
++utf16Data;
}
return true;
return utf8Data == utf8End && utf16Data == utf16End;
}
bool EcmaString::ToElementIndex(uint32_t *index)

View File

@ -34,6 +34,9 @@
#include "unicode/locid.h"
namespace panda {
namespace test {
class EcmaStringEqualsTest;
}
namespace ecmascript {
template<typename T>
class JSHandle;
@ -105,6 +108,7 @@ private:
friend class SlicedString;
friend class FlatStringInfo;
friend class NameDictionary;
friend class panda::test::EcmaStringEqualsTest;
static constexpr int SMALL_STRING_SIZE = 128;

View File

@ -60,6 +60,7 @@ host_unittest_action("EcmaVm_002_Test") {
# "ecma_string_test.cpp",
"ecma_context_test.cpp",
"ecma_string_accessor_test.cpp",
"ecma_string_equals_test.cpp",
]
configs = [

View File

@ -0,0 +1,105 @@
/*
* Copyright (c) 2024 Huawei Device Co., Ltd.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ecmascript/ecma_string-inl.h"
#include "ecmascript/object_factory.h"
#include "ecmascript/tests/ecma_test_common.h"
using namespace panda::ecmascript;
namespace panda::test {
/*
 * Test fixture for EcmaString::IsUtf8EqualsUtf16. EcmaString declares this
 * class a friend, so the static forwarder below gives the test bodies a way
 * to call that function.
 */
class EcmaStringEqualsTest : public BaseTestWithScope<false> {
public:
    // Forwards all four arguments unchanged and returns the comparison result.
    static bool IsUtf8EqualsUtf16UT(const uint8_t *utf8Data, size_t utf8Len,
                                    const uint16_t *utf16Data, uint32_t utf16Len)
    {
        const bool same = EcmaString::IsUtf8EqualsUtf16(utf8Data, utf8Len, utf16Data, utf16Len);
        return same;
    }
};
/*
 * @tc.name: IsUtf8EqualsUtf16
 * @tc.desc: Compare UTF-8 byte arrays against UTF-16 unit arrays and check
 *           that equality is reported only for exact, complete matches
 * @tc.type: FUNC
 */
HWTEST_F_L0(EcmaStringEqualsTest, IsUtf8EqualsUtf16)
{
    // Case 1: ASCII only. The literal's trailing NUL is excluded by passing length 5.
    const uint8_t asciiUtf8[] = "hello";
    const uint16_t asciiUtf16[] = {'h', 'e', 'l', 'l', 'o'};
    EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(asciiUtf8, 5, asciiUtf16, 5));

    // Case 2: two two-byte sequences (U+00E9, U+00E8).
    const uint8_t twoByteUtf8[] = {0xC3, 0xA9, 0xC3, 0xA8};
    const uint16_t twoByteUtf16[] = {0x00E9, 0x00E8};
    EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(twoByteUtf8, 4, twoByteUtf16, 2));

    // Case 3: two three-byte sequences (U+4E2D, U+6587).
    const uint8_t threeByteUtf8[] = {0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87};
    const uint16_t threeByteUtf16[] = {0x4E2D, 0x6587};
    EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(threeByteUtf8, 6, threeByteUtf16, 2));

    // Case 4: one four-byte sequence (U+1F601) against its surrogate pair.
    const uint8_t fourByteUtf8[] = {0xF0, 0x9F, 0x98, 0x81};
    const uint16_t fourByteUtf16[] = {0xD83D, 0xDE01};
    EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(fourByteUtf8, 4, fourByteUtf16, 2));

    // Case 5: both sides empty (null pointers with zero lengths).
    const uint8_t *emptyUtf8 = nullptr;
    const uint16_t *emptyUtf16 = nullptr;
    EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(emptyUtf8, 0, emptyUtf16, 0));

    // Case 6: UTF-8 side ends first; the UTF-16 side has two extra units.
    const uint8_t shortUtf8[] = "test";
    const uint16_t longUtf16[] = {'t', 'e', 's', 't', '!', '!'};
    EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(shortUtf8, 4, longUtf16, 6));

    // Case 7: UTF-8 side holds the four-byte character twice, UTF-16 side only once.
    const uint8_t doubledUtf8[] = {0xF0, 0x9F, 0x98, 0x81, 0xF0, 0x9F, 0x98, 0x81};
    const uint16_t singlePairUtf16[] = {0xD83D, 0xDE01};
    EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(doubledUtf8, 8, singlePairUtf16, 2));

    // Case 8: the bytes decode to U+1F4A9, but the UTF-16 side has only the high surrogate.
    const uint8_t pairUtf8[] = {0xF0, 0x9F, 0x92, 0xA9};
    const uint16_t loneHighSurrogate[] = {0xD83D};
    EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(pairUtf8, 4, loneHighSurrogate, 1));

    // Case 9: three-byte sequence for U+3042 with its last byte missing.
    const uint8_t truncatedUtf8[] = {0xE3, 0x81};
    const uint16_t completeUtf16[] = {0x3042};
    EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(truncatedUtf8, 2, completeUtf16, 1));

    // Case 10: U+00A3 twice on the UTF-8 side, once on the UTF-16 side.
    const uint8_t repeatedUtf8[] = {0xC2, 0xA3, 0xC2, 0xA3};
    const uint16_t onceUtf16[] = {0x00A3};
    EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(repeatedUtf8, 4, onceUtf16, 1));

    // Case 11: the noncharacter U+FFFE is compared like any other value.
    const uint8_t nonCharUtf8[] = {0xEF, 0xBF, 0xBE};
    const uint16_t nonCharUtf16[] = {0xFFFE};
    EXPECT_TRUE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(nonCharUtf8, 3, nonCharUtf16, 1));

    // Case 12: empty UTF-8 side, non-empty UTF-16 side.
    const uint8_t *noUtf8 = nullptr;
    const uint16_t letterUtf16[] = {0x0061};
    EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(noUtf8, 0, letterUtf16, 1));

    // Case 13: non-empty UTF-8 side, empty UTF-16 side.
    const uint8_t letterUtf8[] = {0x61};
    const uint16_t *noUtf16 = nullptr;
    EXPECT_FALSE(EcmaStringEqualsTest::IsUtf8EqualsUtf16UT(letterUtf8, 1, noUtf16, 0));
}
}