add bound check for ConvertRegionUtf8ToUtf16

Issue: https://gitee.com/openharmony/arkcompiler_ets_runtime/issues/IAR8ZY
Signed-off-by: ZhouGuangyuan <zhouguangyuan1@huawei.com>
Change-Id: Ia74e4e7667af6dbc4b06afadcf1b407fca2a7dc9
This commit is contained in:
zhouguangyuan 2024-09-11 12:33:06 +08:00
parent fa4a5954f7
commit cb362d2f90
7 changed files with 60 additions and 77 deletions

View File

@ -96,7 +96,7 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end)
++size;
utf8Bit >>= 1UL;
}
if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) {
if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1) <= 0) {
return true;
}
}

View File

@ -557,7 +557,6 @@ HWTEST_F_L0(UtfHelperTest, DebuggerConvertRegionUtf16ToUtf8)
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16)
{
size_t utf16Len = 100;
size_t start = 0;
uint8_t utf8Value[10] = {
0x7F, // 1-length UTF16 encoding
0xDF, 0xBF, // 1-length UTF16 encoding
@ -565,25 +564,13 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16)
0xF4, 0x8F, 0xBF, 0xBF}; // 2-length UTF16 encoding
const uint8_t *utf8ValuePtr = utf8Value;
uint16_t *utf16Out = (uint16_t*)malloc(utf16Len);
size_t outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len, start);
size_t outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len);
// 1 + 1 + 1 + 2 = 5
EXPECT_EQ(outPos, 5U);
// 1 + 2 = 3
start = 3;
outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len, start);
utf8ValuePtr = utf8Value + 3;
outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value) - 3, utf16Len);
EXPECT_EQ(outPos, 3U);
// When "start" is in the middle of a symbol sequence
start = 2;
outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len, start);
EXPECT_EQ(outPos, 0U);
start = 4;
outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len, start);
EXPECT_EQ(outPos, 0U);
start = 7;
outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len, start);
EXPECT_EQ(outPos, 0U);
free(utf16Out);
}
/*
@ -852,7 +839,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_001) {
std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F}; // "Hello"
std::vector<uint16_t> utf16(10);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(utf16, expected_utf16);
}
@ -867,7 +854,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_002) {
std::vector<uint16_t> expected_utf16 = {0x4F60, 0x597D, 0xFF0C, 0x4E16, 0x754C, 0xFF01}; // "你好,世界!"
std::vector<uint16_t> utf16(10);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(utf16, expected_utf16);
}
@ -882,7 +869,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_003) {
std::vector<uint16_t> expected_utf16 = {}; // Empty
std::vector<uint16_t> utf16(10);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(utf16, expected_utf16);
}
@ -897,7 +884,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_004) {
std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002C, 0x20, 0x4F60};
std::vector<uint16_t> utf16(10);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), 10, utf16.size(), 0); // Only process the first 9 bytes
utf16.data(), 10, utf16.size()); // Only process the first 9 bytes
utf16.resize(converted);
EXPECT_EQ(utf16, expected_utf16);
}
@ -912,7 +899,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_005) {
std::vector<uint16_t> expected_utf16 = {0x4F60, 0x597D}; // "你好"
std::vector<uint16_t> utf16(2); // Limit buffer length
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(utf16, expected_utf16);
}
@ -927,7 +914,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_006) {
std::vector<uint16_t> expected_utf16 = {}; // Expected empty output, handling erroneous data
std::vector<uint16_t> utf16(10);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_NE(utf16, expected_utf16);
}
@ -942,7 +929,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_007) {
std::vector<uint16_t> expected_utf16 = {0x0041, 0x0042, 0x0043}; // ASCII characters: A, B, C
std::vector<uint16_t> utf16(10);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(utf16, expected_utf16);
}
@ -957,7 +944,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_008) {
std::vector<uint16_t> expected_utf16 = {0x00A2, 0x00FC}; // Unicode .
std::vector<uint16_t> utf16(10);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(utf16, expected_utf16);
}
@ -972,7 +959,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_009) {
std::vector<uint16_t> expected_utf16 = {0x20AC}; // Unicode .
std::vector<uint16_t> utf16(10);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(utf16, expected_utf16);
}
@ -987,7 +974,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_010) {
std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates
std::vector<uint16_t> utf16(10);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(utf16, expected_utf16);
}
@ -1003,7 +990,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_011) {
0x0000, 0x0057, 0x006F, 0x0072, 0x006C, 0x0064}; // Including NULL characters
std::vector<uint16_t> utf16(15);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8Nul.data()),
utf16.data(), utf8Nul.size(), utf16.size(), 0);
utf16.data(), utf8Nul.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(utf16, expected_utf16);
}
@ -1018,7 +1005,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_012) {
std::vector<uint16_t> expected_utf16 = {};
std::vector<uint16_t> utf16(10);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_NE(utf16, expected_utf16);
}
@ -1033,7 +1020,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_013) {
std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates
std::vector<uint16_t> utf16(0);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(converted, 0);
}
@ -1047,7 +1034,7 @@ HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_014) {
std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates
std::vector<uint16_t> utf16(1);
size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
utf16.data(), utf8.size(), utf16.size(), 0);
utf16.data(), utf8.size(), utf16.size());
utf16.resize(converted);
EXPECT_EQ(converted, 0);
}

View File

@ -350,11 +350,31 @@ std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com
return {pair, UtfLength::FOUR};
}
// Returns the length of the prefix of `utf8` that can be decoded without a multi-byte
// sequence running past the end of the buffer. Only the last three bytes are inspected:
// if one of them claims more bytes than actually remain behind it, that byte and
// everything after it are excluded from the returned length.
//   utf8    - input bytes; must provide at least utf8Len readable bytes.
//   utf8Len - number of bytes available in utf8.
// The result is utf8Len reduced by 0, 1, CONST_2 or CONST_3.
static inline size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len)
{
    // The deepest matching position decides the cut, so probe from the
    // third-to-last byte towards the last one and stop at the first hit.
    if (utf8Len >= CONST_3 && utf8[utf8Len - CONST_3] >= 0xF0) {
        // This byte claims a sequence longer than the three bytes left: drop all three.
        return utf8Len - CONST_3;
    }
    if (utf8Len >= CONST_2 && utf8[utf8Len - CONST_2] >= 0xE0) {
        // This byte claims a sequence longer than the two bytes left: drop both.
        return utf8Len - CONST_2;
    }
    if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) {
        // The final byte claims at least one following byte, but none exists: drop it.
        return utf8Len - 1;
    }
    // Nothing at the tail is truncated; the whole buffer is usable.
    return utf8Len;
}
size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
{
size_t safeUtf8Len = FixUtf8Len(utf8, utf8Len);
size_t in_pos = 0;
size_t res = 0;
while (in_pos < utf8Len) {
while (in_pos < safeUtf8Len) {
uint8_t src = utf8[in_pos];
switch (src & 0xF0) {
case 0xF0: {
@ -386,40 +406,21 @@ size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
do {
in_pos++;
res++;
} while (in_pos < utf8Len && utf8[in_pos] < 0x80);
} while (in_pos < safeUtf8Len && utf8[in_pos] < 0x80);
break;
}
}
// The remaining bytes should each be treated as a single-byte char.
res += utf8Len - in_pos;
return res;
}
// Classifies the byte `data` and returns one of 1, CONST_2, CONST_3 or CONST_4.
//   data      - the byte to classify.
//   max_bytes - number of bytes available; when it is below CONST_4 the result is always 1,
//               regardless of `data`.
// NOTE(review): MASK1..MASK3 are defined outside this view; presumably they select
// successive high bits of a UTF-8 lead byte, making the result a sequence length - confirm.
size_t ConvertUtf8ToUtf16Int(const uint8_t data, size_t max_bytes)
{
    const bool singleStep = (max_bytes < CONST_4) || ((data & MASK1) == 0);
    if (singleStep) {
        return 1;
    }
    if ((data & MASK2) != 0) {
        // MASK2 bit(s) set: MASK3 decides between the two largest results.
        return ((data & MASK3) != 0) ? CONST_4 : CONST_3;
    }
    return CONST_2;
}
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
size_t start)
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len)
{
size_t safeUtf8Len = FixUtf8Len(utf8In, utf8Len);
size_t in_pos = 0;
size_t out_pos = 0;
while (in_pos < utf8Len && start > 0) {
auto nbytes = ConvertUtf8ToUtf16Int(utf8In[in_pos], utf8Len - in_pos);
in_pos += nbytes;
start -= nbytes;
}
while (in_pos < utf8Len && out_pos < utf16Len) {
while (in_pos < safeUtf8Len && out_pos < utf16Len) {
uint8_t src = utf8In[in_pos];
switch (src & 0xF0) {
case 0xF0: {
@ -460,10 +461,14 @@ size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_
default:
do {
utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
} while (in_pos < utf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
} while (in_pos < safeUtf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
break;
}
}
// The remaining bytes should each be treated as a single-byte char.
while (in_pos < utf8Len && out_pos < utf16Len) {
utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
}
return out_pos;
}

View File

@ -122,8 +122,7 @@ std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com
size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len);
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
size_t start);
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len);
size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len);

View File

@ -58,7 +58,7 @@ inline EcmaString *EcmaString::CreateFromUtf8(const EcmaVM *vm, const uint8_t *u
ASSERT(string != nullptr);
[[maybe_unused]] auto len =
base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf8Len, utf16Len, 0);
base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf8Len, utf16Len);
ASSERT(len == utf16Len);
}

View File

@ -821,7 +821,7 @@ uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len
auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len);
CVector<uint16_t> tmpBuffer(utf16Len);
[[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len,
utf16Len, 0);
utf16Len);
ASSERT(len == utf16Len);
uint32_t hash = ComputeHashForData(tmpBuffer.data(), utf16Len, 0);
return MixHashcode(hash, NOT_INTEGER);

View File

@ -435,9 +435,9 @@ private:
CVector<uint8_t> tmpBuf;
const uint8_t *data = EcmaString::GetUtf8DataFlat(this, tmpBuf);
if (length > bufLength) {
return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, bufLength, bufLength, 0);
return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, bufLength, bufLength);
}
return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, length, bufLength, 0);
return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, length, bufLength);
}
// It allows user to copy into buffer even if maxLength < length
@ -510,31 +510,23 @@ private:
inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength) const
{
return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength);
}
uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength) const
{
uint32_t length = GetLength();
if (length > maxLength) {
return 0;
}
uint32_t len = GetLength();
if (start + length > len) {
return 0;
}
if (IsUtf16()) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
CVector<uint16_t> tmpBuf;
const uint16_t *data = EcmaString::GetUtf16DataFlat(this, tmpBuf);
if (memcpy_s(buf, maxLength * sizeof(uint16_t), data + start, length * sizeof(uint16_t)) != EOK) {
const uint16_t *data = GetUtf16DataFlat(this, tmpBuf);
if (memcpy_s(buf, maxLength * sizeof(uint16_t), data, length * sizeof(uint16_t)) != EOK) {
LOG_FULL(FATAL) << "memcpy_s failed";
UNREACHABLE();
}
return length;
}
CVector<uint8_t> tmpBuf;
const uint8_t *data = EcmaString::GetUtf8DataFlat(this, tmpBuf);
return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, len, maxLength, start);
const uint8_t *data = GetUtf8DataFlat(this, tmpBuf);
return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, length, maxLength);
}
std::u16string ToU16String(uint32_t len = 0);