diff --git a/ecmascript/base/utf_helper.cpp b/ecmascript/base/utf_helper.cpp index f736e47200..3814200fa7 100644 --- a/ecmascript/base/utf_helper.cpp +++ b/ecmascript/base/utf_helper.cpp @@ -44,14 +44,14 @@ bool IsUTF16LowSurrogate(uint16_t ch) } // Methods for decode utf16 to unicode -uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index) +uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8) { uint16_t high = utf16[*index]; if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) { return high; } uint16_t low = utf16[*index + 1]; - if (!IsUTF16LowSurrogate(low)) { + if (!IsUTF16LowSurrogate(low) || cesu8) { return high; } (*index)++; @@ -221,7 +221,7 @@ Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteB return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}}; } -size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize) +size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize, bool cesu8) { size_t res = 1; // zero byte // when utf16 data length is only 1 and code in 0xd800-0xdfff, @@ -247,7 +247,7 @@ size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) { res += UtfLength::THREE; } else { - if (i < length - 1 && + if (!cesu8 && i < length - 1 && utf16[i + 1] >= utf::LO_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) utf16[i + 1] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) res += UtfLength::FOUR; @@ -261,7 +261,7 @@ size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool } size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, - size_t start, bool modify, bool isWriteBuffer) + size_t start, bool modify, bool isWriteBuffer, 
bool cesu8) { if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) { return 0; @@ -269,7 +269,7 @@ size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_ size_t utf8Pos = 0; size_t end = start + utf16Len; for (size_t i = start; i < end; ++i) { - uint32_t codepoint = DecodeUTF16(utf16In, end, &i); + uint32_t codepoint = DecodeUTF16(utf16In, end, &i, cesu8); if (codepoint == 0) { if (isWriteBuffer) { utf8Out[utf8Pos++] = 0x00U; diff --git a/ecmascript/base/utf_helper.h b/ecmascript/base/utf_helper.h index 9e5468194c..7216b8e548 100644 --- a/ecmascript/base/utf_helper.h +++ b/ecmascript/base/utf_helper.h @@ -100,7 +100,7 @@ struct Utf8Char { static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}; -uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index); +uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8 = false); size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index); @@ -110,10 +110,12 @@ bool IsValidUTF8(const std::vector &data); Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer = false); -size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true, bool isGetBufferSize = false); +size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true, + bool isGetBufferSize = false, bool cesu8 = false); -size_t PUBLIC_API ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, - size_t start, bool modify = true, bool isWriteBuffer = false); +size_t PUBLIC_API ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, + size_t utf8Len, size_t start, bool modify = true, + bool isWriteBuffer = false, bool cesu = false); size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, size_t start, bool modify = true, bool isWriteBuffer = 
false); diff --git a/ecmascript/builtins/builtins_regexp.cpp b/ecmascript/builtins/builtins_regexp.cpp index c8cd74836c..b63df51bbc 100644 --- a/ecmascript/builtins/builtins_regexp.cpp +++ b/ecmascript/builtins/builtins_regexp.cpp @@ -2361,7 +2361,8 @@ JSTaggedValue BuiltinsRegExp::RegExpInitialize(JSThread *thread, const JSHandle< auto getCache = regExpParserCache->GetCache(*patternStrHandle, flagsBits, groupName); if (getCache.first.IsHole()) { // String -> CString - CString patternStdStr = ConvertToString(*patternStrHandle, StringConvertedUsage::LOGICOPERATION); + bool cesu8 = !(RegExpParser::FLAG_UTF16 & flagsBits); + CString patternStdStr = ConvertToString(*patternStrHandle, StringConvertedUsage::LOGICOPERATION, cesu8); parser.Init(const_cast(reinterpret_cast(patternStdStr.c_str())), patternStdStr.size(), flagsBits); parser.Parse(); diff --git a/ecmascript/ecma_string.cpp b/ecmascript/ecma_string.cpp index 5464b5445b..9109cab901 100755 --- a/ecmascript/ecma_string.cpp +++ b/ecmascript/ecma_string.cpp @@ -1595,14 +1595,14 @@ std::string EcmaStringAccessor::DebuggerToStdString(StringConvertedUsage usage) return res; } -CString EcmaStringAccessor::ToCString(StringConvertedUsage usage) +CString EcmaStringAccessor::ToCString(StringConvertedUsage usage, bool cesu8) { if (string_ == nullptr) { return ""; } bool modify = (usage != StringConvertedUsage::PRINT); CVector buf; - Span sp = string_->ToUtf8Span(buf, modify); + Span sp = string_->ToUtf8Span(buf, modify, cesu8); CString res; res.reserve(sp.size()); for (const auto &c : sp) { diff --git a/ecmascript/ecma_string.h b/ecmascript/ecma_string.h index b5b954ea48..944b19f31d 100755 --- a/ecmascript/ecma_string.h +++ b/ecmascript/ecma_string.h @@ -553,17 +553,17 @@ private: return std::unique_ptr(buf); } - Span ToUtf8Span(CVector &buf, bool modify = true) + Span ToUtf8Span(CVector &buf, bool modify = true, bool cesu8 = false) { Span str; uint32_t strLen = GetLength(); if (UNLIKELY(IsUtf16())) { CVector tmpBuf; const 
uint16_t *data = EcmaString::GetUtf16DataFlat(this, tmpBuf); - ASSERT(base::utf_helper::Utf16ToUtf8Size(data, strLen, modify) > 0); - size_t len = base::utf_helper::Utf16ToUtf8Size(data, strLen, modify) - 1; + ASSERT(base::utf_helper::Utf16ToUtf8Size(data, strLen, modify, false, cesu8) > 0); + size_t len = base::utf_helper::Utf16ToUtf8Size(data, strLen, modify, false, cesu8) - 1; buf.reserve(len); - len = base::utf_helper::ConvertRegionUtf16ToUtf8(data, buf.data(), strLen, len, 0, modify); + len = base::utf_helper::ConvertRegionUtf16ToUtf8(data, buf.data(), strLen, len, 0, modify, false, cesu8); str = Span(buf.data(), len); } else { const uint8_t *data = EcmaString::GetUtf8DataFlat(this, buf); @@ -1258,7 +1258,7 @@ public: std::string DebuggerToStdString(StringConvertedUsage usage = StringConvertedUsage::PRINT); // not change string data structure. // if string is not flat, this func has low efficiency. - CString ToCString(StringConvertedUsage usage = StringConvertedUsage::LOGICOPERATION); + CString ToCString(StringConvertedUsage usage = StringConvertedUsage::LOGICOPERATION, bool cesu8 = false); // not change string data structure. // if string is not flat, this func has low efficiency. 
diff --git a/ecmascript/mem/c_string.cpp b/ecmascript/mem/c_string.cpp index f9e6458888..2b4b7fd2bb 100644 --- a/ecmascript/mem/c_string.cpp +++ b/ecmascript/mem/c_string.cpp @@ -94,12 +94,12 @@ CString ConvertToString(const std::string &str) return res; } -CString ConvertToString(const EcmaString *s, StringConvertedUsage usage) +CString ConvertToString(const EcmaString *s, StringConvertedUsage usage, bool cesu8) { if (s == nullptr) { return CString(""); } - return EcmaStringAccessor(const_cast(s)).ToCString(usage); + return EcmaStringAccessor(const_cast(s)).ToCString(usage, cesu8); } CString ConvertToString(JSTaggedValue key) diff --git a/ecmascript/mem/c_string.h b/ecmascript/mem/c_string.h index 51a08f8e87..77761bfdbf 100644 --- a/ecmascript/mem/c_string.h +++ b/ecmascript/mem/c_string.h @@ -56,8 +56,9 @@ CString ConvertToString(const std::string &str); std::string PUBLIC_API ConvertToStdString(const CString &str); // '\u0000' is skip according to holdZero +// cesu8 means surrogate pairs are not combined; each UTF-16 surrogate is encoded separately (CESU-8) CString PUBLIC_API ConvertToString(const ecmascript::EcmaString *s, - StringConvertedUsage usage = StringConvertedUsage::PRINT); + StringConvertedUsage usage = StringConvertedUsage::PRINT, bool cesu8 = false); CString ConvertToString(ecmascript::JSTaggedValue key); template diff --git a/ecmascript/regexp/regexp_parser.cpp b/ecmascript/regexp/regexp_parser.cpp index d186f19e52..afd732584d 100644 --- a/ecmascript/regexp/regexp_parser.cpp +++ b/ecmascript/regexp/regexp_parser.cpp @@ -514,13 +514,13 @@ void RegExpParser::ParseAlternative(bool isBackward) uint32_t matchedChar = c0_; if (c0_ > (INT8_MAX + 1)) { Prev(); - int i = 0; UChar32 c; int32_t length = end_ - pc_ + 1; // NOLINTNEXTLINE(hicpp-signed-bitwise) - U8_NEXT(pc_, i, length, c); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + auto unicodeChar = base::utf_helper::ConvertUtf8ToUnicodeChar(pc_, length); + c = unicodeChar.first; matchedChar = static_cast(c); - pc_ += 
i; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + pc_ += unicodeChar.second; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) } if (IsIgnoreCase()) { matchedChar = static_cast(Canonicalize(static_cast(matchedChar), IsUtf16())); diff --git a/test/moduletest/regexp/expect_output.txt b/test/moduletest/regexp/expect_output.txt index 9aa2cc2062..aefdada78e 100644 --- a/test/moduletest/regexp/expect_output.txt +++ b/test/moduletest/regexp/expect_output.txt @@ -11,6 +11,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +"b\ude00" +"bb" +"b" +"b\ude00" true true true diff --git a/test/moduletest/regexp/regexp.js b/test/moduletest/regexp/regexp.js index a16bb6b3ac..b0ddd09353 100644 --- a/test/moduletest/regexp/regexp.js +++ b/test/moduletest/regexp/regexp.js @@ -19,6 +19,28 @@ * @tc.type: FUNC * @tc.require: issueI5NO8G */ +{ + let str = "😀"; + let regexp = /[😀]/; + print(JSON.stringify(str.replace(regexp,"b"))); +} +{ + let str = "😀"; + let regexp = /[😀]/g; + print(JSON.stringify(str.replace(regexp,"b"))); +} +{ + let str = "😀"; + let regexp = /[😀]/u; + print(JSON.stringify(str.replace(regexp,"b"))); +} +{ + let str = "😀"; + let regexp = /[\😀]/; + print(JSON.stringify(str.replace(regexp,"b"))); +} + + var reg = /[\x5d-\x7e]/i; var result = reg.test("a"); print(result);