!9589 Regexp.replace does not replace emoji

Merge pull request !9589 from 贺存茂/Regexp0928
2024-11-23 10:09:54 +00:00 · 2024-10-12 08:12:32 +00:00 · 2024-10-12 08:12:32 +00:00 · b19f368486
commit b19f368486
parent 59cb3b3fa8 7ee26c3852
10 changed files with 54 additions and 24 deletions
--- a/ecmascript/base/utf_helper.cpp
+++ b/ecmascript/base/utf_helper.cpp
@ -44,14 +44,14 @@ bool IsUTF16LowSurrogate(uint16_t ch)
 }

 // Methods for decode utf16 to unicode
-uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index)
+uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8)
 {
    uint16_t high = utf16[*index];
    if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
        return high;
    }
    uint16_t low = utf16[*index + 1];
-    if (!IsUTF16LowSurrogate(low)) {
+    if (!IsUTF16LowSurrogate(low) || cesu8) {
        return high;
    }
    (*index)++;
@ -221,7 +221,7 @@ Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteB
    return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
 }

-size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize)
+size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize, bool cesu8)
 {
    size_t res = 1;  // zero byte
    // when utf16 data length is only 1 and code in 0xd800-0xdfff,
@ -247,7 +247,7 @@ size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool
        } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) {
            res += UtfLength::THREE;
        } else {
-            if (i < length - 1 &&
+            if (!cesu8 && i < length - 1 &&
                utf16[i + 1] >= utf::LO_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                utf16[i + 1] <= utf::LO_SURROGATE_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                res += UtfLength::FOUR;
@ -261,7 +261,7 @@ size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool
 }

 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
-                                size_t start, bool modify, bool isWriteBuffer)
+                                size_t start, bool modify, bool isWriteBuffer, bool cesu8)
 {
    if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
        return 0;
@ -269,7 +269,7 @@ size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_
    size_t utf8Pos = 0;
    size_t end = start + utf16Len;
    for (size_t i = start; i < end; ++i) {
-        uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
+        uint32_t codepoint = DecodeUTF16(utf16In, end, &i, cesu8);
        if (codepoint == 0) {
            if (isWriteBuffer) {
                utf8Out[utf8Pos++] = 0x00U;
--- a/ecmascript/base/utf_helper.h
+++ b/ecmascript/base/utf_helper.h
@ -100,7 +100,7 @@ struct Utf8Char {

 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};

-uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index);
+uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8 = false);

 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index);

@ -110,10 +110,12 @@ bool IsValidUTF8(const std::vector<uint8_t> &data);

 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer = false);

-size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true, bool isGetBufferSize = false);
+size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true,
+                       bool isGetBufferSize = false, bool cesu8 = false);

-size_t PUBLIC_API ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
-                                           size_t start, bool modify = true, bool isWriteBuffer = false);
+size_t PUBLIC_API ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len,
+                                           size_t utf8Len, size_t start, bool modify = true,
+                                           bool isWriteBuffer = false, bool cesu8 = false);

 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
                                        size_t start, bool modify = true, bool isWriteBuffer = false);
--- a/ecmascript/builtins/builtins_regexp.cpp
+++ b/ecmascript/builtins/builtins_regexp.cpp
@ -2361,7 +2361,8 @@ JSTaggedValue BuiltinsRegExp::RegExpInitialize(JSThread *thread, const JSHandle<
    auto getCache = regExpParserCache->GetCache(*patternStrHandle, flagsBits, groupName);
    if (getCache.first.IsHole()) {
        // String -> CString
-        CString patternStdStr = ConvertToString(*patternStrHandle, StringConvertedUsage::LOGICOPERATION);
+        bool cesu8 = !(RegExpParser::FLAG_UTF16 & flagsBits);
+        CString patternStdStr = ConvertToString(*patternStrHandle, StringConvertedUsage::LOGICOPERATION, cesu8);
        parser.Init(const_cast<char *>(reinterpret_cast<const char *>(patternStdStr.c_str())), patternStdStr.size(),
                    flagsBits);
        parser.Parse();
--- a/ecmascript/ecma_string.cpp
+++ b/ecmascript/ecma_string.cpp
@ -1595,14 +1595,14 @@ std::string EcmaStringAccessor::DebuggerToStdString(StringConvertedUsage usage)
    return res;
 }

-CString EcmaStringAccessor::ToCString(StringConvertedUsage usage)
+CString EcmaStringAccessor::ToCString(StringConvertedUsage usage, bool cesu8)
 {
    if (string_ == nullptr) {
        return "";
    }
    bool modify = (usage != StringConvertedUsage::PRINT);
    CVector<uint8_t> buf;
-    Span<const uint8_t> sp = string_->ToUtf8Span(buf, modify);
+    Span<const uint8_t> sp = string_->ToUtf8Span(buf, modify, cesu8);
    CString res;
    res.reserve(sp.size());
    for (const auto &c : sp) {
--- a/ecmascript/ecma_string.h
+++ b/ecmascript/ecma_string.h
@ -545,17 +545,17 @@ private:
        return std::unique_ptr<uint8_t[]>(buf);
    }

-    Span<const uint8_t> ToUtf8Span(CVector<uint8_t> &buf, bool modify = true)
+    Span<const uint8_t> ToUtf8Span(CVector<uint8_t> &buf, bool modify = true, bool cesu8 = false)
    {
        Span<const uint8_t> str;
        uint32_t strLen = GetLength();
        if (UNLIKELY(IsUtf16())) {
            CVector<uint16_t> tmpBuf;
            const uint16_t *data = EcmaString::GetUtf16DataFlat(this, tmpBuf);
-            ASSERT(base::utf_helper::Utf16ToUtf8Size(data, strLen, modify) > 0);
-            size_t len = base::utf_helper::Utf16ToUtf8Size(data, strLen, modify) - 1;
+            ASSERT(base::utf_helper::Utf16ToUtf8Size(data, strLen, modify, false, cesu8) > 0);
+            size_t len = base::utf_helper::Utf16ToUtf8Size(data, strLen, modify, false, cesu8) - 1;
            buf.reserve(len);
-            len = base::utf_helper::ConvertRegionUtf16ToUtf8(data, buf.data(), strLen, len, 0, modify);
+            len = base::utf_helper::ConvertRegionUtf16ToUtf8(data, buf.data(), strLen, len, 0, modify, false, cesu8);
            str = Span<const uint8_t>(buf.data(), len);
        } else {
            const uint8_t *data = EcmaString::GetUtf8DataFlat(this, buf);
@ -1250,7 +1250,7 @@ public:
    std::string DebuggerToStdString(StringConvertedUsage usage = StringConvertedUsage::PRINT);
    // not change string data structure.
    // if string is not flat, this func has low efficiency.
-    CString ToCString(StringConvertedUsage usage = StringConvertedUsage::LOGICOPERATION);
+    CString ToCString(StringConvertedUsage usage = StringConvertedUsage::LOGICOPERATION, bool cesu8 = false);

    // not change string data structure.
    // if string is not flat, this func has low efficiency.
--- a/ecmascript/mem/c_string.cpp
+++ b/ecmascript/mem/c_string.cpp
@ -94,12 +94,12 @@ CString ConvertToString(const std::string &str)
    return res;
 }

-CString ConvertToString(const EcmaString *s, StringConvertedUsage usage)
+CString ConvertToString(const EcmaString *s, StringConvertedUsage usage, bool cesu8)
 {
    if (s == nullptr) {
        return CString("");
    }
-    return EcmaStringAccessor(const_cast<EcmaString *>(s)).ToCString(usage);
+    return EcmaStringAccessor(const_cast<EcmaString *>(s)).ToCString(usage, cesu8);
 }

 CString ConvertToString(JSTaggedValue key)
--- a/ecmascript/mem/c_string.h
+++ b/ecmascript/mem/c_string.h
@ -56,8 +56,9 @@ CString ConvertToString(const std::string &str);
 std::string PUBLIC_API ConvertToStdString(const CString &str);

 // '\u0000' is skip according to holdZero
+// cesu8: if true, surrogate pairs are not combined; each surrogate half is encoded on its own (CESU-8 style)
 CString PUBLIC_API ConvertToString(const ecmascript::EcmaString *s,
-    StringConvertedUsage usage = StringConvertedUsage::PRINT);
+    StringConvertedUsage usage = StringConvertedUsage::PRINT, bool cesu8 = false);
 CString ConvertToString(ecmascript::JSTaggedValue key);

 template<class T>
--- a/ecmascript/regexp/regexp_parser.cpp
+++ b/ecmascript/regexp/regexp_parser.cpp
@ -514,13 +514,13 @@ void RegExpParser::ParseAlternative(bool isBackward)
                    uint32_t matchedChar = c0_;
                    if (c0_ > (INT8_MAX + 1)) {
                        Prev();
-                        int i = 0;
                        UChar32 c;
                        int32_t length = end_ - pc_ + 1;
                        // NOLINTNEXTLINE(hicpp-signed-bitwise)
-                        U8_NEXT(pc_, i, length, c);  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+                        auto unicodeChar = base::utf_helper::ConvertUtf8ToUnicodeChar(pc_, length);
+                        c = unicodeChar.first;
                        matchedChar = static_cast<uint32_t>(c);
-                        pc_ += i;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+                        pc_ += unicodeChar.second;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                    }
                    if (IsIgnoreCase()) {
                        matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
--- a/test/moduletest/regexp/expect_output.txt
+++ b/test/moduletest/regexp/expect_output.txt
@ -11,6 +11,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"b\ude00"
+"bb"
+"b"
+"b\ude00"
 true
 true
 true
--- a/test/moduletest/regexp/regexp.js
+++ b/test/moduletest/regexp/regexp.js
@ -19,6 +19,28 @@
 * @tc.type: FUNC
 * @tc.require: issueI5NO8G
 */
+{
+  let str = "😀";
+  let regexp = /[😀]/;
+  print(JSON.stringify(str.replace(regexp,"b")));
+}
+{
+  let str = "😀";
+  let regexp = /[😀]/g;
+  print(JSON.stringify(str.replace(regexp,"b")));
+}
+{
+  let str = "😀";
+  let regexp = /[😀]/u;
+  print(JSON.stringify(str.replace(regexp,"b")));
+}
+{
+  let str = "😀";
+  let regexp = /[\😀]/;
+  print(JSON.stringify(str.replace(regexp,"b")));
+}
+
+
 var reg = /[\x5d-\x7e]/i;
 var result = reg.test("a");
 print(result);