!9589 Regexp.replace does not replace emoji

Merge pull request !9589 from 贺存茂/Regexp0928
This commit is contained in:
openharmony_ci 2024-10-12 08:12:32 +00:00 committed by Gitee
commit b19f368486
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
10 changed files with 54 additions and 24 deletions

View File

@ -44,14 +44,14 @@ bool IsUTF16LowSurrogate(uint16_t ch)
}
// Methods for decode utf16 to unicode
uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index)
uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8)
{
uint16_t high = utf16[*index];
if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
return high;
}
uint16_t low = utf16[*index + 1];
if (!IsUTF16LowSurrogate(low)) {
if (!IsUTF16LowSurrogate(low) || cesu8) {
return high;
}
(*index)++;
@ -221,7 +221,7 @@ Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteB
return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
}
size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize)
size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize, bool cesu8)
{
size_t res = 1; // zero byte
// when utf16 data length is only 1 and code in 0xd800-0xdfff,
@ -247,7 +247,7 @@ size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool
} else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) {
res += UtfLength::THREE;
} else {
if (i < length - 1 &&
if (!cesu8 && i < length - 1 &&
utf16[i + 1] >= utf::LO_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
utf16[i + 1] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
res += UtfLength::FOUR;
@ -261,7 +261,7 @@ size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool
}
size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
size_t start, bool modify, bool isWriteBuffer)
size_t start, bool modify, bool isWriteBuffer, bool cesu8)
{
if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
return 0;
@ -269,7 +269,7 @@ size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_
size_t utf8Pos = 0;
size_t end = start + utf16Len;
for (size_t i = start; i < end; ++i) {
uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
uint32_t codepoint = DecodeUTF16(utf16In, end, &i, cesu8);
if (codepoint == 0) {
if (isWriteBuffer) {
utf8Out[utf8Pos++] = 0x00U;

View File

@ -100,7 +100,7 @@ struct Utf8Char {
static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index);
uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8 = false);
size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index);
@ -110,10 +110,12 @@ bool IsValidUTF8(const std::vector<uint8_t> &data);
Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer = false);
size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true, bool isGetBufferSize = false);
size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true,
bool isGetBufferSize = false, bool cesu8 = false);
size_t PUBLIC_API ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
size_t start, bool modify = true, bool isWriteBuffer = false);
// Converts utf16Len UTF-16 code units of utf16In, beginning at index start, into utf8Out.
// Returns 0 without converting anything when utf16In or utf8Out is null or utf8Len is 0.
// isWriteBuffer: when true, a decoded code point of 0 is written to utf8Out as a 0x00 byte.
// cesu8: forwarded to DecodeUTF16; when true, a high surrogate is returned on its own instead of
//        being combined with the low surrogate that follows it.
size_t PUBLIC_API ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len,
                                           size_t utf8Len, size_t start, bool modify = true,
                                           bool isWriteBuffer = false, bool cesu8 = false);
size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
size_t start, bool modify = true, bool isWriteBuffer = false);

View File

@ -2361,7 +2361,8 @@ JSTaggedValue BuiltinsRegExp::RegExpInitialize(JSThread *thread, const JSHandle<
auto getCache = regExpParserCache->GetCache(*patternStrHandle, flagsBits, groupName);
if (getCache.first.IsHole()) {
// String -> CString
CString patternStdStr = ConvertToString(*patternStrHandle, StringConvertedUsage::LOGICOPERATION);
bool cesu8 = !(RegExpParser::FLAG_UTF16 & flagsBits);
CString patternStdStr = ConvertToString(*patternStrHandle, StringConvertedUsage::LOGICOPERATION, cesu8);
parser.Init(const_cast<char *>(reinterpret_cast<const char *>(patternStdStr.c_str())), patternStdStr.size(),
flagsBits);
parser.Parse();

View File

@ -1595,14 +1595,14 @@ std::string EcmaStringAccessor::DebuggerToStdString(StringConvertedUsage usage)
return res;
}
CString EcmaStringAccessor::ToCString(StringConvertedUsage usage)
CString EcmaStringAccessor::ToCString(StringConvertedUsage usage, bool cesu8)
{
if (string_ == nullptr) {
return "";
}
bool modify = (usage != StringConvertedUsage::PRINT);
CVector<uint8_t> buf;
Span<const uint8_t> sp = string_->ToUtf8Span(buf, modify);
Span<const uint8_t> sp = string_->ToUtf8Span(buf, modify, cesu8);
CString res;
res.reserve(sp.size());
for (const auto &c : sp) {

View File

@ -545,17 +545,17 @@ private:
return std::unique_ptr<uint8_t[]>(buf);
}
Span<const uint8_t> ToUtf8Span(CVector<uint8_t> &buf, bool modify = true)
Span<const uint8_t> ToUtf8Span(CVector<uint8_t> &buf, bool modify = true, bool cesu8 = false)
{
Span<const uint8_t> str;
uint32_t strLen = GetLength();
if (UNLIKELY(IsUtf16())) {
CVector<uint16_t> tmpBuf;
const uint16_t *data = EcmaString::GetUtf16DataFlat(this, tmpBuf);
ASSERT(base::utf_helper::Utf16ToUtf8Size(data, strLen, modify) > 0);
size_t len = base::utf_helper::Utf16ToUtf8Size(data, strLen, modify) - 1;
ASSERT(base::utf_helper::Utf16ToUtf8Size(data, strLen, modify, false, cesu8) > 0);
size_t len = base::utf_helper::Utf16ToUtf8Size(data, strLen, modify, false, cesu8) - 1;
buf.reserve(len);
len = base::utf_helper::ConvertRegionUtf16ToUtf8(data, buf.data(), strLen, len, 0, modify);
len = base::utf_helper::ConvertRegionUtf16ToUtf8(data, buf.data(), strLen, len, 0, modify, false, cesu8);
str = Span<const uint8_t>(buf.data(), len);
} else {
const uint8_t *data = EcmaString::GetUtf8DataFlat(this, buf);
@ -1250,7 +1250,7 @@ public:
std::string DebuggerToStdString(StringConvertedUsage usage = StringConvertedUsage::PRINT);
// not change string data structure.
// if string is not flat, this func has low efficiency.
CString ToCString(StringConvertedUsage usage = StringConvertedUsage::LOGICOPERATION);
CString ToCString(StringConvertedUsage usage = StringConvertedUsage::LOGICOPERATION, bool cesu8 = false);
// not change string data structure.
// if string is not flat, this func has low efficiency.

View File

@ -94,12 +94,12 @@ CString ConvertToString(const std::string &str)
return res;
}
CString ConvertToString(const EcmaString *s, StringConvertedUsage usage)
CString ConvertToString(const EcmaString *s, StringConvertedUsage usage, bool cesu8)
{
if (s == nullptr) {
return CString("");
}
return EcmaStringAccessor(const_cast<EcmaString *>(s)).ToCString(usage);
return EcmaStringAccessor(const_cast<EcmaString *>(s)).ToCString(usage, cesu8);
}
CString ConvertToString(JSTaggedValue key)

View File

@ -56,8 +56,9 @@ CString ConvertToString(const std::string &str);
std::string PUBLIC_API ConvertToStdString(const CString &str);
// '\u0000' is skipped according to holdZero
// cesu8 means a surrogate pair (non-BMP code point) is not combined into a single 4-byte UTF-8
// sequence; each surrogate is converted on its own
CString PUBLIC_API ConvertToString(const ecmascript::EcmaString *s,
StringConvertedUsage usage = StringConvertedUsage::PRINT);
StringConvertedUsage usage = StringConvertedUsage::PRINT, bool cesu8 = false);
CString ConvertToString(ecmascript::JSTaggedValue key);
template<class T>

View File

@ -514,13 +514,13 @@ void RegExpParser::ParseAlternative(bool isBackward)
uint32_t matchedChar = c0_;
if (c0_ > (INT8_MAX + 1)) {
Prev();
int i = 0;
UChar32 c;
int32_t length = end_ - pc_ + 1;
// NOLINTNEXTLINE(hicpp-signed-bitwise)
U8_NEXT(pc_, i, length, c); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
auto unicodeChar = base::utf_helper::ConvertUtf8ToUnicodeChar(pc_, length);
c = unicodeChar.first;
matchedChar = static_cast<uint32_t>(c);
pc_ += i; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
pc_ += unicodeChar.second; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
}
if (IsIgnoreCase()) {
matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));

View File

@ -11,6 +11,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"b\ude00"
"bb"
"b"
"b\ude00"
true
true
true

View File

@ -19,6 +19,28 @@
* @tc.type: FUNC
* @tc.require: issueI5NO8G
*/
// Regression cases for String.prototype.replace with an emoji (U+1F600, stored as the
// surrogate pair \ud83d \ude00) inside a regular-expression character class.

// Case 1: no flags. Only the first match is replaced; expected output is "b\ude00".
{
let str = "😀";
let regexp = /[😀]/;
print(JSON.stringify(str.replace(regexp,"b")));
}
// Case 2: global flag. Every match is replaced; expected output is "bb".
{
let str = "😀";
let regexp = /[😀]/g;
print(JSON.stringify(str.replace(regexp,"b")));
}
// Case 3: unicode flag. The emoji is matched as one code point; expected output is "b".
{
let str = "😀";
let regexp = /[😀]/u;
print(JSON.stringify(str.replace(regexp,"b")));
}
// Case 4: no flags, emoji preceded by a backslash inside the class; expected output is "b\ude00".
{
let str = "😀";
let regexp = /[\😀]/;
print(JSON.stringify(str.replace(regexp,"b")));
}
var reg = /[\x5d-\x7e]/i;
var result = reg.test("a");
print(result);