diff --git a/ecmascript/base/string_helper.h b/ecmascript/base/string_helper.h index 1bf2d44542..b745f73ae2 100644 --- a/ecmascript/base/string_helper.h +++ b/ecmascript/base/string_helper.h @@ -39,7 +39,15 @@ static constexpr uint16_t SPACE_OR_LINE_TERMINAL[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000, 0xFEFF, }; - +static constexpr int UICODE_FROM_UTF8[] = { + 0x80, 0xc0, 0xdf, 0xe0, 0xef, 0xf0, 0xf7, 0xf8, 0xfb, 0xfc, 0xfd, +}; +static constexpr int UTF8_MIN_CODE[] = { + 0x80, 0x800, 0x10000, 0x00200000, 0x04000000, +}; +static constexpr char UTF8_FIRST_CODE[] = { + 0x1f, 0xf, 0x7, 0x3, 0x1, +}; class StringHelper { public: static std::string ToStdString(EcmaString *string); @@ -175,7 +183,51 @@ public: static EcmaString *Repeat(JSThread *thread, const std::u16string &thisStr, int32_t repeatLen, bool canBeCompress); - static EcmaString *Trim(JSThread *thread, const std::u16string &thisStr); + static int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp) + { + int c = *p++; + if (c < UICODE_FROM_UTF8[0]) { + *pp = p; + return c; + } + int l = 0; + if (c >= UICODE_FROM_UTF8[1] && c <= UICODE_FROM_UTF8[2]) { // 1 - 2: 0000 0080 - 0000 07FF + l = 1; // 1: 0000 0080 - 0000 07FF Unicode + } else if (c >= UICODE_FROM_UTF8[3] && c <= UICODE_FROM_UTF8[4]) { // 3 - 4: 0000 0800 - 0000 FFFF + l = 2; // 2: 0000 0800 - 0000 FFFF Unicode + } else if (c >= UICODE_FROM_UTF8[5] && c <= UICODE_FROM_UTF8[6]) { // 5 - 6: 0001 0000 - 0010 FFFF + l = 3; // 3: 0001 0000 - 0010 FFFF Unicode + } else if (c >= UICODE_FROM_UTF8[7] && c <= UICODE_FROM_UTF8[8]) { // 7 - 8: 0020 0000 - 03FF FFFF + l = 4; // 4: 0020 0000 - 03FF FFFF Unicode + } else if (c == UICODE_FROM_UTF8[9] || c == UICODE_FROM_UTF8[10]) { // 9 - 10: 0400 0000 - 7FFF FFFF + l = 5; // 5: 0400 0000 - 7FFF FFFF Unicode + } else { + return -1; + } + /* check that 
we have enough characters */ + if (l > (maxLen - 1)) + return -1; + + return FromUtf8(c, l, p, pp); + } + + static int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp) + { + int b; + c &= UTF8_FIRST_CODE[l - 1]; + for (int i = 0; i < l; i++) { + b = *p++; + if (b < utf_helper::UTF8_2B_SECOND || b >= utf_helper::UTF8_2B_FIRST) { + return -1; + } + c = (c << 6) | (b & utf_helper::UTF8_2B_THIRD); // 6: Maximum Unicode range + } + if (c < UTF8_MIN_CODE[l - 1]) { + return -1; + } + *pp = p; + return c; + } static inline std::u16string Append(const std::u16string &str1, const std::u16string &str2) { diff --git a/ecmascript/base/utf_helper.h b/ecmascript/base/utf_helper.h index 64df9d09ce..858405d4bc 100644 --- a/ecmascript/base/utf_helper.h +++ b/ecmascript/base/utf_helper.h @@ -40,6 +40,7 @@ static constexpr uint8_t UTF8_1B_MAX = 0x7f; static constexpr uint16_t UTF8_2B_MAX = 0x7ff; static constexpr uint8_t UTF8_2B_FIRST = 0xc0; static constexpr uint8_t UTF8_2B_SECOND = 0x80; +static constexpr uint8_t UTF8_2B_THIRD = 0x3f; static constexpr uint8_t UTF8_3B_FIRST = 0xe0; static constexpr uint8_t UTF8_3B_SECOND = 0x80; diff --git a/ecmascript/builtins.cpp b/ecmascript/builtins.cpp index fc97bea562..d9650f9664 100644 --- a/ecmascript/builtins.cpp +++ b/ecmascript/builtins.cpp @@ -1583,6 +1583,7 @@ void Builtins::InitializeString(const JSHandle &env, const JSHandle groupsKey = globalConst->GetHandledGroupsString(); + JSTaggedValue named = + FastRuntimeStub::FastGetPropertyByValue(thread, resultValues.GetTaggedValue(), groupsKey.GetTaggedValue()); + JSHandle namedCaptures(thread, named); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); // m. 
If functionalReplace is true, then CString replacement; + int emptyArrLength = 0; + if (namedCaptures->IsUndefined()) { + emptyArrLength = 3; // 3: «matched, pos, and string» + } else { + emptyArrLength = 4; // 4: «matched, pos, string, and groups» + } JSHandle replacerArgs = - factory->NewTaggedArray(3 + capturesList->GetLength()); // 3: «matched, pos, and string» + factory->NewTaggedArray(emptyArrLength + capturesList->GetLength()); if (functionalReplace) { // i. Let replacerArgs be «matched». replacerArgs->Set(thread, 0, getMatchString.GetTaggedValue()); @@ -929,6 +942,9 @@ JSTaggedValue BuiltinsRegExp::Replace(EcmaRuntimeCallInfo *argv) } replacerArgs->Set(thread, index + 1, JSTaggedValue(position)); replacerArgs->Set(thread, index + 2, inputStr.GetTaggedValue()); // 2: position of string + if (!namedCaptures->IsUndefined()) { + replacerArgs->Set(thread, index + 3, namedCaptures.GetTaggedValue()); // 3: position of groups + } // iv. Let replValue be Call(replaceValue, undefined, replacerArgs). const size_t argsLength = replacerArgs->GetLength(); JSHandle undefined = globalConst->GetHandledUndefined(); @@ -944,8 +960,14 @@ JSTaggedValue BuiltinsRegExp::Replace(EcmaRuntimeCallInfo *argv) replacement = ConvertToString(*replacementString, StringConvertedUsage::LOGICOPERATION); } else { // n. 
Else, + if (!namedCaptures->IsUndefined()) { + JSHandle namedCapturesObj = JSTaggedValue::ToObject(thread, namedCaptures); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); + namedCaptures = JSHandle::Cast(namedCapturesObj); + } JSHandle replacementHandle( - thread, BuiltinsString::GetSubstitution(thread, matchString, srcString, position, capturesList, + thread, BuiltinsString::GetSubstitution(thread, matchString, srcString, + position, capturesList, namedCaptures, replaceValueHandle)); replacement = ConvertToString(EcmaString::Cast(replacementHandle->GetTaggedObject()), StringConvertedUsage::LOGICOPERATION); @@ -1350,7 +1372,6 @@ bool BuiltinsRegExp::GetFlagsInternal(JSThread *thread, const JSHandle(regexpObj->GetOriginalFlags().GetInt()); return flags & mask; } - // 21.2.5.2.2 JSTaggedValue BuiltinsRegExp::RegExpBuiltinExec(JSThread *thread, const JSHandle ®exp, const JSHandle &inputStr, bool useCache) @@ -1389,14 +1410,6 @@ JSTaggedValue BuiltinsRegExp::RegExpBuiltinExec(JSThread *thread, const JSHandle JSMutableHandle flags(thread, regexpObj->GetOriginalFlags()); JSHandle cacheTable(thread->GetEcmaVM()->GetRegExpCache()); - if (lastIndex == 0 && useCache) { - JSTaggedValue cacheResult = - cacheTable->FindCachedResult(thread, pattern, flags, inputStr, RegExpExecResultCache::EXEC_TYPE, regexp); - if (cacheResult != JSTaggedValue::Undefined()) { - return cacheResult; - } - } - uint32_t length = static_cast(inputStr->GetTaggedObject())->GetLength(); uint8_t flagsBits = static_cast(regexpObj->GetOriginalFlags().GetInt()); JSHandle flagsValue(thread, FlagsBitsToString(thread, flagsBits)); @@ -1457,6 +1470,17 @@ JSTaggedValue BuiltinsRegExp::RegExpBuiltinExec(JSThread *thread, const JSHandle // 27. Perform CreateDataProperty(A, "0", matched_substr). 
JSHandle zeroValue(matchResult.captures_[0].second); JSObject::CreateDataProperty(thread, results, 0, zeroValue); + ObjectFactory *factory = thread->GetEcmaVM()->GetFactory(); + + JSHandle groupName(thread, regexpObj->GetGroupName()); + JSMutableHandle groups(thread, JSTaggedValue::Undefined()); + if (!groupName->IsUndefined()) { + JSHandle nullHandle(thread, JSTaggedValue::Null()); + JSHandle nullObj = factory->OrdinaryNewJSObjectCreate(nullHandle); + groups.Update(nullObj.GetTaggedValue()); + } + JSHandle groupsKey = globalConst->GetHandledGroupsString(); + JSObject::CreateDataProperty(thread, results, groupsKey, groups); // 28. For each integer i such that i > 0 and i <= n for (uint32_t i = 1; i < capturesSize; i++) { // a. Let capture_i be ith element of r's captures List @@ -1468,6 +1492,14 @@ JSTaggedValue BuiltinsRegExp::RegExpBuiltinExec(JSThread *thread, const JSHandle } JSHandle iValue(thread, capturedValue); JSObject::CreateDataProperty(thread, results, i, iValue); + if (!groupName->IsUndefined()) { + JSHandle groupObject = JSHandle::Cast(groups); + TaggedArray *groupArray = TaggedArray::Cast(regexpObj->GetGroupName().GetTaggedObject()); + if (groupArray->GetLength() > i - 1) { + JSHandle skey(thread, groupArray->Get(i - 1)); + JSObject::CreateDataProperty(thread, groupObject, skey, iValue); + } + } } if (lastIndex == 0 && useCache) { RegExpExecResultCache::AddResultInCache(thread, cacheTable, pattern, flags, inputStr, @@ -1672,6 +1704,15 @@ JSTaggedValue BuiltinsRegExp::RegExpInitialize(JSThread *thread, const JSHandle< regexp->SetOriginalSource(thread, patternStrHandle.GetTaggedValue()); // 12. Set the value of obj’s [[OriginalFlags]] internal slot to F. 
regexp->SetOriginalFlags(thread, JSTaggedValue(flagsBits)); + auto groupName = parser.GetGroupNames(); + if (!groupName.empty()) { + JSHandle taggedArray = factory->NewTaggedArray(groupName.size()); + for (size_t i = 0; i < groupName.size(); ++i) { + JSHandle flagsKey(factory->NewFromStdString(groupName[i].c_str())); + taggedArray->Set(thread, i, flagsKey); + } + regexp->SetGroupName(thread, taggedArray); + } // 13. Set obj’s [[RegExpMatcher]] internal slot. if (getCache.first == JSTaggedValue::Hole()) { auto bufferSize = parser.GetOriginBufferSize(); diff --git a/ecmascript/builtins/builtins_string.cpp b/ecmascript/builtins/builtins_string.cpp index 748327ad5b..9933bdce2b 100644 --- a/ecmascript/builtins/builtins_string.cpp +++ b/ecmascript/builtins/builtins_string.cpp @@ -662,8 +662,8 @@ JSTaggedValue BuiltinsString::MatchAll(EcmaRuntimeCallInfo *argv) // b. If isRegExp is true, then if (isJSRegExp) { // i. Let flags be ? Get(searchValue, "flags"). - JSHandle flagsKey(factory->NewFromASCII("flags")); - JSHandle flags = JSObject::GetProperty(thread, regexp, flagsKey).GetValue(); + JSHandle flagsString(globalConst->GetHandledFlagsString()); + JSHandle flags = JSObject::GetProperty(thread, regexp, flagsString).GetValue(); RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); // ii. Perform ? RequireObjectCoercible(flags). JSTaggedValue::RequireObjectCoercible(thread, flags); @@ -992,13 +992,13 @@ JSTaggedValue BuiltinsString::Replace(EcmaRuntimeCallInfo *argv) if (pos == -1) { return thisString.GetTaggedValue(); } - + JSHandle undefined = globalConst->GetHandledUndefined(); JSMutableHandle replHandle(thread, factory->GetEmptyString().GetTaggedValue()); // If functionalReplace is true, then if (replaceTag->IsCallable()) { // Let replValue be Call(replaceValue, undefined,«matched, pos, and string»). 
const size_t argsLength = 3; // 3: «matched, pos, and string» - JSHandle undefined = globalConst->GetHandledUndefined(); + EcmaRuntimeCallInfo info = EcmaInterpreter::NewRuntimeCallInfo(thread, replaceTag, undefined, undefined, argsLength); info.SetCallArg(searchString.GetTaggedValue(), JSTaggedValue(pos), thisString.GetTaggedValue()); @@ -1006,17 +1006,19 @@ JSTaggedValue BuiltinsString::Replace(EcmaRuntimeCallInfo *argv) replHandle.Update(replStrDeocodeValue); } else { // Let captures be an empty List. - JSHandle capturesList = factory->NewTaggedArray(0); + JSHandle capturesList = factory->EmptyArray(); ASSERT_PRINT(replaceTag->IsString(), "replace must be string"); JSHandle replacement(thread, replaceTag->GetTaggedObject()); // Let replStr be GetSubstitution(matched, string, pos, captures, replaceValue) - replHandle.Update(GetSubstitution(thread, searchString, thisString, pos, capturesList, replacement)); + replHandle.Update(GetSubstitution(thread, searchString, thisString, pos, capturesList, undefined, replacement)); } JSHandle realReplaceStr = JSTaggedValue::ToString(thread, replHandle); // Let tailPos be pos + the number of code units in matched. int32_t tailPos = pos + static_cast(searchString->GetLength()); - // Let newString be the String formed by concatenating the first pos code units of string, replStr, and the trailing - // substring of string starting at index tailPos. If pos is 0, the first element of the concatenation will be the + // Let newString be the String formed by concatenating the first pos code units of string, + // replStr, and the trailing + // substring of string starting at index tailPos. If pos is 0, + // the first element of the concatenation will be the // empty String. // Return newString. 
JSHandle prefixString(thread, EcmaString::FastSubString(thisString, 0, pos, ecmaVm)); @@ -1057,9 +1059,169 @@ JSTaggedValue BuiltinsString::Replace(EcmaRuntimeCallInfo *argv) factory->NewFromUtf16LiteralNotCompress(uint16tData, stringBuilder.size()).GetTaggedValue(); } +JSTaggedValue BuiltinsString::ReplaceAll(EcmaRuntimeCallInfo *argv) +{ + ASSERT(argv); + JSThread *thread = argv->GetThread(); + BUILTINS_API_TRACE(thread, String, ReplaceAll); + [[maybe_unused]] EcmaHandleScope handleScope(thread); + JSHandle thisTag = JSTaggedValue::RequireObjectCoercible(thread, BuiltinsString::GetThis(argv)); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); + + auto ecmaVm = thread->GetEcmaVM(); + JSHandle env = ecmaVm->GetGlobalEnv(); + const GlobalEnvConstants *globalConst = thread->GlobalConstants(); + JSHandle searchTag = BuiltinsString::GetCallArg(argv, 0); + JSHandle replaceTag = BuiltinsString::GetCallArg(argv, 1); + + ObjectFactory *factory = ecmaVm->GetFactory(); + + if (!searchTag->IsUndefined() && !searchTag->IsNull()) { + // a. Let isRegExp be ? IsRegExp(searchValue). + bool isJSRegExp = JSObject::IsRegExp(thread, searchTag); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); + // b. If isRegExp is true, then + if (isJSRegExp) { + // i. Let flags be ? Get(searchValue, "flags"). + JSHandle flagsString(globalConst->GetHandledFlagsString()); + JSHandle flags = JSObject::GetProperty(thread, searchTag, flagsString).GetValue(); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); + // ii. Perform ? RequireObjectCoercible(flags). + JSTaggedValue::RequireObjectCoercible(thread, flags); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); + // iii. If ? ToString(flags) does not contain "g", throw a TypeError exception. 
+ JSHandle flagString = JSTaggedValue::ToString(thread, flags); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); + JSHandle gString(globalConst->GetHandledGString()); + int32_t pos = flagString->IndexOf(*gString); + if (pos == -1) { + THROW_TYPE_ERROR_AND_RETURN(thread, + "string.prototype.replaceAll called with a non-global RegExp argument", + JSTaggedValue::Exception()); + } + } + // c. Let replacer be ? GetMethod(searchValue, @@replace). + JSHandle replaceKey = env->GetReplaceSymbol(); + JSHandle replaceMethod = JSObject::GetMethod(thread, searchTag, replaceKey); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); + // d. If replacer is not undefined, then + if (!replaceMethod->IsUndefined()) { + // i. Return ? Call(replacer, searchValue, «O, replaceValue»). + const size_t argsLength = 2; + JSHandle undefined = globalConst->GetHandledUndefined(); + EcmaRuntimeCallInfo info = + EcmaInterpreter::NewRuntimeCallInfo(thread, replaceMethod, searchTag, undefined, argsLength); + info.SetCallArg(thisTag.GetTaggedValue(), replaceTag.GetTaggedValue()); + return JSFunction::Call(&info); + } + } + + // 3. Let string be ? ToString(O). + JSHandle thisString = JSTaggedValue::ToString(thread, thisTag); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); + // 4. Let searchString be ? ToString(searchValue). + JSHandle searchString = JSTaggedValue::ToString(thread, searchTag); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); + // 5. Let functionalReplace be IsCallable(replaceValue). + // 6. If functionalReplace is false, then + if (!replaceTag->IsCallable()) { + // a. Set replaceValue to ? ToString(replaceValue). + replaceTag = JSHandle(JSTaggedValue::ToString(thread, replaceTag)); + RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); + } + + // 7. Let searchLength be the length of searchString. + // 8. Let advanceBy be max(1, searchLength). + int32_t searchLength = searchString->GetLength(); + int32_t advanceBy = std::max(1, searchLength); + // 9. Let matchPositions be a new empty List. 
+ std::u16string stringBuilder; + std::u16string stringPrefixString; + std::u16string stringRealReplaceStr; + std::u16string stringSuffixString; + // 10. Let position be ! StringIndexOf(string, searchString, 0). + int32_t pos = thisString->IndexOf(*searchString); + int32_t endOfLastMatch = 0; + bool canBeCompress = true; + JSHandle undefined = globalConst->GetHandledUndefined(); + JSMutableHandle replHandle(thread, factory->GetEmptyString().GetTaggedValue()); + while (pos != -1) { + // If functionalReplace is true, then + if (replaceTag->IsCallable()) { + // Let replValue be Call(replaceValue, undefined,«matched, pos, and string»). + const size_t argsLength = 3; // 3: «matched, pos, and string» + + EcmaRuntimeCallInfo info = + EcmaInterpreter::NewRuntimeCallInfo(thread, replaceTag, undefined, undefined, argsLength); + info.SetCallArg(searchString.GetTaggedValue(), JSTaggedValue(pos), thisString.GetTaggedValue()); + JSTaggedValue replStrDeocodeValue = JSFunction::Call(&info); + replHandle.Update(replStrDeocodeValue); + } else { + // Let captures be an empty List. + JSHandle capturesList = factory->NewTaggedArray(0); + ASSERT_PRINT(replaceTag->IsString(), "replace must be string"); + JSHandle replacement(thread, replaceTag->GetTaggedObject()); + // Let replStr be GetSubstitution(matched, string, pos, captures, replaceValue) + replHandle.Update(GetSubstitution(thread, searchString, thisString, pos, + capturesList, undefined, replacement)); + } + JSHandle realReplaceStr = JSTaggedValue::ToString(thread, replHandle); + // Let tailPos be pos + the number of code units in matched. + // Let newString be the String formed by concatenating the first pos code units of string, + // replStr, and the trailing substring of string starting at index tailPos. + // If pos is 0, the first element of the concatenation will be the + // empty String. + // Return newString. 
+ JSHandle prefixString(thread, + EcmaString::FastSubString(thisString, endOfLastMatch, + pos - endOfLastMatch, ecmaVm)); + if (prefixString->IsUtf16()) { + const uint16_t *data = prefixString->GetDataUtf16(); + stringPrefixString = base::StringHelper::Utf16ToU16String(data, prefixString->GetLength()); + canBeCompress = false; + } else { + const uint8_t *data = prefixString->GetDataUtf8(); + stringPrefixString = base::StringHelper::Utf8ToU16String(data, prefixString->GetLength()); + } + if (realReplaceStr->IsUtf16()) { + const uint16_t *data = realReplaceStr->GetDataUtf16(); + stringRealReplaceStr = base::StringHelper::Utf16ToU16String(data, realReplaceStr->GetLength()); + canBeCompress = false; + } else { + const uint8_t *data = realReplaceStr->GetDataUtf8(); + stringRealReplaceStr = base::StringHelper::Utf8ToU16String(data, realReplaceStr->GetLength()); + } + stringBuilder = stringBuilder + stringPrefixString + stringRealReplaceStr; + endOfLastMatch = pos + searchLength; + pos = thisString->IndexOf(*searchString, pos + advanceBy); + } + + if (endOfLastMatch < static_cast(thisString->GetLength())) { + JSHandle suffixString(thread, + EcmaString::FastSubString(thisString, endOfLastMatch, + thisString->GetLength() - endOfLastMatch, ecmaVm)); + if (suffixString->IsUtf16()) { + const uint16_t *data = suffixString->GetDataUtf16(); + stringSuffixString = base::StringHelper::Utf16ToU16String(data, suffixString->GetLength()); + canBeCompress = false; + } else { + const uint8_t *data = suffixString->GetDataUtf8(); + stringSuffixString = base::StringHelper::Utf8ToU16String(data, suffixString->GetLength()); + } + stringBuilder = stringBuilder + stringSuffixString; + } + + auto *char16tData = const_cast(stringBuilder.c_str()); + auto *uint16tData = reinterpret_cast(char16tData); + return canBeCompress ? 
+ factory->NewFromUtf16LiteralCompress(uint16tData, stringBuilder.length()).GetTaggedValue() : + factory->NewFromUtf16LiteralNotCompress(uint16tData, stringBuilder.length()).GetTaggedValue(); +} + JSTaggedValue BuiltinsString::GetSubstitution(JSThread *thread, const JSHandle &matched, const JSHandle &srcString, int position, const JSHandle &captureList, + const JSHandle &namedCaptures, const JSHandle &replacement) { BUILTINS_API_TRACE(thread, String, GetSubstitution); @@ -1073,7 +1235,6 @@ JSTaggedValue BuiltinsString::GetSubstitution(JSThread *thread, const JSHandle 0) { @@ -1099,6 +1260,7 @@ JSTaggedValue BuiltinsString::GetSubstitution(JSThread *thread, const JSHandleAt(peekIndex); + int32_t p = 0; switch (peek) { case '$': // $$ stringBuilder += '$'; @@ -1193,6 +1355,42 @@ JSTaggedValue BuiltinsString::GetSubstitution(JSThread *thread, const JSHandleIsUndefined()) { + stringBuilder += '$'; + continueFromIndex = peekIndex; + break; + } + JSHandle greaterSymString = factory->NewFromASCII(">"); + int pos = replacement->IndexOf(*greaterSymString, peekIndex); + if (pos == -1) { + stringBuilder += '$'; + continueFromIndex = peekIndex; + break; + } + JSHandle groupName(thread, + EcmaString::FastSubString(replacement, + peekIndex + 1, pos - peekIndex - 1, ecmaVm)); + JSHandle names(groupName); + JSHandle capture = JSObject::GetProperty(thread, namedCaptures, names).GetValue(); + if (capture->IsUndefined()) { + continueFromIndex = pos + 1; + p = pos; + break; + } + JSHandle captureName(capture); + if (captureName->IsUtf16()) { + const uint16_t *data = captureName->GetDataUtf16(); + stringBuilder += base::StringHelper::Utf16ToU16String(data, captureName->GetLength()); + canBeCompress = false; + } else { + const uint8_t *data = captureName->GetDataUtf8(); + stringBuilder += base::StringHelper::Utf8ToU16String(data, captureName->GetLength()); + } + continueFromIndex = pos + 1; + p = pos; + break; + } default: stringBuilder += '$'; continueFromIndex = peekIndex; diff --git 
a/ecmascript/builtins/builtins_string.h b/ecmascript/builtins/builtins_string.h index 34c82e149e..80489a25df 100644 --- a/ecmascript/builtins/builtins_string.h +++ b/ecmascript/builtins/builtins_string.h @@ -43,6 +43,7 @@ public: static JSTaggedValue GetSubstitution(JSThread *thread, const JSHandle &matched, const JSHandle &srcString, int position, const JSHandle &captureList, + const JSHandle &namedCaptures, const JSHandle &replacement); // 21.1.3.1 static JSTaggedValue CharAt(EcmaRuntimeCallInfo *argv); @@ -78,6 +79,7 @@ public: // 21.1.3.14 static JSTaggedValue Replace(EcmaRuntimeCallInfo *argv); // 21.1.3.14.1 Runtime Semantics: GetSubstitution() + static JSTaggedValue ReplaceAll(EcmaRuntimeCallInfo *argv); // 21.1.3.15 static JSTaggedValue Search(EcmaRuntimeCallInfo *argv); // 21.1.3.16 diff --git a/ecmascript/dump.cpp b/ecmascript/dump.cpp index 955705616a..40f99dcbc7 100644 --- a/ecmascript/dump.cpp +++ b/ecmascript/dump.cpp @@ -1932,13 +1932,16 @@ void JSRegExp::Dump(std::ostream &os) const { os << "\n"; os << " - ByteCodeBuffer: "; - GetByteCodeBuffer().Dump(os); + GetByteCodeBuffer().D(); os << "\n"; os << " - OriginalSource: "; - GetOriginalSource().Dump(os); + GetOriginalSource().D(); os << "\n"; os << " - OriginalFlags: "; - GetOriginalFlags().Dump(os); + GetOriginalFlags().D(); + os << "\n"; + os << " - GroupName: "; + GetGroupName().D(); os << "\n"; os << " - Length: " << GetLength(); os << "\n"; @@ -4086,7 +4089,7 @@ void JSRegExp::DumpForSnapshot(std::vector> &v { vec.push_back(std::make_pair(CString("originalSource"), GetOriginalSource())); vec.push_back(std::make_pair(CString("originalFlags"), GetOriginalFlags())); - + vec.push_back(std::make_pair(CString("groupName"), GetGroupName())); JSObject::DumpForSnapshot(vec); } diff --git a/ecmascript/ecma_string.cpp b/ecmascript/ecma_string.cpp index f6e71e533c..7fb6ccbf50 100644 --- a/ecmascript/ecma_string.cpp +++ b/ecmascript/ecma_string.cpp @@ -179,12 +179,13 @@ int32_t EcmaString::IndexOf(const 
EcmaString *rhs, int32_t pos) const const EcmaString *lhs = this; int32_t lhsCount = static_cast(lhs->GetLength()); int32_t rhsCount = static_cast(rhs->GetLength()); - if (rhsCount == 0) { - return pos; + + if (pos > lhsCount) { + return -1; } - if (pos >= lhsCount) { - return -1; + if (rhsCount == 0) { + return pos; } if (pos < 0) { diff --git a/ecmascript/global_env_constants.cpp b/ecmascript/global_env_constants.cpp index ee61102144..50df476a8b 100644 --- a/ecmascript/global_env_constants.cpp +++ b/ecmascript/global_env_constants.cpp @@ -407,6 +407,7 @@ void GlobalEnvConstants::InitGlobalConstant(JSThread *thread) SetConstant(ConstantIndex::FRACTION_STRING_INDEX, factory->NewFromASCIINonMovable("fraction")); SetConstant(ConstantIndex::DECIMAL_STRING_INDEX, factory->NewFromASCIINonMovable("decimal")); SetConstant(ConstantIndex::GROUP_STRING_INDEX, factory->NewFromASCIINonMovable("group")); + SetConstant(ConstantIndex::GROUPS_STRING_INDEX, factory->NewFromASCIINonMovable("groups")); SetConstant(ConstantIndex::CURRENCY_STRING_INDEX, factory->NewFromASCIINonMovable("currency")); SetConstant(ConstantIndex::CURRENCY_SIGN_STRING_INDEX, factory->NewFromASCIINonMovable("currencySign")); SetConstant(ConstantIndex::CURRENCY_DISPLAY_STRING_INDEX, factory->NewFromASCIINonMovable("currencyDisplay")); diff --git a/ecmascript/global_env_constants.h b/ecmascript/global_env_constants.h index 8cfa6cc553..bbff81c96c 100644 --- a/ecmascript/global_env_constants.h +++ b/ecmascript/global_env_constants.h @@ -280,6 +280,7 @@ class JSThread; V(JSTaggedValue, FractionString, FRACTION_STRING_INDEX, fraction) \ V(JSTaggedValue, DecimalString, DECIMAL_STRING_INDEX, decimal) \ V(JSTaggedValue, GroupString, GROUP_STRING_INDEX, group) \ + V(JSTaggedValue, GroupsString, GROUPS_STRING_INDEX, groups) \ V(JSTaggedValue, CurrencyString, CURRENCY_STRING_INDEX, currency) \ V(JSTaggedValue, CurrencySignString, CURRENCY_SIGN_STRING_INDEX, currencySign) \ V(JSTaggedValue, CurrencyDisplayString, 
CURRENCY_DISPLAY_STRING_INDEX, currencyDisplay) \ diff --git a/ecmascript/js_regexp.h b/ecmascript/js_regexp.h index b2452ca596..1cce45515f 100644 --- a/ecmascript/js_regexp.h +++ b/ecmascript/js_regexp.h @@ -29,7 +29,8 @@ public: static constexpr size_t REGEXP_BYTE_CODE_OFFSET = JSObject::SIZE; ACCESSORS(ByteCodeBuffer, REGEXP_BYTE_CODE_OFFSET, ORIGINAL_SOURCE_OFFSET) ACCESSORS(OriginalSource, ORIGINAL_SOURCE_OFFSET, ORIGINAL_FLAGS_OFFSET) - ACCESSORS(OriginalFlags, ORIGINAL_FLAGS_OFFSET, LENGTH_OFFSET) + ACCESSORS(OriginalFlags, ORIGINAL_FLAGS_OFFSET, GROUP_NAME_OFFSET) + ACCESSORS(GroupName, GROUP_NAME_OFFSET, LENGTH_OFFSET) ACCESSORS_PRIMITIVE_FIELD(Length, uint32_t, LENGTH_OFFSET, LAST_OFFSET) DEFINE_ALIGN_SIZE(LAST_OFFSET); diff --git a/ecmascript/object_factory.cpp b/ecmascript/object_factory.cpp index fd709bc5d3..e4f6443ae3 100644 --- a/ecmascript/object_factory.cpp +++ b/ecmascript/object_factory.cpp @@ -1055,6 +1055,7 @@ void ObjectFactory::InitializeJSObject(const JSHandle &obj, const JSHa JSRegExp::Cast(*obj)->SetByteCodeBuffer(thread_, JSTaggedValue::Undefined()); JSRegExp::Cast(*obj)->SetOriginalSource(thread_, JSTaggedValue::Undefined()); JSRegExp::Cast(*obj)->SetOriginalFlags(thread_, JSTaggedValue(0)); + JSRegExp::Cast(*obj)->SetGroupName(thread_, JSTaggedValue::Undefined()); JSRegExp::Cast(*obj)->SetLength(0); break; case JSType::JS_PRIMITIVE_REF: diff --git a/ecmascript/regexp/regexp_opcode.h b/ecmascript/regexp/regexp_opcode.h index 9fdc5aa102..a9c8500daa 100644 --- a/ecmascript/regexp/regexp_opcode.h +++ b/ecmascript/regexp/regexp_opcode.h @@ -344,7 +344,6 @@ public: } return false; } - inline uint32_t HighestValue() const { if (!rangeSet_.empty()) { @@ -352,7 +351,6 @@ public: } return 0; } - RangeSet(RangeSet const &) = default; RangeSet &operator=(RangeSet const &) = default; RangeSet(RangeSet &&) = default; diff --git a/ecmascript/regexp/regexp_parser.cpp b/ecmascript/regexp/regexp_parser.cpp index 8e481adacc..bbe85060b5 100644 --- 
a/ecmascript/regexp/regexp_parser.cpp +++ b/ecmascript/regexp/regexp_parser.cpp @@ -21,10 +21,15 @@ #include "libpandabase/utils/utils.h" #include "securec.h" #include "unicode/uniset.h" - +#include "third_party/icu/icu4c/source/common/unicode/uchar.h" #define _NO_DEBUG_ namespace panda::ecmascript { +static constexpr uint32_t CACHE_SIZE = 128; +static constexpr uint32_t ID_START_TABLE_ASCII[4] = { + /* $ A-Z _ a-z */ + 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE +}; static RangeSet g_rangeD(0x30, 0x39); // NOLINTNEXTLINE(fuchsia-statically-constructed-objects) // NOLINTNEXTLINE(fuchsia-statically-constructed-objects) static RangeSet g_rangeS({ @@ -539,6 +544,7 @@ bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward) return false; } groupNames_.EmitStr(name.c_str()); + newGroupNames_.push_back(name); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) PrintF("group name %s", name.c_str()); Advance(); @@ -758,24 +764,42 @@ void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int cap bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name) { const uint8_t *p = *pp; - int c = *p; - while (c != '>') { - if (c < (INT8_MAX + 1)) { - if (name.empty()) { - if (!g_regexpIdentifyStart.IsContain(c)) { - return false; - } - } else { - if (!g_regexpIdentifyContinue.IsContain(c)) { - return false; - } + uint32_t c ; + char buffer[CACHE_SIZE] = {0}; + char *q = buffer; + while (true) { + c = *p; + if (c == '\\') { + p++; + if (*p != 'u') { + return false; } - name += static_cast(c); + if (!ParseUnicodeEscape(&c)) { + return false; + } + } else if (c == '>') { + break; + } else if (c > CACHE_SIZE) { + c = base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p); + } else { + p++; } - c = *++p; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - } - p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + if (q == buffer) { + if (!IsIdentFirst(c)) { + return false; + } + } else { 
+ if (!u_isIDPart(c)) { + return false; + } + } + if (q != nullptr) { + *q++ = c; + } + } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + p++; *pp = p; + name = buffer; return true; } @@ -784,6 +808,7 @@ int RegExpParser::ParseCaptureCount(const char *groupName) const uint8_t *p; int captureIndex = 1; CString name; + hasNamedCaptures_ = 0; for (p = base_; p < end_; p++) { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) switch (*p) { case '(': { @@ -793,6 +818,7 @@ int RegExpParser::ParseCaptureCount(const char *groupName) // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) p[CAPTURE_CONUT_ADVANCE] != '=') { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + hasNamedCaptures_ = 1; p += CAPTURE_CONUT_ADVANCE; if (groupName != nullptr) { if (ParseGroupSpecifier(&p, name)) { @@ -836,6 +862,7 @@ int RegExpParser::ParseAtomEscape(bool isBackward) int result = -1; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) PrintF("Parse AtomEscape------\n"); + PrevOpCode prevOp; switch (c0_) { case KEY_EOF: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) @@ -870,50 +897,108 @@ int RegExpParser::ParseAtomEscape(bool isBackward) case 'd': { // [0-9] RangeOpCode rangeOp; + if (isBackward) { + prevOp.EmitOpCode(&buffer_, 0); + } rangeOp.InsertOpCode(&buffer_, g_rangeD); - Advance(); + goto parseLookBehind; } break; case 'D': { // [^0-9] RangeSet atomRange(g_rangeD); atomRange.Invert(IsUtf16()); Range32OpCode rangeOp; + if (isBackward) { + prevOp.EmitOpCode(&buffer_, 0); + } rangeOp.InsertOpCode(&buffer_, atomRange); - Advance(); + goto parseLookBehind; } break; case 's': { // [\f\n\r\t\v] RangeOpCode rangeOp; + if (isBackward) { + prevOp.EmitOpCode(&buffer_, 0); + } rangeOp.InsertOpCode(&buffer_, g_rangeS); - Advance(); + goto parseLookBehind; } break; case 'S': { RangeSet atomRange(g_rangeS); - atomRange.Invert(IsUtf16()); Range32OpCode rangeOp; + atomRange.Invert(IsUtf16()); + if (isBackward) { + 
prevOp.EmitOpCode(&buffer_, 0); + } rangeOp.InsertOpCode(&buffer_, atomRange); - Advance(); + goto parseLookBehind; } break; case 'w': { // [A-Za-z0-9] RangeOpCode rangeOp; + if (isBackward) { + prevOp.EmitOpCode(&buffer_, 0); + } rangeOp.InsertOpCode(&buffer_, g_rangeW); - Advance(); + goto parseLookBehind; } break; case 'W': { // [^A-Za-z0-9] RangeSet atomRange(g_rangeW); atomRange.Invert(IsUtf16()); Range32OpCode rangeOp; + if (isBackward) { + prevOp.EmitOpCode(&buffer_, 0); + } rangeOp.InsertOpCode(&buffer_, atomRange); - Advance(); + goto parseLookBehind; } break; // P{UnicodePropertyValueExpression} // p{UnicodePropertyValueExpression} case 'P': case 'p': // [+N]kGroupName[?U] - case 'k': + case 'k': { + Advance(); + if (c0_ != '<') { + if (!IsUtf16() || HasNamedCaptures()) { + ParseError("expecting group name."); + break; + } + } + Advance(); + Prev(); + CString name; + auto **pp = const_cast(&pc_); + if (!ParseGroupSpecifier(pp, name)) { + ParseError("GroupName Syntax error."); + break; + } + int postion = FindGroupName(name); + if (postion < 0) { + postion = ParseCaptureCount(name.c_str()); + if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) { + ParseError("group name not defined"); + break; + } + } + if (isBackward) { + BackwardBackReferenceOpCode backReferenceOp; + backReferenceOp.EmitOpCode(&buffer_, postion); + } else { + BackReferenceOpCode backReferenceOp; + backReferenceOp.EmitOpCode(&buffer_, postion); + } + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) + Advance(); + } break; + parseLookBehind: { + if (isBackward) { + prevOp.EmitOpCode(&buffer_, 0); + } + Advance(); + break; + } default: result = ParseCharacterEscape(); break; @@ -921,6 +1006,22 @@ int RegExpParser::ParseAtomEscape(bool isBackward) return result; } +int RegExpParser::RecountCaptures() +{ + if (totalCaptureCount_ < 0) { + const char *name = reinterpret_cast(groupNames_.buf_); + totalCaptureCount_ = ParseCaptureCount(name); + } + return totalCaptureCount_; +} +bool 
RegExpParser::HasNamedCaptures() +{ + if (hasNamedCaptures_ < 0) { /* still negative (declared with a -1 default): ask RecountCaptures to run the capture scan */ + RecountCaptures(); + } + return hasNamedCaptures_ > 0; /* ParseCaptureCount stores 0 when it starts and 1 once it meets a named group; a value left negative reads as false */ +} + int RegExpParser::ParseCharacterEscape() { // CharacterEscape[U]:: @@ -1304,4 +1405,13 @@ void RegExpParser::ParseError(const char *errorMessage) UNREACHABLE(); } } -} // namespace panda::ecmascript + +int RegExpParser::IsIdentFirst(uint32_t c) +{ + if (c < CACHE_SIZE) { + return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31 + } else { + return u_isIDStart(c); + } +} +} // namespace panda::ecmascript \ No newline at end of file diff --git a/ecmascript/regexp/regexp_parser.h b/ecmascript/regexp/regexp_parser.h index 278c923dae..3d974d2e54 100644 --- a/ecmascript/regexp/regexp_parser.h +++ b/ecmascript/regexp/regexp_parser.h @@ -28,6 +28,7 @@ #include "unicode/utf16.h" #include "unicode/utf8.h" #include "unicode/utypes.h" +#include "unicode/udata.h" namespace panda::ecmascript { class RegExpParser { @@ -51,6 +52,7 @@ public: static constexpr uint32_t UNICODE_HEX_VALUE = 4; static constexpr uint32_t UNICODE_HEX_ADVANCE = 2; static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3; + static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6; explicit RegExpParser(Chunk *chunk) : base_(nullptr), @@ -105,7 +107,21 @@ public: bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value); bool ParseUnicodeEscape(uint32_t *value); bool ParserIntervalQuantifier(int *pmin, int *pmax); + bool HasNamedCaptures(); + int ParseEscape(const uint8_t **pp, int isUtf16); + int RecountCaptures(); + int IsIdentFirst(uint32_t c); + inline std::vector GetGroupNames() const + { + return newGroupNames_; + } + + inline size_t GetGroupNamesSize() const + { + return groupNames_.size_ ; + } + inline bool IsError() const { return isError_; @@ -227,8 +243,11 @@ private: int stackCount_; bool isError_; char errorMsg_[TMP_BUF_SIZE] = {0}; // NOLINTNEXTLINE(modernize-avoid-c-arrays) + int hasNamedCaptures_ = -1; + int totalCaptureCount_ =
-1; DynChunk buffer_; DynChunk groupNames_; + std::vector newGroupNames_; }; } // namespace panda::ecmascript #endif // ECMASCRIPT_REGEXP_PARSER_H diff --git a/ecmascript/runtime_call_id.h b/ecmascript/runtime_call_id.h index d1f02ae14e..74a770b2dd 100644 --- a/ecmascript/runtime_call_id.h +++ b/ecmascript/runtime_call_id.h @@ -505,6 +505,7 @@ namespace panda::ecmascript { V(String, PadEnd) \ V(String, Repeat) \ V(String, Replace) \ + V(String, ReplaceAll) \ V(String, Search) \ V(String, Slice) \ V(String, Split) \ diff --git a/ecmascript/snapshot/mem/snapshot_processor.cpp b/ecmascript/snapshot/mem/snapshot_processor.cpp index cc14e8b128..d6d15b57e8 100644 --- a/ecmascript/snapshot/mem/snapshot_processor.cpp +++ b/ecmascript/snapshot/mem/snapshot_processor.cpp @@ -472,6 +472,7 @@ static uintptr_t g_nativeTable[] = { reinterpret_cast(BuiltinsString::PadStart), reinterpret_cast(BuiltinsString::Repeat), reinterpret_cast(BuiltinsString::Replace), + reinterpret_cast(BuiltinsString::ReplaceAll), reinterpret_cast(BuiltinsString::Search), reinterpret_cast(BuiltinsString::Slice), reinterpret_cast(BuiltinsString::Split), diff --git a/ecmascript/tests/dump_test.cpp b/ecmascript/tests/dump_test.cpp index 15f8b29578..7278afea3c 100644 --- a/ecmascript/tests/dump_test.cpp +++ b/ecmascript/tests/dump_test.cpp @@ -275,6 +275,7 @@ static JSHandle NewJSRegExp(JSThread *thread, ObjectFactory *factory, JSHandle jSRegExp = JSHandle::Cast(factory->NewJSObject(jSRegExpClass)); jSRegExp->SetByteCodeBuffer(thread, JSTaggedValue::Undefined()); jSRegExp->SetOriginalSource(thread, JSTaggedValue::Undefined()); + jSRegExp->SetGroupName(thread, JSTaggedValue::Undefined()); jSRegExp->SetOriginalFlags(thread, JSTaggedValue(0)); jSRegExp->SetLength(0); return jSRegExp; @@ -500,7 +501,7 @@ HWTEST_F_L0(EcmaDumpTest, HeapProfileDump) break; } case JSType::JS_REG_EXP: { - CHECK_DUMP_FIELDS(JSObject::SIZE, JSRegExp::SIZE, 4U) + CHECK_DUMP_FIELDS(JSObject::SIZE, JSRegExp::SIZE, 5U) 
NEW_OBJECT_AND_DUMP(JSRegExp, JS_REG_EXP) break; }