Optimize internal string concatenation in RegExp replace

Issue: https://gitee.com/open_harmony/dashboard?issue_id=I9GE8E

Signed-off-by: chenjx-huawei <chenjingxiang1@huawei.com>
Change-Id: I94e2be5fc98d98e7346f54845359ed4d667e6c6e
This commit is contained in:
chenjx-huawei 2024-04-15 09:48:19 +08:00
parent e9edd355d5
commit d5086b5c96
8 changed files with 256 additions and 24 deletions

View File

@ -1082,7 +1082,11 @@ JSTaggedValue BuiltinsRegExp::ReplaceInternal(JSThread *thread,
}
}
// 14. Let accumulatedResult be the empty String value.
JSMutableHandle<EcmaString> accumulatedResult(thread, factory->GetEmptyString());
bool isUtf8 = EcmaStringAccessor(srcString).IsUtf8();
uint32_t resultStrLength = 0;
uint32_t resultArrayLength = (resultsIndex + 1) * 2;
JSHandle<TaggedArray> resultArray = factory->NewTaggedArray(resultArrayLength);
std::vector<uint64_t> resultLengthArray(resultArrayLength);
// 15. Let nextSourcePosition be 0.
uint32_t nextSourcePosition = 0;
JSMutableHandle<JSTaggedValue> getMatchString(thread, JSTaggedValue::Undefined());
@ -1214,39 +1218,49 @@ JSTaggedValue BuiltinsRegExp::ReplaceInternal(JSThread *thread,
// ii. Let accumulatedResult be the String formed by concatenating the code units of the current value
// of accumulatedResult with the substring of S consisting of the code units from nextSourcePosition
// (inclusive) up to position (exclusive) and with the code units of replacement.
auto substr = EcmaStringAccessor::FastSubString(thread->GetEcmaVM(),
JSHandle<EcmaString>::Cast(inputStr), nextSourcePosition, position - nextSourcePosition);
accumulatedResult.Update(JSHandle<EcmaString>(thread, EcmaStringAccessor::Concat(thread->GetEcmaVM(),
accumulatedResult, JSHandle<EcmaString>(thread, substr))));
accumulatedResult.Update(JSHandle<EcmaString>(thread, EcmaStringAccessor::Concat(thread->GetEcmaVM(),
accumulatedResult, replacementString)));
// store undefined in resultArray
resultArray->Set(thread, REPLACE_RESULT_VAL * i, JSTaggedValue::Undefined());
uint64_t bits = 0;
bits |= ReplaceLengthField::Encode(position - nextSourcePosition);
bits |= ReplacePositionField::Encode(nextSourcePosition);
// store position and length bits in resultLengthArray
resultLengthArray[REPLACE_RESULT_VAL * i] = bits;
resultStrLength += (position - nextSourcePosition);
// store replacement string in resultArray
resultArray->Set(thread, REPLACE_RESULT_VAL * i + 1, replacementString.GetTaggedValue());
uint32_t replacementLength = EcmaStringAccessor(replacementString).GetLength();
// store length of replacement string in resultLengthArray
resultLengthArray[REPLACE_RESULT_VAL * i + 1] = static_cast<uint64_t>(replacementLength);
resultStrLength += replacementLength;
isUtf8 &= EcmaStringAccessor(replacementString).IsUtf8();
// iii. Let nextSourcePosition be position + matchLength.
nextSourcePosition = position + matchLength;
}
}
// 17. If nextSourcePosition ≥ lengthS, return accumulatedResult.
if (nextSourcePosition >= length) {
if (useCache) {
RegExpExecResultCache::AddResultInCache(thread, cacheTable, thisObj, string,
JSHandle<JSTaggedValue>(accumulatedResult),
RegExpExecResultCache::REPLACE_TYPE, 0, nextIndexHandle->GetInt(),
inputReplaceValue.GetTaggedValue());
}
return accumulatedResult.GetTaggedValue();
if (nextSourcePosition < length) {
// store undefined in resultArray
resultArray->Set(thread, REPLACE_RESULT_VAL * resultsIndex, JSTaggedValue::Undefined());
uint64_t bits = 0;
bits |= ReplaceLengthField::Encode(length - nextSourcePosition);
bits |= ReplacePositionField::Encode(nextSourcePosition);
// store position and length bits in resultLengthArray
resultLengthArray[REPLACE_RESULT_VAL * resultsIndex] = bits;
resultStrLength += (length - nextSourcePosition);
}
JSHandle<EcmaString> result =
CreateStringFromResultArray(thread, resultArray, resultLengthArray, srcString, resultStrLength, isUtf8);
// 18. Return the String formed by concatenating the code units of accumulatedResult with the substring of S
// consisting of the code units from nextSourcePosition (inclusive) up through the final code unit of S(inclusive).
auto substr = EcmaStringAccessor::FastSubString(thread->GetEcmaVM(),
JSHandle<EcmaString>::Cast(inputStr), nextSourcePosition, length - nextSourcePosition);
accumulatedResult.Update(JSHandle<EcmaString>(thread, EcmaStringAccessor::Concat(thread->GetEcmaVM(),
accumulatedResult, JSHandle<EcmaString>(thread, substr))));
if (useCache) {
RegExpExecResultCache::AddResultInCache(thread, cacheTable, thisObj, string,
JSHandle<JSTaggedValue>(accumulatedResult),
JSHandle<JSTaggedValue>(result),
RegExpExecResultCache::REPLACE_TYPE, 0, nextIndexHandle->GetInt(),
inputReplaceValue.GetTaggedValue());
}
return accumulatedResult.GetTaggedValue();
return result.GetTaggedValue();
}
// 21.2.5.9
@ -2774,4 +2788,49 @@ JSTaggedValue BuiltinsRegExp::GetExecResultGroups(JSThread *thread, const JSHand
RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread);
return groups;
}
// Assembles the result string of a RegExp replace from the pieces recorded by the caller.
//
// resultArray        one tag per piece: Hole = unused slot (skipped), Undefined = the piece is a
//                    substring of srcString, any other value = a replacement EcmaString to copy.
// resultLengthArray  parallel to resultArray. For a substring piece the entry packs the start
//                    position and the length (ReplacePositionField / ReplaceLengthField); for a
//                    replacement piece it holds that string's length.
// srcString          string the substring pieces are copied from; a tree string is replaced by
//                    its flattened form before copying.
// resultStrLength    length the output line string is created with.
// isUtf8             selects whether the output is created and written through the UTF-8 buffer
//                    (true) or the UTF-16 buffer (false).
// Returns a handle to the newly created line string.
JSHandle<EcmaString> BuiltinsRegExp::CreateStringFromResultArray(JSThread *thread,
    const JSHandle<TaggedArray> resultArray, const std::vector<uint64_t> &resultLengthArray,
    JSHandle<EcmaString> srcString, uint32_t resultStrLength, bool isUtf8)
{
    JSHandle<EcmaString> result = JSHandle<EcmaString>(thread,
        EcmaStringAccessor::CreateLineString(thread->GetEcmaVM(), resultStrLength, isUtf8));
    FlatStringInfo flatStrInfo = EcmaStringAccessor::FlattenAllString(thread->GetEcmaVM(), srcString);
    if (EcmaStringAccessor(srcString).IsTreeString()) { // use flattenedString as srcString
        srcString = JSHandle<EcmaString>(thread, flatStrInfo.GetString());
    }
    // Dereference the result handle only after flattening. FlatStringInfo keeps a raw EcmaString
    // pointer, and flattening may allocate; a pointer taken before that could be stale if the
    // allocation moved the object. Nothing below this line allocates.
    FlatStringInfo resultInfo = FlatStringInfo(*result, 0, resultStrLength);
    uint32_t nextPos = 0; // write offset into the result, in code units
    uint32_t resultArrayLength = resultArray->GetLength();
    for (uint32_t i = 0; i < resultArrayLength; i++) {
        JSTaggedValue substrValue = resultArray->Get(thread, i);
        if (substrValue.IsHole()) {
            // slot was never filled by the caller
            continue;
        }
        resultInfo.SetStartIndex(nextPos);
        if (substrValue.IsUndefined()) {
            // piece is srcString[subPosition, subPosition + subLength)
            uint64_t bits = resultLengthArray[i];
            uint32_t subLength = ReplaceLengthField::Decode(bits);
            uint32_t subPosition = ReplacePositionField::Decode(bits);
            if (isUtf8) {
                EcmaStringAccessor::WriteToFlatWithPos<uint8_t>(*srcString, resultInfo.GetDataUtf8Writable(),
                                                                subLength, subPosition);
            } else {
                EcmaStringAccessor::WriteToFlatWithPos<uint16_t>(*srcString, resultInfo.GetDataUtf16Writable(),
                                                                 subLength, subPosition);
            }
            nextPos += subLength;
        } else {
            // piece is a whole replacement string
            EcmaString *replacementStr = EcmaString::Cast(substrValue.GetTaggedObject());
            uint32_t replaceLength = static_cast<uint32_t>(resultLengthArray[i]);
            if (isUtf8) {
                EcmaStringAccessor::WriteToFlat(replacementStr, resultInfo.GetDataUtf8Writable(), replaceLength);
            } else {
                EcmaStringAccessor::WriteToFlat(replacementStr, resultInfo.GetDataUtf16Writable(), replaceLength);
            }
            nextPos += replaceLength;
        }
    }
    return result;
}
} // namespace panda::ecmascript::builtins

View File

@ -129,6 +129,12 @@ private:
static constexpr uint32_t EXEC_RESULT_INPUT_OFFSET = 2;
static constexpr uint32_t EXEC_RESULT_GROUPS_OFFSET = 3;
// Number of slots used per match in the replace result arrays: slot REPLACE_RESULT_VAL * i describes
// the source substring preceding match i, slot REPLACE_RESULT_VAL * i + 1 the replacement string.
static constexpr uint32_t REPLACE_RESULT_VAL = 2;
// Widths of the two fields packed into one uint64_t entry that describes a source substring.
static constexpr unsigned REPLACE_LENGTH_BITS = 30;
static constexpr unsigned REPLACE_POSITION_BITS = 30;
// Length of the substring, stored from bit 0.
using ReplaceLengthField = BitField<uint32_t, 0, REPLACE_LENGTH_BITS>; // 30
// Start position of the substring, stored in the field following ReplaceLengthField
// (presumably bits 30..59 — confirm against BitField::NextField).
using ReplacePositionField = ReplaceLengthField::NextField<uint32_t, REPLACE_POSITION_BITS>; // 60
static bool Matcher(JSThread *thread, const JSHandle<JSTaggedValue> regexp,
const uint8_t *buffer, size_t length, int32_t lastindex, bool isUtf16);
@ -160,6 +166,9 @@ private:
static JSTaggedValue RegExpSplitFast(JSThread *thread, const JSHandle<JSTaggedValue> regexp,
JSHandle<JSTaggedValue> string, uint32_t limit, bool useCache);
static bool GetOringinalFlag(JSThread *thread, const JSHandle<JSTaggedValue> regexp, uint32_t flag);
// Builds the replace result string: creates a line string of resultStrLength and fills it, in order,
// with the substrings of srcString and the replacement strings described by resultArray /
// resultLengthArray. isUtf8 selects the UTF-8 or UTF-16 representation of the created string.
static JSHandle<EcmaString> CreateStringFromResultArray(JSThread *thread, const JSHandle<TaggedArray> resultArray,
const std::vector<uint64_t> &resultLengthArray, JSHandle<EcmaString> srcString,
uint32_t resultStrLength, bool isUtf8);
};
class RegExpExecResultCache : public TaggedArray {

View File

@ -1665,7 +1665,7 @@ JSTaggedValue BuiltinsString::CreateArrayFromString(JSThread *thread, EcmaVM *ec
{
bool isUtf8 = EcmaStringAccessor(thisString).IsUtf8();
bool canBeCompressed = false;
if (EcmaStringAccessor(thisString).IsLineString() || EcmaStringAccessor(thisString).IsConstantString()) {
if (EcmaStringAccessor(thisString).IsLineOrConstantString()) {
canBeCompressed = EcmaStringAccessor::CanBeCompressed(*thisString);
}
bool isOneByte = isUtf8 & canBeCompressed;

View File

@ -423,6 +423,55 @@ void EcmaString::WriteToFlat(EcmaString *src, Char *buf, uint32_t maxLength)
}
}
// Copies `length` code units of `src`, starting at code-unit offset `pos`, into `buf`.
//
// src     line, constant, tree or sliced string. For a tree string only its first child is read,
//         so the tree is expected to have been flattened already (the first child must be a line string).
// buf     destination; must have room for `length` elements of type Char.
// length  number of code units to copy; 0 is a no-op.
// pos     offset of the first code unit to copy, relative to the start of `src`.
template <typename Char>
void EcmaString::WriteToFlatWithPos(EcmaString *src, Char *buf, uint32_t length, uint32_t pos)
{
    DISALLOW_GARBAGE_COLLECTION;
    if (length == 0) {
        return;
    }
    while (true) {
        // Re-read the length on every pass because `src` is re-pointed for tree strings.
        [[maybe_unused]] uint32_t srcLength = src->GetLength();
        // Two comparisons instead of `length + pos <= srcLength`, so the check cannot wrap around.
        ASSERT(pos <= srcLength && length <= srcLength - pos);
        switch (src->GetStringType()) {
            case JSType::LINE_STRING: {
                if (src->IsUtf8()) {
                    CopyChars(buf, src->GetDataUtf8() + pos, length);
                } else {
                    CopyChars(buf, src->GetDataUtf16() + pos, length);
                }
                return;
            }
            case JSType::CONSTANT_STRING: {
                ASSERT(src->IsUtf8());
                CopyChars(buf, src->GetDataUtf8() + pos, length);
                return;
            }
            case JSType::TREE_STRING: {
                // Continue with the first child, which holds the flattened data.
                TreeEcmaString *treeSrc = TreeEcmaString::Cast(src);
                EcmaString *first = EcmaString::Cast(treeSrc->GetFirst());
                ASSERT(first->IsLineString());
                src = first;
                continue;
            }
            case JSType::SLICED_STRING: {
                // The data lives in the parent; shift by the slice's own start index.
                EcmaString *parent = EcmaString::Cast(SlicedString::Cast(src)->GetParent());
                uint32_t sliceStart = SlicedString::Cast(src)->GetStartIndex();
                if (src->IsUtf8()) {
                    CopyChars(buf, parent->GetDataUtf8() + sliceStart + pos, length);
                } else {
                    CopyChars(buf, parent->GetDataUtf16() + sliceStart + pos, length);
                }
                return;
            }
            default:
                LOG_ECMA(FATAL) << "this branch is unreachable";
                UNREACHABLE();
        }
    }
}
inline const uint8_t *FlatStringInfo::GetDataUtf8() const
{
return string_->GetDataUtf8() + startIndex_;
@ -438,6 +487,11 @@ inline uint8_t *FlatStringInfo::GetDataUtf8Writable() const
return string_->GetDataUtf8Writable() + startIndex_;
}
// Returns a writable pointer to the UTF-16 data of the underlying string, advanced by startIndex_.
inline uint16_t *FlatStringInfo::GetDataUtf16Writable() const
{
    uint16_t *base = string_->GetDataUtf16Writable();
    return base + startIndex_;
}
inline const uint8_t *EcmaStringAccessor::GetDataUtf8()
{
return string_->GetDataUtf8();

View File

@ -711,6 +711,9 @@ private:
template <typename Char>
static void WriteToFlat(EcmaString *src, Char *buf, uint32_t maxLength);
// Copies `length` code units of `src`, starting at offset `pos`, into `buf`.
template <typename Char>
static void WriteToFlatWithPos(EcmaString *src, Char *buf, uint32_t length, uint32_t pos);
static const uint8_t *PUBLIC_API GetUtf8DataFlat(const EcmaString *src, CVector<uint8_t> &buf);
static const uint16_t *PUBLIC_API GetUtf16DataFlat(const EcmaString *src, CVector<uint16_t> &buf);
@ -1024,6 +1027,11 @@ public:
return startIndex_;
}
void SetStartIndex(uint32_t index)
{
startIndex_ = index;
}
uint32_t GetLength() const
{
return length_;
@ -1032,6 +1040,7 @@ public:
const uint8_t *GetDataUtf8() const;
const uint16_t *GetDataUtf16() const;
uint8_t *GetDataUtf8Writable() const;
uint16_t *GetDataUtf16Writable() const;
std::u16string ToU16String(uint32_t len = 0);
private:
EcmaString *string_ {nullptr};
@ -1248,6 +1257,18 @@ public:
return string_->CopyDataUtf16(buf, maxLength);
}
// Forwards to EcmaString::WriteToFlatWithPos: copies `length` code units of `src`,
// starting at offset `pos`, into `buf`.
template <typename Char>
static void WriteToFlatWithPos(EcmaString *src, Char *buf, uint32_t length, uint32_t pos)
{
    // The callee is a static member, so name it through its class rather than through `src`.
    EcmaString::WriteToFlatWithPos(src, buf, length, pos);
}
// Forwards to EcmaString::WriteToFlat, passing `src`, `buf` and `maxLength` through unchanged.
template <typename Char>
static void WriteToFlat(EcmaString *src, Char *buf, uint32_t maxLength)
{
    // The callee is a static member, so name it through its class rather than through `src`.
    EcmaString::WriteToFlat(src, buf, maxLength);
}
// require dst is LineString
// not change src data structure.
// if src is not flat, this func has low efficiency.

View File

@ -12,3 +12,17 @@
# limitations under the License.
[1.1]
这是一段lineStringX1这是替换的字符串A2
这是一段lineStringX1这是替换的字符串X2
这是一段treeStringX1这是替换的字符串B2
这是一段treeStringX1这是替换的字符串X2
这是一段slicedStringX1这是要替换的字符串C2
这是一段slicedStringX1这是要替换的字符串X2
aaaaxxxxxxxyyyyyyyyybbbbxxxxxxxyyyyyyyyyccccxxxxxxxyyyyyyyyyaaaabbbbcccc
aaaaxxxxxxxxxxxxxxbbbbxxxxxxxxxxxxxxccccxxxxxxxxxxxxxxaaaabbbbcccc
aaaaxxxxxxxxxxxxxxyyyyyyyybbbbxxxxxxxxxxxxxxyyyyyyyyccccxxxxxxxxxxxxxxyyyyyyyyaaaabbbbcccc
aaaaxxxxxxxxxxxxxxybbbbxxxxxxxxxxxxxxyccccxxxxxxxxxxxxxxyaaaabbbbcccc
aaaa哈哈哈哈哈哈哈嘻嘻嘻嘻嘻嘻嘻嘻嘻bbbb哈哈哈哈哈哈哈嘻嘻嘻嘻嘻嘻嘻嘻嘻cccc哈哈哈哈哈哈哈嘻嘻嘻嘻嘻嘻嘻嘻嘻aaaabbbbcccc
aaaa哈哈哈哈哈哈哈哈哈哈哈哈哈哈bbbb哈哈哈哈哈哈哈哈哈哈哈哈哈哈cccc哈哈哈哈哈哈哈哈哈哈哈哈哈哈aaaabbbbcccc
aaaa哈哈哈哈哈哈哈哈哈嘻嘻嘻嘻嘻嘻嘻bbbb哈哈哈哈哈哈哈哈哈嘻嘻嘻嘻嘻嘻嘻cccc哈哈哈哈哈哈哈哈哈嘻嘻嘻嘻嘻嘻嘻aaaabbbbcccc
aaaa哈哈哈哈哈哈哈哈哈嘻bbbb哈哈哈哈哈哈哈哈哈嘻cccc哈哈哈哈哈哈哈哈哈嘻aaaabbbbcccc

View File

@ -29,4 +29,75 @@ r.exec = function() {
return coercibleValue;
};
let a = r[Symbol.replace]('', '[$<length>]');
print(a)
print(a)
// ---- RegExp replace on source strings that are constructed in different ways ----
// Source written as a single literal.
let lineString1 = "这是一段lineStringA1这是替换的字符串A2"
// Source built with concat(); per its name it is meant to be a tree string inside the engine
// (engine-dependent — confirm).
let treeString1 = "这是一段treeString".concat("B1这是替换的字符串B2")
// Source built with slice(); per its name it is meant to be a sliced string inside the engine
// (engine-dependent — confirm).
// NOTE(review): the literal has 29 characters before the run of 'x', so slice(0, 30) keeps one
// trailing 'x' — confirm the expected output for res5/res6 accounts for it.
let slicedString = "这是一段slicedStringC1这是要替换的字符串C2xxxxxxxx".slice(0, 30);

// re1 has no global flag, so only the first of A/B/C is replaced; re2 replaces every occurrence.
var re1 = /[ABC]/;
var re2 = /[ABC]/g;
var res1 = lineString1.replace(re1, "X");
var res2 = lineString1.replace(re2, "X");
var res3 = treeString1.replace(re1, "X");
var res4 = treeString1.replace(re2, "X");
var res5 = slicedString.replace(re1, "X");
var res6 = slicedString.replace(re2, "X");
print(res1)
print(res2)
print(res3)
print(res4)
print(res5)
print(res6)

// ---- Global replace with a callback whose return value is itself built with concat() ----
// func1..func4 return ASCII-only replacements, func5..func8 return CJK replacements; the halves
// have different lengths in each case (presumably to cover different internal representations of
// the replacement string — confirm).
let lineString2 = "aaaaAbbbbBccccCaaaabbbbcccc"
// 7 + 9 ASCII characters, two different letters.
function func1() {
    return "xxxxxxx".concat("yyyyyyyyy")
}
var res = lineString2.replace(re2, func1);
print(res)

// 7 + 7 ASCII characters, same letter in both halves.
function func2() {
    return "xxxxxxx".concat("xxxxxxx")
}
res = lineString2.replace(re2, func2);
print(res)

// 14 + 8 ASCII characters.
function func3() {
    return "xxxxxxxxxxxxxx".concat("yyyyyyyy")
}
res = lineString2.replace(re2, func3);
print(res)

// 14 + 1 ASCII characters.
function func4() {
    return "xxxxxxxxxxxxxx".concat("y")
}
res = lineString2.replace(re2, func4);
print(res)

// 7 + 9 CJK characters, two different characters.
function func5() {
    return "哈哈哈哈哈哈哈".concat("嘻嘻嘻嘻嘻嘻嘻嘻嘻")
}
res = lineString2.replace(re2, func5);
print(res)

// 7 + 7 CJK characters, same character in both halves.
function func6() {
    return "哈哈哈哈哈哈哈".concat("哈哈哈哈哈哈哈")
}
res = lineString2.replace(re2, func6);
print(res)

// 9 + 7 CJK characters.
function func7() {
    return "哈哈哈哈哈哈哈哈哈".concat("嘻嘻嘻嘻嘻嘻嘻")
}
res = lineString2.replace(re2, func7);
print(res)

// 9 + 1 CJK characters.
function func8() {
    return "哈哈哈哈哈哈哈哈哈".concat("嘻")
}
res = lineString2.replace(re2, func8);
print(res)

View File

@ -76,4 +76,8 @@ print(res15[0] == res16[0]);
var a = "12345678910"
var b = "12345678910"
var c = a.concat(b);
c.split("")
c.split("")
// Test split() when the receiver was produced by slice(); the engine presumably represents it
// as a sliced string (engine-dependent — confirm). The result is not printed; the call only
// needs to complete.
var d = a.slice(4)
d.split("")