mirror of
https://github.com/darlinghq/darling-JavaScriptCore.git
synced 2024-11-23 04:09:40 +00:00
2482 lines
101 KiB
C++
2482 lines
101 KiB
C++
/*
|
|
* Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved.
|
|
* Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "config.h"
|
|
#include "YarrInterpreter.h"
|
|
|
|
#include "Options.h"
|
|
#include "SuperSampler.h"
|
|
#include "Yarr.h"
|
|
#include "YarrCanonicalize.h"
|
|
#include <wtf/BumpPointerAllocator.h>
|
|
#include <wtf/CheckedArithmetic.h>
|
|
#include <wtf/DataLog.h>
|
|
#include <wtf/StackCheck.h>
|
|
#include <wtf/text/WTFString.h>
|
|
|
|
namespace JSC { namespace Yarr {
|
|
|
|
template<typename CharType>
|
|
class Interpreter {
|
|
public:
|
|
struct ParenthesesDisjunctionContext;
|
|
|
|
struct BackTrackInfoParentheses {
|
|
uintptr_t begin;
|
|
uintptr_t matchAmount;
|
|
ParenthesesDisjunctionContext* lastContext;
|
|
};
|
|
|
|
static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
|
|
{
|
|
context->next = backTrack->lastContext;
|
|
backTrack->lastContext = context;
|
|
++backTrack->matchAmount;
|
|
}
|
|
|
|
static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
|
|
{
|
|
RELEASE_ASSERT(backTrack->matchAmount);
|
|
RELEASE_ASSERT(backTrack->lastContext);
|
|
backTrack->lastContext = backTrack->lastContext->next;
|
|
--backTrack->matchAmount;
|
|
}
|
|
|
|
struct DisjunctionContext
|
|
{
|
|
DisjunctionContext() = default;
|
|
|
|
void* operator new(size_t, void* where)
|
|
{
|
|
return where;
|
|
}
|
|
|
|
static size_t allocationSize(unsigned numberOfFrames)
|
|
{
|
|
static_assert(alignof(DisjunctionContext) <= sizeof(void*), "");
|
|
size_t rawSize = (sizeof(DisjunctionContext) - sizeof(uintptr_t) + Checked<size_t>(numberOfFrames) * sizeof(uintptr_t)).unsafeGet();
|
|
size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize);
|
|
RELEASE_ASSERT(roundedSize >= rawSize);
|
|
return roundedSize;
|
|
}
|
|
|
|
int term { 0 };
|
|
unsigned matchBegin;
|
|
unsigned matchEnd;
|
|
uintptr_t frame[1];
|
|
};
|
|
|
|
DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
|
|
{
|
|
size_t size = DisjunctionContext::allocationSize(disjunction->m_frameSize);
|
|
allocatorPool = allocatorPool->ensureCapacity(size);
|
|
RELEASE_ASSERT(allocatorPool);
|
|
return new (allocatorPool->alloc(size)) DisjunctionContext();
|
|
}
|
|
|
|
void freeDisjunctionContext(DisjunctionContext* context)
|
|
{
|
|
allocatorPool = allocatorPool->dealloc(context);
|
|
}
|
|
|
|
struct ParenthesesDisjunctionContext
|
|
{
|
|
ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
|
|
{
|
|
unsigned firstSubpatternId = term.atom.subpatternId;
|
|
unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
|
|
|
|
for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
|
|
subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
|
|
output[(firstSubpatternId << 1) + i] = offsetNoMatch;
|
|
}
|
|
|
|
new (getDisjunctionContext(term)) DisjunctionContext();
|
|
}
|
|
|
|
void* operator new(size_t, void* where)
|
|
{
|
|
return where;
|
|
}
|
|
|
|
void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
|
|
{
|
|
for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
|
|
output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
|
|
}
|
|
|
|
DisjunctionContext* getDisjunctionContext(ByteTerm& term)
|
|
{
|
|
return bitwise_cast<DisjunctionContext*>(bitwise_cast<uintptr_t>(this) + allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns));
|
|
}
|
|
|
|
static size_t allocationSize(unsigned numberOfSubpatterns)
|
|
{
|
|
static_assert(alignof(ParenthesesDisjunctionContext) <= sizeof(void*), "");
|
|
size_t rawSize = (sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (Checked<size_t>(numberOfSubpatterns) * 2U) * sizeof(unsigned)).unsafeGet();
|
|
size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize);
|
|
RELEASE_ASSERT(roundedSize >= rawSize);
|
|
return roundedSize;
|
|
}
|
|
|
|
ParenthesesDisjunctionContext* next { nullptr };
|
|
unsigned subpatternBackup[1];
|
|
};
|
|
|
|
ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
|
|
{
|
|
size_t size = (Checked<size_t>(ParenthesesDisjunctionContext::allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns)) + DisjunctionContext::allocationSize(disjunction->m_frameSize)).unsafeGet();
|
|
allocatorPool = allocatorPool->ensureCapacity(size);
|
|
RELEASE_ASSERT(allocatorPool);
|
|
return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
|
|
}
|
|
|
|
void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
|
|
{
|
|
allocatorPool = allocatorPool->dealloc(context);
|
|
}
|
|
|
|
class InputStream {
|
|
public:
|
|
InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs)
|
|
: input(input)
|
|
, pos(start)
|
|
, length(length)
|
|
, decodeSurrogatePairs(decodeSurrogatePairs)
|
|
{
|
|
}
|
|
|
|
void next()
|
|
{
|
|
++pos;
|
|
}
|
|
|
|
void rewind(unsigned amount)
|
|
{
|
|
ASSERT(pos >= amount);
|
|
pos -= amount;
|
|
}
|
|
|
|
int read()
|
|
{
|
|
ASSERT(pos < length);
|
|
if (pos < length)
|
|
return input[pos];
|
|
return -1;
|
|
}
|
|
|
|
int readPair()
|
|
{
|
|
ASSERT(pos + 1 < length);
|
|
return input[pos] | input[pos + 1] << 16;
|
|
}
|
|
|
|
int readChecked(unsigned negativePositionOffest)
|
|
{
|
|
RELEASE_ASSERT(pos >= negativePositionOffest);
|
|
unsigned p = pos - negativePositionOffest;
|
|
ASSERT(p < length);
|
|
int result = input[p];
|
|
if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) {
|
|
if (atEnd())
|
|
return -1;
|
|
|
|
result = U16_GET_SUPPLEMENTARY(result, input[p + 1]);
|
|
next();
|
|
}
|
|
return result;
|
|
}
|
|
|
|
int readSurrogatePairChecked(unsigned negativePositionOffset)
|
|
{
|
|
RELEASE_ASSERT(pos >= negativePositionOffset);
|
|
unsigned p = pos - negativePositionOffset;
|
|
ASSERT(p < length);
|
|
if (p + 1 >= length)
|
|
return -1;
|
|
|
|
int first = input[p];
|
|
int second = input[p + 1];
|
|
if (U16_IS_LEAD(first) && U16_IS_TRAIL(second))
|
|
return U16_GET_SUPPLEMENTARY(first, second);
|
|
|
|
return -1;
|
|
}
|
|
|
|
int reread(unsigned from)
|
|
{
|
|
ASSERT(from < length);
|
|
int result = input[from];
|
|
if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1]))
|
|
result = U16_GET_SUPPLEMENTARY(result, input[from + 1]);
|
|
return result;
|
|
}
|
|
|
|
int prev()
|
|
{
|
|
ASSERT(!(pos > length));
|
|
if (pos && length)
|
|
return input[pos - 1];
|
|
return -1;
|
|
}
|
|
|
|
unsigned getPos()
|
|
{
|
|
return pos;
|
|
}
|
|
|
|
void setPos(unsigned p)
|
|
{
|
|
pos = p;
|
|
}
|
|
|
|
bool atStart()
|
|
{
|
|
return pos == 0;
|
|
}
|
|
|
|
bool atEnd()
|
|
{
|
|
return pos == length;
|
|
}
|
|
|
|
unsigned end()
|
|
{
|
|
return length;
|
|
}
|
|
|
|
bool checkInput(unsigned count)
|
|
{
|
|
if (((pos + count) <= length) && ((pos + count) >= pos)) {
|
|
pos += count;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void uncheckInput(unsigned count)
|
|
{
|
|
RELEASE_ASSERT(pos >= count);
|
|
pos -= count;
|
|
}
|
|
|
|
bool atStart(unsigned negativePositionOffset)
|
|
{
|
|
return pos == negativePositionOffset;
|
|
}
|
|
|
|
bool atEnd(unsigned negativePositionOffest)
|
|
{
|
|
RELEASE_ASSERT(pos >= negativePositionOffest);
|
|
return (pos - negativePositionOffest) == length;
|
|
}
|
|
|
|
bool isAvailableInput(unsigned offset)
|
|
{
|
|
return (((pos + offset) <= length) && ((pos + offset) >= pos));
|
|
}
|
|
|
|
private:
|
|
const CharType* input;
|
|
unsigned pos;
|
|
unsigned length;
|
|
bool decodeSurrogatePairs;
|
|
};
|
|
|
|
bool testCharacterClass(CharacterClass* characterClass, int ch)
|
|
{
|
|
auto linearSearchMatches = [&ch](const Vector<UChar32>& matches) {
|
|
for (unsigned i = 0; i < matches.size(); ++i) {
|
|
if (ch == matches[i])
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
};
|
|
|
|
auto binarySearchMatches = [&ch](const Vector<UChar32>& matches) {
|
|
size_t low = 0;
|
|
size_t high = matches.size() - 1;
|
|
|
|
while (low <= high) {
|
|
size_t mid = low + (high - low) / 2;
|
|
int diff = ch - matches[mid];
|
|
if (!diff)
|
|
return true;
|
|
|
|
if (diff < 0) {
|
|
if (mid == low)
|
|
return false;
|
|
high = mid - 1;
|
|
} else
|
|
low = mid + 1;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
auto linearSearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
|
|
for (unsigned i = 0; i < ranges.size(); ++i) {
|
|
if ((ch >= ranges[i].begin) && (ch <= ranges[i].end))
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
};
|
|
|
|
auto binarySearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
|
|
size_t low = 0;
|
|
size_t high = ranges.size() - 1;
|
|
|
|
while (low <= high) {
|
|
size_t mid = low + (high - low) / 2;
|
|
int rangeBeginDiff = ch - ranges[mid].begin;
|
|
if (rangeBeginDiff >= 0 && ch <= ranges[mid].end)
|
|
return true;
|
|
|
|
if (rangeBeginDiff < 0) {
|
|
if (mid == low)
|
|
return false;
|
|
high = mid - 1;
|
|
} else
|
|
low = mid + 1;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
if (characterClass->m_anyCharacter)
|
|
return true;
|
|
|
|
const size_t thresholdForBinarySearch = 6;
|
|
|
|
if (!isASCII(ch)) {
|
|
if (characterClass->m_matchesUnicode.size()) {
|
|
if (characterClass->m_matchesUnicode.size() > thresholdForBinarySearch) {
|
|
if (binarySearchMatches(characterClass->m_matchesUnicode))
|
|
return true;
|
|
} else if (linearSearchMatches(characterClass->m_matchesUnicode))
|
|
return true;
|
|
}
|
|
|
|
if (characterClass->m_rangesUnicode.size()) {
|
|
if (characterClass->m_rangesUnicode.size() > thresholdForBinarySearch) {
|
|
if (binarySearchRanges(characterClass->m_rangesUnicode))
|
|
return true;
|
|
} else if (linearSearchRanges(characterClass->m_rangesUnicode))
|
|
return true;
|
|
}
|
|
} else {
|
|
if (characterClass->m_matches.size()) {
|
|
if (characterClass->m_matches.size() > thresholdForBinarySearch) {
|
|
if (binarySearchMatches(characterClass->m_matches))
|
|
return true;
|
|
} else if (linearSearchMatches(characterClass->m_matches))
|
|
return true;
|
|
}
|
|
|
|
if (characterClass->m_ranges.size()) {
|
|
if (characterClass->m_ranges.size() > thresholdForBinarySearch) {
|
|
if (binarySearchRanges(characterClass->m_ranges))
|
|
return true;
|
|
} else if (linearSearchRanges(characterClass->m_ranges))
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool checkCharacter(int testChar, unsigned negativeInputOffset)
|
|
{
|
|
return testChar == input.readChecked(negativeInputOffset);
|
|
}
|
|
|
|
bool checkSurrogatePair(int testUnicodeChar, unsigned negativeInputOffset)
|
|
{
|
|
return testUnicodeChar == input.readSurrogatePairChecked(negativeInputOffset);
|
|
}
|
|
|
|
bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
|
|
{
|
|
int ch = input.readChecked(negativeInputOffset);
|
|
return (loChar == ch) || (hiChar == ch);
|
|
}
|
|
|
|
bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
|
|
{
|
|
bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
|
|
return invert ? !match : match;
|
|
}
|
|
|
|
bool checkCharacterClassDontAdvanceInputForNonBMP(CharacterClass* characterClass, unsigned negativeInputOffset)
|
|
{
|
|
int readCharacter = characterClass->hasOnlyNonBMPCharacters() ? input.readSurrogatePairChecked(negativeInputOffset) : input.readChecked(negativeInputOffset);
|
|
return testCharacterClass(characterClass, readCharacter);
|
|
}
|
|
|
|
bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
|
|
{
|
|
unsigned matchSize = (unsigned)(matchEnd - matchBegin);
|
|
|
|
if (!input.checkInput(matchSize))
|
|
return false;
|
|
|
|
for (unsigned i = 0; i < matchSize; ++i) {
|
|
int oldCh = input.reread(matchBegin + i);
|
|
int ch;
|
|
if (!U_IS_BMP(oldCh)) {
|
|
ch = input.readSurrogatePairChecked(negativeInputOffset + matchSize - i);
|
|
++i;
|
|
} else
|
|
ch = input.readChecked(negativeInputOffset + matchSize - i);
|
|
|
|
if (oldCh == ch)
|
|
continue;
|
|
|
|
if (pattern->ignoreCase()) {
|
|
// See ES 6.0, 21.2.2.8.2 for the definition of Canonicalize(). For non-Unicode
|
|
// patterns, Unicode values are never allowed to match against ASCII ones.
|
|
// For Unicode, we need to check all canonical equivalents of a character.
|
|
if (!unicode && (isASCII(oldCh) || isASCII(ch))) {
|
|
if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
|
|
continue;
|
|
} else if (areCanonicallyEquivalent(oldCh, ch, unicode ? CanonicalMode::Unicode : CanonicalMode::UCS2))
|
|
continue;
|
|
}
|
|
|
|
input.uncheckInput(matchSize);
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool matchAssertionBOL(ByteTerm& term)
|
|
{
|
|
return (input.atStart(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
|
|
}
|
|
|
|
bool matchAssertionEOL(ByteTerm& term)
|
|
{
|
|
if (term.inputPosition)
|
|
return (input.atEnd(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
|
|
|
|
return (input.atEnd()) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.read()));
|
|
}
|
|
|
|
bool matchAssertionWordBoundary(ByteTerm& term)
|
|
{
|
|
bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
|
|
bool readIsWordchar;
|
|
if (term.inputPosition)
|
|
readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
|
|
else
|
|
readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
|
|
|
|
bool wordBoundary = prevIsWordchar != readIsWordchar;
|
|
return term.invert() ? !wordBoundary : wordBoundary;
|
|
}
|
|
|
|
bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierFixedCount:
|
|
break;
|
|
|
|
case QuantifierGreedy:
|
|
if (backTrack->matchAmount) {
|
|
--backTrack->matchAmount;
|
|
input.uncheckInput(U16_LENGTH(term.atom.patternCharacter));
|
|
return true;
|
|
}
|
|
break;
|
|
|
|
case QuantifierNonGreedy:
|
|
if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
|
|
++backTrack->matchAmount;
|
|
if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
|
|
return true;
|
|
}
|
|
input.setPos(backTrack->begin);
|
|
break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierFixedCount:
|
|
break;
|
|
|
|
case QuantifierGreedy:
|
|
if (backTrack->matchAmount) {
|
|
--backTrack->matchAmount;
|
|
input.uncheckInput(1);
|
|
return true;
|
|
}
|
|
break;
|
|
|
|
case QuantifierNonGreedy:
|
|
if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
|
|
++backTrack->matchAmount;
|
|
if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
|
|
return true;
|
|
}
|
|
input.uncheckInput(backTrack->matchAmount);
|
|
break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeCharacterClass);
|
|
BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierFixedCount: {
|
|
if (unicode) {
|
|
CharacterClass* charClass = term.atom.characterClass;
|
|
backTrack->begin = input.getPos();
|
|
unsigned matchAmount = 0;
|
|
for (matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
|
|
if (term.invert()) {
|
|
if (!checkCharacterClass(charClass, term.invert(), term.inputPosition - matchAmount)) {
|
|
input.setPos(backTrack->begin);
|
|
return false;
|
|
}
|
|
} else {
|
|
unsigned matchOffset = matchAmount * (charClass->hasOnlyNonBMPCharacters() ? 2 : 1);
|
|
if (!checkCharacterClassDontAdvanceInputForNonBMP(charClass, term.inputPosition - matchOffset)) {
|
|
input.setPos(backTrack->begin);
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
|
|
if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
case QuantifierGreedy: {
|
|
unsigned position = input.getPos();
|
|
backTrack->begin = position;
|
|
unsigned matchAmount = 0;
|
|
while ((matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
|
|
if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
|
|
input.setPos(position);
|
|
break;
|
|
}
|
|
++matchAmount;
|
|
position = input.getPos();
|
|
}
|
|
backTrack->matchAmount = matchAmount;
|
|
|
|
return true;
|
|
}
|
|
|
|
case QuantifierNonGreedy:
|
|
backTrack->begin = input.getPos();
|
|
backTrack->matchAmount = 0;
|
|
return true;
|
|
}
|
|
|
|
RELEASE_ASSERT_NOT_REACHED();
|
|
return false;
|
|
}
|
|
|
|
bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeCharacterClass);
|
|
BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierFixedCount:
|
|
if (unicode)
|
|
input.setPos(backTrack->begin);
|
|
break;
|
|
|
|
case QuantifierGreedy:
|
|
if (backTrack->matchAmount) {
|
|
if (unicode) {
|
|
// Rematch one less match
|
|
input.setPos(backTrack->begin);
|
|
--backTrack->matchAmount;
|
|
for (unsigned matchAmount = 0; (matchAmount < backTrack->matchAmount) && input.checkInput(1); ++matchAmount) {
|
|
if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
|
|
input.uncheckInput(1);
|
|
break;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
--backTrack->matchAmount;
|
|
input.uncheckInput(1);
|
|
return true;
|
|
}
|
|
break;
|
|
|
|
case QuantifierNonGreedy:
|
|
if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
|
|
++backTrack->matchAmount;
|
|
if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
|
|
return true;
|
|
}
|
|
input.setPos(backTrack->begin);
|
|
break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeBackReference);
|
|
BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
|
|
|
|
unsigned matchBegin = output[(term.atom.subpatternId << 1)];
|
|
unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
|
|
|
|
// If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
|
|
// In this case the result of match is empty string like when it references to a parentheses with zero-width match.
|
|
// Eg.: /(a\1)/
|
|
if (matchEnd == offsetNoMatch)
|
|
return true;
|
|
|
|
if (matchBegin == offsetNoMatch)
|
|
return true;
|
|
|
|
ASSERT(matchBegin <= matchEnd);
|
|
|
|
if (matchBegin == matchEnd)
|
|
return true;
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierFixedCount: {
|
|
backTrack->begin = input.getPos();
|
|
for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
|
|
if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
|
|
input.setPos(backTrack->begin);
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
case QuantifierGreedy: {
|
|
unsigned matchAmount = 0;
|
|
while ((matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
|
|
++matchAmount;
|
|
backTrack->matchAmount = matchAmount;
|
|
return true;
|
|
}
|
|
|
|
case QuantifierNonGreedy:
|
|
backTrack->begin = input.getPos();
|
|
backTrack->matchAmount = 0;
|
|
return true;
|
|
}
|
|
|
|
RELEASE_ASSERT_NOT_REACHED();
|
|
return false;
|
|
}
|
|
|
|
bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeBackReference);
|
|
BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
|
|
|
|
unsigned matchBegin = output[(term.atom.subpatternId << 1)];
|
|
unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
|
|
|
|
if (matchBegin == offsetNoMatch)
|
|
return false;
|
|
|
|
ASSERT(matchBegin <= matchEnd);
|
|
|
|
if (matchBegin == matchEnd)
|
|
return false;
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierFixedCount:
|
|
// for quantityMaxCount == 1, could rewind.
|
|
input.setPos(backTrack->begin);
|
|
break;
|
|
|
|
case QuantifierGreedy:
|
|
if (backTrack->matchAmount) {
|
|
--backTrack->matchAmount;
|
|
input.rewind(matchEnd - matchBegin);
|
|
return true;
|
|
}
|
|
break;
|
|
|
|
case QuantifierNonGreedy:
|
|
if ((backTrack->matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
|
|
++backTrack->matchAmount;
|
|
return true;
|
|
}
|
|
input.setPos(backTrack->begin);
|
|
break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
|
|
{
|
|
if (term.capture()) {
|
|
unsigned subpatternId = term.atom.subpatternId;
|
|
output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin - term.inputPosition;
|
|
output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd - term.inputPosition;
|
|
}
|
|
}
|
|
void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
|
|
{
|
|
unsigned firstSubpatternId = term.atom.subpatternId;
|
|
unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
|
|
context->restoreOutput(output, firstSubpatternId, count);
|
|
}
|
|
JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
|
|
{
|
|
while (backTrack->matchAmount) {
|
|
ParenthesesDisjunctionContext* context = backTrack->lastContext;
|
|
|
|
JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
|
|
if (result == JSRegExpMatch)
|
|
return JSRegExpMatch;
|
|
|
|
resetMatches(term, context);
|
|
popParenthesesDisjunctionContext(backTrack);
|
|
freeParenthesesDisjunctionContext(context);
|
|
|
|
if (result != JSRegExpNoMatch)
|
|
return result;
|
|
}
|
|
|
|
return JSRegExpNoMatch;
|
|
}
|
|
|
|
bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
|
|
ASSERT(term.atom.quantityMaxCount == 1);
|
|
|
|
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierGreedy: {
|
|
// set this speculatively; if we get to the parens end this will be true.
|
|
backTrack->begin = input.getPos();
|
|
break;
|
|
}
|
|
case QuantifierNonGreedy: {
|
|
backTrack->begin = notFound;
|
|
context->term += term.atom.parenthesesWidth;
|
|
return true;
|
|
}
|
|
case QuantifierFixedCount:
|
|
break;
|
|
}
|
|
|
|
if (term.capture()) {
|
|
unsigned subpatternId = term.atom.subpatternId;
|
|
output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
|
|
ASSERT(term.atom.quantityMaxCount == 1);
|
|
|
|
if (term.capture()) {
|
|
unsigned subpatternId = term.atom.subpatternId;
|
|
output[(subpatternId << 1) + 1] = input.getPos() - term.inputPosition;
|
|
}
|
|
|
|
if (term.atom.quantityType == QuantifierFixedCount)
|
|
return true;
|
|
|
|
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
|
|
return backTrack->begin != input.getPos();
|
|
}
|
|
|
|
bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
|
|
ASSERT(term.atom.quantityMaxCount == 1);
|
|
|
|
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
|
|
|
|
if (term.capture()) {
|
|
unsigned subpatternId = term.atom.subpatternId;
|
|
output[(subpatternId << 1)] = offsetNoMatch;
|
|
output[(subpatternId << 1) + 1] = offsetNoMatch;
|
|
}
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierGreedy:
|
|
// if we backtrack to this point, there is another chance - try matching nothing.
|
|
ASSERT(backTrack->begin != notFound);
|
|
backTrack->begin = notFound;
|
|
context->term += term.atom.parenthesesWidth;
|
|
return true;
|
|
case QuantifierNonGreedy:
|
|
ASSERT(backTrack->begin != notFound);
|
|
FALLTHROUGH;
|
|
case QuantifierFixedCount:
|
|
break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
|
|
ASSERT(term.atom.quantityMaxCount == 1);
|
|
|
|
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierGreedy:
|
|
if (backTrack->begin == notFound) {
|
|
context->term -= term.atom.parenthesesWidth;
|
|
return false;
|
|
}
|
|
FALLTHROUGH;
|
|
case QuantifierNonGreedy:
|
|
if (backTrack->begin == notFound) {
|
|
backTrack->begin = input.getPos();
|
|
if (term.capture()) {
|
|
// Technically this access to inputPosition should be accessing the begin term's
|
|
// inputPosition, but for repeats other than fixed these values should be
|
|
// the same anyway! (We don't pre-check for greedy or non-greedy matches.)
|
|
ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
|
|
ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
|
|
unsigned subpatternId = term.atom.subpatternId;
|
|
output[subpatternId << 1] = input.getPos() - term.inputPosition;
|
|
}
|
|
context->term -= term.atom.parenthesesWidth;
|
|
return true;
|
|
}
|
|
FALLTHROUGH;
|
|
case QuantifierFixedCount:
|
|
break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
|
|
ASSERT(term.atom.quantityType == QuantifierGreedy);
|
|
ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
|
|
ASSERT(!term.capture());
|
|
|
|
BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
|
|
backTrack->begin = input.getPos();
|
|
return true;
|
|
}
|
|
|
|
bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
|
|
|
|
BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
|
|
// Empty match is a failed match.
|
|
if (backTrack->begin == input.getPos())
|
|
return false;
|
|
|
|
// Successful match! Okay, what's next? - loop around and try to match more!
|
|
context->term -= (term.atom.parenthesesWidth + 1);
|
|
return true;
|
|
}
|
|
|
|
bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
|
|
ASSERT(term.atom.quantityType == QuantifierGreedy);
|
|
ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
|
|
ASSERT(!term.capture());
|
|
|
|
// If we backtrack to this point, we have failed to match this iteration of the parens.
|
|
// Since this is greedy / zero minimum a failed is also accepted as a match!
|
|
context->term += term.atom.parenthesesWidth;
|
|
return true;
|
|
}
|
|
|
|
bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
|
|
{
|
|
// 'Terminal' parentheses are at the end of the regex, and as such a match past end
|
|
// should always be returned as a successful match - we should never backtrack to here.
|
|
RELEASE_ASSERT_NOT_REACHED();
|
|
return false;
|
|
}
|
|
|
|
bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
|
|
ASSERT(term.atom.quantityMaxCount == 1);
|
|
|
|
BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
|
|
|
|
backTrack->begin = input.getPos();
|
|
return true;
|
|
}
|
|
|
|
bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
|
|
ASSERT(term.atom.quantityMaxCount == 1);
|
|
|
|
BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
|
|
|
|
input.setPos(backTrack->begin);
|
|
|
|
// We've reached the end of the parens; if they are inverted, this is failure.
|
|
if (term.invert()) {
|
|
context->term -= term.atom.parenthesesWidth;
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
|
|
ASSERT(term.atom.quantityMaxCount == 1);
|
|
|
|
// We've failed to match parens; if they are inverted, this is win!
|
|
if (term.invert()) {
|
|
context->term += term.atom.parenthesesWidth;
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
|
|
ASSERT(term.atom.quantityMaxCount == 1);
|
|
|
|
BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
|
|
|
|
input.setPos(backTrack->begin);
|
|
|
|
context->term -= term.atom.parenthesesWidth;
|
|
return false;
|
|
}
|
|
|
|
JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
|
|
|
|
BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
|
|
ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
|
|
|
|
backTrack->begin = input.getPos();
|
|
backTrack->matchAmount = 0;
|
|
backTrack->lastContext = nullptr;
|
|
|
|
ASSERT(term.atom.quantityType != QuantifierFixedCount || term.atom.quantityMinCount == term.atom.quantityMaxCount);
|
|
|
|
unsigned minimumMatchCount = term.atom.quantityMinCount;
|
|
JSRegExpResult fixedMatchResult;
|
|
|
|
// Handle fixed matches and the minimum part of a variable length match.
|
|
if (minimumMatchCount) {
|
|
// While we haven't yet reached our fixed limit,
|
|
while (backTrack->matchAmount < minimumMatchCount) {
|
|
// Try to do a match, and it it succeeds, add it to the list.
|
|
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
|
|
fixedMatchResult = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
|
|
if (fixedMatchResult == JSRegExpMatch)
|
|
appendParenthesesDisjunctionContext(backTrack, context);
|
|
else {
|
|
// The match failed; try to find an alternate point to carry on from.
|
|
resetMatches(term, context);
|
|
freeParenthesesDisjunctionContext(context);
|
|
|
|
if (fixedMatchResult != JSRegExpNoMatch)
|
|
return fixedMatchResult;
|
|
JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
|
|
if (backtrackResult != JSRegExpMatch)
|
|
return backtrackResult;
|
|
}
|
|
}
|
|
|
|
ParenthesesDisjunctionContext* context = backTrack->lastContext;
|
|
recordParenthesesMatch(term, context);
|
|
}
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierFixedCount: {
|
|
ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
|
|
return JSRegExpMatch;
|
|
}
|
|
|
|
case QuantifierGreedy: {
|
|
while (backTrack->matchAmount < term.atom.quantityMaxCount) {
|
|
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
|
|
JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
|
|
if (result == JSRegExpMatch)
|
|
appendParenthesesDisjunctionContext(backTrack, context);
|
|
else {
|
|
resetMatches(term, context);
|
|
freeParenthesesDisjunctionContext(context);
|
|
|
|
if (result != JSRegExpNoMatch)
|
|
return result;
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (backTrack->matchAmount) {
|
|
ParenthesesDisjunctionContext* context = backTrack->lastContext;
|
|
recordParenthesesMatch(term, context);
|
|
}
|
|
return JSRegExpMatch;
|
|
}
|
|
|
|
case QuantifierNonGreedy:
|
|
return JSRegExpMatch;
|
|
}
|
|
|
|
RELEASE_ASSERT_NOT_REACHED();
|
|
return JSRegExpErrorNoMatch;
|
|
}
|
|
|
|
// Rules for backtracking differ depending on whether this is greedy or non-greedy.
|
|
//
|
|
// Greedy matches never should try just adding more - you should already have done
|
|
// the 'more' cases. Always backtrack, at least a leetle bit. However cases where
|
|
// you backtrack an item off the list needs checking, since we'll never have matched
|
|
// the one less case. Tracking forwards, still add as much as possible.
|
|
//
|
|
// Non-greedy, we've already done the one less case, so don't match on popping.
|
|
// We haven't done the one more case, so always try to add that.
|
|
//
|
|
JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
|
|
|
|
BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
|
|
ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
|
|
|
|
switch (term.atom.quantityType) {
|
|
case QuantifierFixedCount: {
|
|
ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
|
|
|
|
ParenthesesDisjunctionContext* context = nullptr;
|
|
JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
|
|
|
|
if (result != JSRegExpMatch)
|
|
return result;
|
|
|
|
// While we haven't yet reached our fixed limit,
|
|
while (backTrack->matchAmount < term.atom.quantityMaxCount) {
|
|
// Try to do a match, and it it succeeds, add it to the list.
|
|
context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
|
|
result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
|
|
|
|
if (result == JSRegExpMatch)
|
|
appendParenthesesDisjunctionContext(backTrack, context);
|
|
else {
|
|
// The match failed; try to find an alternate point to carry on from.
|
|
resetMatches(term, context);
|
|
freeParenthesesDisjunctionContext(context);
|
|
|
|
if (result != JSRegExpNoMatch)
|
|
return result;
|
|
JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
|
|
if (backtrackResult != JSRegExpMatch)
|
|
return backtrackResult;
|
|
}
|
|
}
|
|
|
|
ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
|
|
context = backTrack->lastContext;
|
|
recordParenthesesMatch(term, context);
|
|
return JSRegExpMatch;
|
|
}
|
|
|
|
case QuantifierGreedy: {
|
|
if (!backTrack->matchAmount)
|
|
return JSRegExpNoMatch;
|
|
|
|
ParenthesesDisjunctionContext* context = backTrack->lastContext;
|
|
JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
|
|
if (result == JSRegExpMatch) {
|
|
while (backTrack->matchAmount < term.atom.quantityMaxCount) {
|
|
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
|
|
JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
|
|
if (parenthesesResult == JSRegExpMatch)
|
|
appendParenthesesDisjunctionContext(backTrack, context);
|
|
else {
|
|
resetMatches(term, context);
|
|
freeParenthesesDisjunctionContext(context);
|
|
|
|
if (parenthesesResult != JSRegExpNoMatch)
|
|
return parenthesesResult;
|
|
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
resetMatches(term, context);
|
|
popParenthesesDisjunctionContext(backTrack);
|
|
freeParenthesesDisjunctionContext(context);
|
|
|
|
if (backTrack->matchAmount < term.atom.quantityMinCount) {
|
|
while (backTrack->matchAmount) {
|
|
context = backTrack->lastContext;
|
|
resetMatches(term, context);
|
|
popParenthesesDisjunctionContext(backTrack);
|
|
freeParenthesesDisjunctionContext(context);
|
|
}
|
|
|
|
input.setPos(backTrack->begin);
|
|
return result;
|
|
}
|
|
|
|
if (result != JSRegExpNoMatch)
|
|
return result;
|
|
}
|
|
|
|
if (backTrack->matchAmount) {
|
|
ParenthesesDisjunctionContext* context = backTrack->lastContext;
|
|
recordParenthesesMatch(term, context);
|
|
}
|
|
return JSRegExpMatch;
|
|
}
|
|
|
|
case QuantifierNonGreedy: {
|
|
// If we've not reached the limit, try to add one more match.
|
|
if (backTrack->matchAmount < term.atom.quantityMaxCount) {
|
|
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
|
|
JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
|
|
if (result == JSRegExpMatch) {
|
|
appendParenthesesDisjunctionContext(backTrack, context);
|
|
recordParenthesesMatch(term, context);
|
|
return JSRegExpMatch;
|
|
}
|
|
|
|
resetMatches(term, context);
|
|
freeParenthesesDisjunctionContext(context);
|
|
|
|
if (result != JSRegExpNoMatch)
|
|
return result;
|
|
}
|
|
|
|
// Nope - okay backtrack looking for an alternative.
|
|
while (backTrack->matchAmount) {
|
|
ParenthesesDisjunctionContext* context = backTrack->lastContext;
|
|
JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
|
|
if (result == JSRegExpMatch) {
|
|
// successful backtrack! we're back in the game!
|
|
if (backTrack->matchAmount) {
|
|
context = backTrack->lastContext;
|
|
recordParenthesesMatch(term, context);
|
|
}
|
|
return JSRegExpMatch;
|
|
}
|
|
|
|
// pop a match off the stack
|
|
resetMatches(term, context);
|
|
popParenthesesDisjunctionContext(backTrack);
|
|
freeParenthesesDisjunctionContext(context);
|
|
|
|
if (result != JSRegExpNoMatch)
|
|
return result;
|
|
}
|
|
|
|
return JSRegExpNoMatch;
|
|
}
|
|
}
|
|
|
|
RELEASE_ASSERT_NOT_REACHED();
|
|
return JSRegExpErrorNoMatch;
|
|
}
|
|
|
|
bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
|
|
{
|
|
UNUSED_PARAM(term);
|
|
|
|
if (pattern->dotAll()) {
|
|
context->matchBegin = startOffset;
|
|
context->matchEnd = input.end();
|
|
return true;
|
|
}
|
|
|
|
unsigned matchBegin = context->matchBegin;
|
|
|
|
if (matchBegin > startOffset) {
|
|
for (matchBegin--; true; matchBegin--) {
|
|
if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
|
|
++matchBegin;
|
|
break;
|
|
}
|
|
|
|
if (matchBegin == startOffset)
|
|
break;
|
|
}
|
|
}
|
|
|
|
unsigned matchEnd = input.getPos();
|
|
|
|
for (; (matchEnd != input.end())
|
|
&& (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
|
|
|
|
if (((matchBegin && term.anchors.m_bol)
|
|
|| ((matchEnd != input.end()) && term.anchors.m_eol))
|
|
&& !pattern->multiline())
|
|
return false;
|
|
|
|
context->matchBegin = matchBegin;
|
|
context->matchEnd = matchEnd;
|
|
return true;
|
|
}
|
|
|
|
#define MATCH_NEXT() { ++context->term; goto matchAgain; }
|
|
#define BACKTRACK() { --context->term; goto backtrack; }
|
|
#define currentTerm() (disjunction->terms[context->term])
|
|
JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
|
|
{
|
|
if (UNLIKELY(!isSafeToRecurse()))
|
|
return JSRegExpErrorNoMemory;
|
|
|
|
if (!--remainingMatchCount)
|
|
return JSRegExpErrorHitLimit;
|
|
|
|
if (btrack)
|
|
BACKTRACK();
|
|
|
|
context->matchBegin = input.getPos();
|
|
context->term = 0;
|
|
|
|
matchAgain:
|
|
ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
|
|
|
|
switch (currentTerm().type) {
|
|
case ByteTerm::TypeSubpatternBegin:
|
|
MATCH_NEXT();
|
|
case ByteTerm::TypeSubpatternEnd:
|
|
context->matchEnd = input.getPos();
|
|
return JSRegExpMatch;
|
|
|
|
case ByteTerm::TypeBodyAlternativeBegin:
|
|
MATCH_NEXT();
|
|
case ByteTerm::TypeBodyAlternativeDisjunction:
|
|
case ByteTerm::TypeBodyAlternativeEnd:
|
|
context->matchEnd = input.getPos();
|
|
return JSRegExpMatch;
|
|
|
|
case ByteTerm::TypeAlternativeBegin:
|
|
MATCH_NEXT();
|
|
case ByteTerm::TypeAlternativeDisjunction:
|
|
case ByteTerm::TypeAlternativeEnd: {
|
|
int offset = currentTerm().alternative.end;
|
|
BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
|
|
backTrack->offset = offset;
|
|
context->term += offset;
|
|
MATCH_NEXT();
|
|
}
|
|
|
|
case ByteTerm::TypeAssertionBOL:
|
|
if (matchAssertionBOL(currentTerm()))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeAssertionEOL:
|
|
if (matchAssertionEOL(currentTerm()))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeAssertionWordBoundary:
|
|
if (matchAssertionWordBoundary(currentTerm()))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
|
|
case ByteTerm::TypePatternCharacterOnce:
|
|
case ByteTerm::TypePatternCharacterFixed: {
|
|
if (unicode) {
|
|
if (!U_IS_BMP(currentTerm().atom.patternCharacter)) {
|
|
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
|
|
if (!checkSurrogatePair(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 2 * matchAmount)) {
|
|
BACKTRACK();
|
|
}
|
|
}
|
|
MATCH_NEXT();
|
|
}
|
|
}
|
|
unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
|
|
|
|
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
|
|
if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) {
|
|
input.setPos(position);
|
|
BACKTRACK();
|
|
}
|
|
}
|
|
MATCH_NEXT();
|
|
}
|
|
case ByteTerm::TypePatternCharacterGreedy: {
|
|
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
|
|
unsigned matchAmount = 0;
|
|
unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
|
|
while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
|
|
if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
|
|
input.setPos(position);
|
|
break;
|
|
}
|
|
++matchAmount;
|
|
position = input.getPos();
|
|
}
|
|
backTrack->matchAmount = matchAmount;
|
|
|
|
MATCH_NEXT();
|
|
}
|
|
case ByteTerm::TypePatternCharacterNonGreedy: {
|
|
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
|
|
backTrack->begin = input.getPos();
|
|
backTrack->matchAmount = 0;
|
|
MATCH_NEXT();
|
|
}
|
|
|
|
case ByteTerm::TypePatternCasedCharacterOnce:
|
|
case ByteTerm::TypePatternCasedCharacterFixed: {
|
|
if (unicode) {
|
|
// Case insensitive matching of unicode characters is handled as TypeCharacterClass.
|
|
ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter));
|
|
|
|
unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
|
|
|
|
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
|
|
if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) {
|
|
input.setPos(position);
|
|
BACKTRACK();
|
|
}
|
|
}
|
|
MATCH_NEXT();
|
|
}
|
|
|
|
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
|
|
if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
|
|
BACKTRACK();
|
|
}
|
|
MATCH_NEXT();
|
|
}
|
|
case ByteTerm::TypePatternCasedCharacterGreedy: {
|
|
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
|
|
|
|
// Case insensitive matching of unicode characters is handled as TypeCharacterClass.
|
|
ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
|
|
|
|
unsigned matchAmount = 0;
|
|
while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
|
|
if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
|
|
input.uncheckInput(1);
|
|
break;
|
|
}
|
|
++matchAmount;
|
|
}
|
|
backTrack->matchAmount = matchAmount;
|
|
|
|
MATCH_NEXT();
|
|
}
|
|
case ByteTerm::TypePatternCasedCharacterNonGreedy: {
|
|
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
|
|
|
|
// Case insensitive matching of unicode characters is handled as TypeCharacterClass.
|
|
ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
|
|
|
|
backTrack->matchAmount = 0;
|
|
MATCH_NEXT();
|
|
}
|
|
|
|
case ByteTerm::TypeCharacterClass:
|
|
if (matchCharacterClass(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeBackReference:
|
|
if (matchBackReference(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParenthesesSubpattern: {
|
|
JSRegExpResult result = matchParentheses(currentTerm(), context);
|
|
|
|
if (result == JSRegExpMatch) {
|
|
MATCH_NEXT();
|
|
} else if (result != JSRegExpNoMatch)
|
|
return result;
|
|
|
|
BACKTRACK();
|
|
}
|
|
case ByteTerm::TypeParenthesesSubpatternOnceBegin:
|
|
if (matchParenthesesOnceBegin(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParenthesesSubpatternOnceEnd:
|
|
if (matchParenthesesOnceEnd(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
|
|
if (matchParenthesesTerminalBegin(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
|
|
if (matchParenthesesTerminalEnd(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParentheticalAssertionBegin:
|
|
if (matchParentheticalAssertionBegin(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParentheticalAssertionEnd:
|
|
if (matchParentheticalAssertionEnd(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
|
|
case ByteTerm::TypeCheckInput:
|
|
if (input.checkInput(currentTerm().checkInputCount))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
|
|
case ByteTerm::TypeUncheckInput:
|
|
input.uncheckInput(currentTerm().checkInputCount);
|
|
MATCH_NEXT();
|
|
|
|
case ByteTerm::TypeDotStarEnclosure:
|
|
if (matchDotStarEnclosure(currentTerm(), context))
|
|
return JSRegExpMatch;
|
|
BACKTRACK();
|
|
}
|
|
|
|
// We should never fall-through to here.
|
|
RELEASE_ASSERT_NOT_REACHED();
|
|
|
|
backtrack:
|
|
ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
|
|
|
|
switch (currentTerm().type) {
|
|
case ByteTerm::TypeSubpatternBegin:
|
|
return JSRegExpNoMatch;
|
|
case ByteTerm::TypeSubpatternEnd:
|
|
RELEASE_ASSERT_NOT_REACHED();
|
|
|
|
case ByteTerm::TypeBodyAlternativeBegin:
|
|
case ByteTerm::TypeBodyAlternativeDisjunction: {
|
|
int offset = currentTerm().alternative.next;
|
|
context->term += offset;
|
|
if (offset > 0)
|
|
MATCH_NEXT();
|
|
|
|
if (input.atEnd() || pattern->sticky())
|
|
return JSRegExpNoMatch;
|
|
|
|
input.next();
|
|
|
|
context->matchBegin = input.getPos();
|
|
|
|
if (currentTerm().alternative.onceThrough)
|
|
context->term += currentTerm().alternative.next;
|
|
|
|
MATCH_NEXT();
|
|
}
|
|
case ByteTerm::TypeBodyAlternativeEnd:
|
|
RELEASE_ASSERT_NOT_REACHED();
|
|
|
|
case ByteTerm::TypeAlternativeBegin:
|
|
case ByteTerm::TypeAlternativeDisjunction: {
|
|
int offset = currentTerm().alternative.next;
|
|
context->term += offset;
|
|
if (offset > 0)
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
}
|
|
case ByteTerm::TypeAlternativeEnd: {
|
|
// We should never backtrack back into an alternative of the main body of the regex.
|
|
BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
|
|
unsigned offset = backTrack->offset;
|
|
context->term -= offset;
|
|
BACKTRACK();
|
|
}
|
|
|
|
case ByteTerm::TypeAssertionBOL:
|
|
case ByteTerm::TypeAssertionEOL:
|
|
case ByteTerm::TypeAssertionWordBoundary:
|
|
BACKTRACK();
|
|
|
|
case ByteTerm::TypePatternCharacterOnce:
|
|
case ByteTerm::TypePatternCharacterFixed:
|
|
case ByteTerm::TypePatternCharacterGreedy:
|
|
case ByteTerm::TypePatternCharacterNonGreedy:
|
|
if (backtrackPatternCharacter(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypePatternCasedCharacterOnce:
|
|
case ByteTerm::TypePatternCasedCharacterFixed:
|
|
case ByteTerm::TypePatternCasedCharacterGreedy:
|
|
case ByteTerm::TypePatternCasedCharacterNonGreedy:
|
|
if (backtrackPatternCasedCharacter(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeCharacterClass:
|
|
if (backtrackCharacterClass(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeBackReference:
|
|
if (backtrackBackReference(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParenthesesSubpattern: {
|
|
JSRegExpResult result = backtrackParentheses(currentTerm(), context);
|
|
|
|
if (result == JSRegExpMatch) {
|
|
MATCH_NEXT();
|
|
} else if (result != JSRegExpNoMatch)
|
|
return result;
|
|
|
|
BACKTRACK();
|
|
}
|
|
case ByteTerm::TypeParenthesesSubpatternOnceBegin:
|
|
if (backtrackParenthesesOnceBegin(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParenthesesSubpatternOnceEnd:
|
|
if (backtrackParenthesesOnceEnd(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
|
|
if (backtrackParenthesesTerminalBegin(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
|
|
if (backtrackParenthesesTerminalEnd(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParentheticalAssertionBegin:
|
|
if (backtrackParentheticalAssertionBegin(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
case ByteTerm::TypeParentheticalAssertionEnd:
|
|
if (backtrackParentheticalAssertionEnd(currentTerm(), context))
|
|
MATCH_NEXT();
|
|
BACKTRACK();
|
|
|
|
case ByteTerm::TypeCheckInput:
|
|
input.uncheckInput(currentTerm().checkInputCount);
|
|
BACKTRACK();
|
|
|
|
case ByteTerm::TypeUncheckInput:
|
|
input.checkInput(currentTerm().checkInputCount);
|
|
BACKTRACK();
|
|
|
|
case ByteTerm::TypeDotStarEnclosure:
|
|
RELEASE_ASSERT_NOT_REACHED();
|
|
}
|
|
|
|
RELEASE_ASSERT_NOT_REACHED();
|
|
return JSRegExpErrorNoMatch;
|
|
}
|
|
|
|
JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
|
|
{
|
|
JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
|
|
|
|
if (result == JSRegExpMatch) {
|
|
while (context->matchBegin == context->matchEnd) {
|
|
result = matchDisjunction(disjunction, context, true);
|
|
if (result != JSRegExpMatch)
|
|
return result;
|
|
}
|
|
return JSRegExpMatch;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
unsigned interpret()
|
|
{
|
|
// FIXME: https://bugs.webkit.org/show_bug.cgi?id=195970
|
|
// [Yarr Interpreter] The interpreter doesn't have checks for stack overflow due to deep recursion
|
|
if (!input.isAvailableInput(0))
|
|
return offsetNoMatch;
|
|
|
|
if (pattern->m_lock)
|
|
pattern->m_lock->lock();
|
|
|
|
for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
|
|
output[i << 1] = offsetNoMatch;
|
|
|
|
allocatorPool = pattern->m_allocator->startAllocator();
|
|
RELEASE_ASSERT(allocatorPool);
|
|
|
|
DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
|
|
|
|
JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
|
|
if (result == JSRegExpMatch) {
|
|
output[0] = context->matchBegin;
|
|
output[1] = context->matchEnd;
|
|
}
|
|
|
|
freeDisjunctionContext(context);
|
|
|
|
pattern->m_allocator->stopAllocator();
|
|
|
|
ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
|
|
|
|
if (pattern->m_lock)
|
|
pattern->m_lock->unlock();
|
|
|
|
return output[0];
|
|
}
|
|
|
|
Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
|
|
: pattern(pattern)
|
|
, unicode(pattern->unicode())
|
|
, output(output)
|
|
, input(input, start, length, pattern->unicode())
|
|
, startOffset(start)
|
|
, remainingMatchCount(matchLimit)
|
|
{
|
|
}
|
|
|
|
private:
|
|
inline bool isSafeToRecurse() { return m_stackCheck.isSafeToRecurse(); }
|
|
|
|
BytecodePattern* pattern;
|
|
bool unicode;
|
|
unsigned* output;
|
|
InputStream input;
|
|
StackCheck m_stackCheck;
|
|
WTF::BumpPointerPool* allocatorPool { nullptr };
|
|
unsigned startOffset;
|
|
unsigned remainingMatchCount;
|
|
};
|
|
|
|
class ByteCompiler {
|
|
struct ParenthesesStackEntry {
|
|
unsigned beginTerm;
|
|
unsigned savedAlternativeIndex;
|
|
ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
|
|
: beginTerm(beginTerm)
|
|
, savedAlternativeIndex(savedAlternativeIndex)
|
|
{
|
|
}
|
|
};
|
|
|
|
public:
|
|
ByteCompiler(YarrPattern& pattern)
|
|
: m_pattern(pattern)
|
|
{
|
|
}
|
|
|
|
std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator, ConcurrentJSLock* lock, ErrorCode& errorCode)
|
|
{
|
|
if (UNLIKELY(!isSafeToRecurse())) {
|
|
errorCode = ErrorCode::TooManyDisjunctions;
|
|
return nullptr;
|
|
}
|
|
|
|
regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
|
|
if (auto error = emitDisjunction(m_pattern.m_body, 0, 0)) {
|
|
errorCode = error.value();
|
|
return nullptr;
|
|
}
|
|
regexEnd();
|
|
|
|
#ifndef NDEBUG
|
|
if (Options::dumpCompiledRegExpPatterns())
|
|
dumpDisjunction(m_bodyDisjunction.get());
|
|
#endif
|
|
|
|
return makeUnique<BytecodePattern>(WTFMove(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator, lock);
|
|
}
|
|
|
|
void checkInput(unsigned count)
|
|
{
|
|
m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
|
|
}
|
|
|
|
void uncheckInput(unsigned count)
|
|
{
|
|
m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
|
|
}
|
|
|
|
void assertionBOL(unsigned inputPosition)
|
|
{
|
|
m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
|
|
}
|
|
|
|
void assertionEOL(unsigned inputPosition)
|
|
{
|
|
m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
|
|
}
|
|
|
|
void assertionWordBoundary(bool invert, unsigned inputPosition)
|
|
{
|
|
m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
|
|
}
|
|
|
|
void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
|
|
{
|
|
if (m_pattern.ignoreCase()) {
|
|
UChar32 lo = u_tolower(ch);
|
|
UChar32 hi = u_toupper(ch);
|
|
|
|
if (lo != hi) {
|
|
m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityMaxCount, quantityType));
|
|
return;
|
|
}
|
|
}
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityMaxCount, quantityType));
|
|
}
|
|
|
|
void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
|
|
{
|
|
m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
|
|
|
|
m_bodyDisjunction->terms.last().atom.quantityMaxCount = quantityMaxCount.unsafeGet();
|
|
m_bodyDisjunction->terms.last().atom.quantityType = quantityType;
|
|
m_bodyDisjunction->terms.last().frameLocation = frameLocation;
|
|
}
|
|
|
|
void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
|
|
{
|
|
ASSERT(subpatternId);
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
|
|
|
|
m_bodyDisjunction->terms.last().atom.quantityMaxCount = quantityMaxCount.unsafeGet();
|
|
m_bodyDisjunction->terms.last().atom.quantityType = quantityType;
|
|
m_bodyDisjunction->terms.last().frameLocation = frameLocation;
|
|
}
|
|
|
|
void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
|
|
{
|
|
unsigned beginTerm = m_bodyDisjunction->terms.size();
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
|
|
m_bodyDisjunction->terms.last().frameLocation = frameLocation;
|
|
m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
|
|
m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
|
|
|
|
m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
|
|
m_currentAlternativeIndex = beginTerm + 1;
|
|
}
|
|
|
|
void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
|
|
{
|
|
unsigned beginTerm = m_bodyDisjunction->terms.size();
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
|
|
m_bodyDisjunction->terms.last().frameLocation = frameLocation;
|
|
m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
|
|
m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
|
|
|
|
m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
|
|
m_currentAlternativeIndex = beginTerm + 1;
|
|
}
|
|
|
|
void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
|
|
{
|
|
// Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
|
|
// then fix this up at the end! - simplifying this should make it much clearer.
|
|
// https://bugs.webkit.org/show_bug.cgi?id=50136
|
|
|
|
unsigned beginTerm = m_bodyDisjunction->terms.size();
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
|
|
m_bodyDisjunction->terms.last().frameLocation = frameLocation;
|
|
m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
|
|
m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
|
|
|
|
m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
|
|
m_currentAlternativeIndex = beginTerm + 1;
|
|
}
|
|
|
|
void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
|
|
{
|
|
unsigned beginTerm = m_bodyDisjunction->terms.size();
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
|
|
m_bodyDisjunction->terms.last().frameLocation = frameLocation;
|
|
m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
|
|
m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
|
|
|
|
m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
|
|
m_currentAlternativeIndex = beginTerm + 1;
|
|
}
|
|
|
|
void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
|
|
{
|
|
unsigned beginTerm = popParenthesesStack();
|
|
closeAlternative(beginTerm + 1);
|
|
unsigned endTerm = m_bodyDisjunction->terms.size();
|
|
|
|
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
|
|
|
|
bool invert = m_bodyDisjunction->terms[beginTerm].invert();
|
|
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
|
|
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
|
|
m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
|
|
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
|
|
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
|
|
m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
|
|
m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
|
|
}
|
|
|
|
void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
|
|
{
|
|
m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
|
|
}
|
|
|
|
unsigned popParenthesesStack()
|
|
{
|
|
ASSERT(m_parenthesesStack.size());
|
|
unsigned beginTerm = m_parenthesesStack.last().beginTerm;
|
|
m_currentAlternativeIndex = m_parenthesesStack.last().savedAlternativeIndex;
|
|
m_parenthesesStack.removeLast();
|
|
|
|
ASSERT(beginTerm < m_bodyDisjunction->terms.size());
|
|
ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
|
|
|
|
return beginTerm;
|
|
}
|
|
|
|
void closeAlternative(unsigned beginTerm)
|
|
{
|
|
unsigned origBeginTerm = beginTerm;
|
|
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
|
|
unsigned endIndex = m_bodyDisjunction->terms.size();
|
|
|
|
unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
|
|
|
|
if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
|
|
m_bodyDisjunction->terms.remove(beginTerm);
|
|
else {
|
|
while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
|
|
beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
|
|
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
|
|
m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
|
|
m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
|
|
}
|
|
|
|
m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
|
|
m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
|
|
}
|
|
}
|
|
|
|
void closeBodyAlternative()
|
|
{
|
|
unsigned beginTerm = 0;
|
|
unsigned origBeginTerm = 0;
|
|
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
|
|
unsigned endIndex = m_bodyDisjunction->terms.size();
|
|
|
|
unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
|
|
|
|
while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
|
|
beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
|
|
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
|
|
m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
|
|
m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
|
|
}
|
|
|
|
m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
|
|
m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
|
|
}
|
|
|
|
void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType, unsigned callFrameSize = 0)
|
|
{
|
|
unsigned beginTerm = popParenthesesStack();
|
|
closeAlternative(beginTerm + 1);
|
|
unsigned endTerm = m_bodyDisjunction->terms.size();
|
|
|
|
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
|
|
|
|
ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
|
|
|
|
bool capture = parenthesesBegin.capture();
|
|
unsigned subpatternId = parenthesesBegin.atom.subpatternId;
|
|
|
|
unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
|
|
auto parenthesesDisjunction = makeUnique<ByteDisjunction>(numSubpatterns, callFrameSize);
|
|
|
|
unsigned firstTermInParentheses = beginTerm + 1;
|
|
parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2);
|
|
|
|
parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
|
|
for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses)
|
|
parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
|
|
parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
|
|
|
|
m_bodyDisjunction->terms.shrink(beginTerm);
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition));
|
|
m_allParenthesesInfo.append(WTFMove(parenthesesDisjunction));
|
|
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
|
|
m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
|
|
}
|
|
|
|
void atomParenthesesOnceEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
|
|
{
|
|
unsigned beginTerm = popParenthesesStack();
|
|
closeAlternative(beginTerm + 1);
|
|
unsigned endTerm = m_bodyDisjunction->terms.size();
|
|
|
|
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
|
|
|
|
bool capture = m_bodyDisjunction->terms[beginTerm].capture();
|
|
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
|
|
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
|
|
m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
|
|
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
|
|
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
|
|
m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
|
|
m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
|
|
m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
|
|
}
|
|
|
|
void atomParenthesesTerminalEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
|
|
{
|
|
unsigned beginTerm = popParenthesesStack();
|
|
closeAlternative(beginTerm + 1);
|
|
unsigned endTerm = m_bodyDisjunction->terms.size();
|
|
|
|
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
|
|
|
|
bool capture = m_bodyDisjunction->terms[beginTerm].capture();
|
|
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
|
|
|
|
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
|
|
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
|
|
m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
|
|
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
|
|
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
|
|
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
|
|
m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
|
|
m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
|
|
m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
|
|
}
|
|
|
|
void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
|
|
{
|
|
m_bodyDisjunction = makeUnique<ByteDisjunction>(numSubpatterns, callFrameSize);
|
|
m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
|
|
m_bodyDisjunction->terms[0].frameLocation = 0;
|
|
m_currentAlternativeIndex = 0;
|
|
}
|
|
|
|
void regexEnd()
|
|
{
|
|
closeBodyAlternative();
|
|
}
|
|
|
|
void alternativeBodyDisjunction(bool onceThrough)
|
|
{
|
|
unsigned newAlternativeIndex = m_bodyDisjunction->terms.size();
|
|
m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
|
|
m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
|
|
|
|
m_currentAlternativeIndex = newAlternativeIndex;
|
|
}
|
|
|
|
void alternativeDisjunction()
|
|
{
|
|
unsigned newAlternativeIndex = m_bodyDisjunction->terms.size();
|
|
m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
|
|
m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
|
|
|
|
m_currentAlternativeIndex = newAlternativeIndex;
|
|
}
|
|
|
|
Optional<ErrorCode> WARN_UNUSED_RETURN emitDisjunction(PatternDisjunction* disjunction, Checked<unsigned, RecordOverflow> inputCountAlreadyChecked, unsigned parenthesesInputCountAlreadyChecked)
|
|
{
|
|
if (UNLIKELY(!isSafeToRecurse()))
|
|
return ErrorCode::TooManyDisjunctions;
|
|
|
|
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
|
|
auto currentCountAlreadyChecked = inputCountAlreadyChecked;
|
|
|
|
PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
|
|
|
|
if (alt) {
|
|
if (disjunction == m_pattern.m_body)
|
|
alternativeBodyDisjunction(alternative->onceThrough());
|
|
else
|
|
alternativeDisjunction();
|
|
}
|
|
|
|
unsigned minimumSize = alternative->m_minimumSize;
|
|
ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
|
|
unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
|
|
|
|
if (countToCheck) {
|
|
checkInput(countToCheck);
|
|
currentCountAlreadyChecked += countToCheck;
|
|
if (currentCountAlreadyChecked.hasOverflowed())
|
|
return ErrorCode::OffsetTooLarge;
|
|
}
|
|
|
|
for (auto& term : alternative->m_terms) {
|
|
switch (term.type) {
|
|
case PatternTerm::TypeAssertionBOL:
|
|
assertionBOL((currentCountAlreadyChecked - term.inputPosition).unsafeGet());
|
|
break;
|
|
|
|
case PatternTerm::TypeAssertionEOL:
|
|
assertionEOL((currentCountAlreadyChecked - term.inputPosition).unsafeGet());
|
|
break;
|
|
|
|
case PatternTerm::TypeAssertionWordBoundary:
|
|
assertionWordBoundary(term.invert(), (currentCountAlreadyChecked - term.inputPosition).unsafeGet());
|
|
break;
|
|
|
|
case PatternTerm::TypePatternCharacter:
|
|
atomPatternCharacter(term.patternCharacter, (currentCountAlreadyChecked - term.inputPosition).unsafeGet(), term.frameLocation, term.quantityMaxCount, term.quantityType);
|
|
break;
|
|
|
|
case PatternTerm::TypeCharacterClass:
|
|
atomCharacterClass(term.characterClass, term.invert(), (currentCountAlreadyChecked - term.inputPosition).unsafeGet(), term.frameLocation, term.quantityMaxCount, term.quantityType);
|
|
break;
|
|
|
|
case PatternTerm::TypeBackReference:
|
|
atomBackReference(term.backReferenceSubpatternId, (currentCountAlreadyChecked - term.inputPosition).unsafeGet(), term.frameLocation, term.quantityMaxCount, term.quantityType);
|
|
break;
|
|
|
|
case PatternTerm::TypeForwardReference:
|
|
break;
|
|
|
|
case PatternTerm::TypeParenthesesSubpattern: {
|
|
unsigned disjunctionAlreadyCheckedCount = 0;
|
|
if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
|
|
unsigned alternativeFrameLocation = term.frameLocation;
|
|
// For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
|
|
if (term.quantityType == QuantifierFixedCount)
|
|
disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
|
|
else
|
|
alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
|
|
unsigned delegateEndInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
|
|
atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
|
|
if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount))
|
|
return error;
|
|
atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType);
|
|
} else if (term.parentheses.isTerminal) {
|
|
unsigned delegateEndInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
|
|
atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesTerminal);
|
|
if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount))
|
|
return error;
|
|
atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType);
|
|
} else {
|
|
unsigned delegateEndInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
|
|
atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, 0);
|
|
if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0))
|
|
return error;
|
|
atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
|
|
}
|
|
break;
|
|
}
|
|
|
|
case PatternTerm::TypeParentheticalAssertion: {
|
|
unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
|
|
unsigned positiveInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
|
|
unsigned uncheckAmount = 0;
|
|
if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
|
|
uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
|
|
uncheckInput(uncheckAmount);
|
|
currentCountAlreadyChecked -= uncheckAmount;
|
|
if (currentCountAlreadyChecked.hasOverflowed())
|
|
return ErrorCode::OffsetTooLarge;
|
|
}
|
|
|
|
atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
|
|
if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount))
|
|
return error;
|
|
atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityMaxCount, term.quantityType);
|
|
if (uncheckAmount) {
|
|
checkInput(uncheckAmount);
|
|
currentCountAlreadyChecked += uncheckAmount;
|
|
if (currentCountAlreadyChecked.hasOverflowed())
|
|
return ErrorCode::OffsetTooLarge;
|
|
}
|
|
break;
|
|
}
|
|
|
|
case PatternTerm::TypeDotStarEnclosure:
|
|
assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return WTF::nullopt;
|
|
}
|
|
#ifndef NDEBUG
|
|
void dumpDisjunction(ByteDisjunction* disjunction, unsigned nesting = 0)
|
|
{
|
|
PrintStream& out = WTF::dataFile();
|
|
|
|
unsigned termIndexNest = 0;
|
|
|
|
if (!nesting) {
|
|
out.printf("ByteDisjunction(%p):\n", disjunction);
|
|
nesting = 1;
|
|
} else {
|
|
termIndexNest = nesting - 1;
|
|
nesting = 2;
|
|
}
|
|
|
|
auto outputTermIndexAndNest = [&](size_t index, unsigned termNesting) {
|
|
for (unsigned nestingDepth = 0; nestingDepth < termIndexNest; nestingDepth++)
|
|
out.print(" ");
|
|
out.printf("%4zu", index);
|
|
for (unsigned nestingDepth = 0; nestingDepth < termNesting; nestingDepth++)
|
|
out.print(" ");
|
|
};
|
|
|
|
auto dumpQuantity = [&](ByteTerm& term) {
|
|
if (term.atom.quantityType == QuantifierFixedCount && term.atom.quantityMinCount == 1 && term.atom.quantityMaxCount == 1)
|
|
return;
|
|
|
|
out.print(" {", term.atom.quantityMinCount);
|
|
if (term.atom.quantityMinCount != term.atom.quantityMaxCount) {
|
|
if (term.atom.quantityMaxCount == UINT_MAX)
|
|
out.print(",inf");
|
|
else
|
|
out.print(",", term.atom.quantityMaxCount);
|
|
}
|
|
out.print("}");
|
|
if (term.atom.quantityType == QuantifierGreedy)
|
|
out.print(" greedy");
|
|
else if (term.atom.quantityType == QuantifierNonGreedy)
|
|
out.print(" non-greedy");
|
|
};
|
|
|
|
auto dumpCaptured = [&](ByteTerm& term) {
|
|
if (term.capture())
|
|
out.print(" captured (#", term.atom.subpatternId, ")");
|
|
};
|
|
|
|
auto dumpInverted = [&](ByteTerm& term) {
|
|
if (term.invert())
|
|
out.print(" inverted");
|
|
};
|
|
|
|
auto dumpInputPosition = [&](ByteTerm& term) {
|
|
out.printf(" inputPosition %u", term.inputPosition);
|
|
};
|
|
|
|
auto dumpFrameLocation = [&](ByteTerm& term) {
|
|
out.printf(" frameLocation %u", term.frameLocation);
|
|
};
|
|
|
|
auto dumpCharacter = [&](ByteTerm& term) {
|
|
out.print(" ");
|
|
dumpUChar32(out, term.atom.patternCharacter);
|
|
};
|
|
|
|
auto dumpCharClass = [&](ByteTerm& term) {
|
|
out.print(" ");
|
|
dumpCharacterClass(out, &m_pattern, term.atom.characterClass);
|
|
};
|
|
|
|
for (size_t idx = 0; idx < disjunction->terms.size(); ++idx) {
|
|
ByteTerm term = disjunction->terms[idx];
|
|
|
|
bool outputNewline = true;
|
|
|
|
switch (term.type) {
|
|
case ByteTerm::TypeBodyAlternativeBegin:
|
|
outputTermIndexAndNest(idx, nesting++);
|
|
out.print("BodyAlternativeBegin");
|
|
if (term.alternative.onceThrough)
|
|
out.print(" onceThrough");
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeBodyAlternativeDisjunction:
|
|
outputTermIndexAndNest(idx, nesting - 1);
|
|
out.print("BodyAlternativeDisjunction");
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeBodyAlternativeEnd:
|
|
outputTermIndexAndNest(idx, --nesting);
|
|
out.print("BodyAlternativeEnd");
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeAlternativeBegin:
|
|
outputTermIndexAndNest(idx, nesting++);
|
|
out.print("AlternativeBegin");
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeAlternativeDisjunction:
|
|
outputTermIndexAndNest(idx, nesting - 1);
|
|
out.print("AlternativeDisjunction");
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeAlternativeEnd:
|
|
outputTermIndexAndNest(idx, --nesting);
|
|
out.print("AlternativeEnd");
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeSubpatternBegin:
|
|
outputTermIndexAndNest(idx, nesting++);
|
|
out.print("SubpatternBegin");
|
|
break;
|
|
case ByteTerm::TypeSubpatternEnd:
|
|
outputTermIndexAndNest(idx, --nesting);
|
|
out.print("SubpatternEnd");
|
|
break;
|
|
case ByteTerm::TypeAssertionBOL:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("AssertionBOL");
|
|
break;
|
|
case ByteTerm::TypeAssertionEOL:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("AssertionEOL");
|
|
break;
|
|
case ByteTerm::TypeAssertionWordBoundary:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("AssertionWordBoundary");
|
|
break;
|
|
case ByteTerm::TypePatternCharacterOnce:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("PatternCharacterOnce");
|
|
dumpInverted(term);
|
|
dumpInputPosition(term);
|
|
dumpFrameLocation(term);
|
|
dumpCharacter(term);
|
|
dumpQuantity(term);
|
|
break;
|
|
case ByteTerm::TypePatternCharacterFixed:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("PatternCharacterFixed");
|
|
dumpInverted(term);
|
|
dumpInputPosition(term);
|
|
dumpFrameLocation(term);
|
|
dumpCharacter(term);
|
|
out.print(" {", term.atom.quantityMinCount, "}");
|
|
break;
|
|
case ByteTerm::TypePatternCharacterGreedy:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("PatternCharacterGreedy");
|
|
dumpInverted(term);
|
|
dumpInputPosition(term);
|
|
dumpFrameLocation(term);
|
|
dumpCharacter(term);
|
|
dumpQuantity(term);
|
|
break;
|
|
case ByteTerm::TypePatternCharacterNonGreedy:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("PatternCharacterNonGreedy");
|
|
dumpInverted(term);
|
|
dumpInputPosition(term);
|
|
dumpFrameLocation(term);
|
|
dumpCharacter(term);
|
|
dumpQuantity(term);
|
|
break;
|
|
case ByteTerm::TypePatternCasedCharacterOnce:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("PatternCasedCharacterOnce");
|
|
break;
|
|
case ByteTerm::TypePatternCasedCharacterFixed:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("PatternCasedCharacterFixed");
|
|
break;
|
|
case ByteTerm::TypePatternCasedCharacterGreedy:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("PatternCasedCharacterGreedy");
|
|
break;
|
|
case ByteTerm::TypePatternCasedCharacterNonGreedy:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("PatternCasedCharacterNonGreedy");
|
|
break;
|
|
case ByteTerm::TypeCharacterClass:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("CharacterClass");
|
|
dumpInverted(term);
|
|
dumpInputPosition(term);
|
|
dumpFrameLocation(term);
|
|
dumpCharClass(term);
|
|
dumpQuantity(term);
|
|
break;
|
|
case ByteTerm::TypeBackReference:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("BackReference #", term.atom.subpatternId);
|
|
dumpQuantity(term);
|
|
break;
|
|
case ByteTerm::TypeParenthesesSubpattern:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("ParenthesesSubpattern");
|
|
dumpCaptured(term);
|
|
dumpInverted(term);
|
|
dumpInputPosition(term);
|
|
dumpFrameLocation(term);
|
|
dumpQuantity(term);
|
|
out.print("\n");
|
|
outputNewline = false;
|
|
dumpDisjunction(term.atom.parenthesesDisjunction, nesting);
|
|
break;
|
|
case ByteTerm::TypeParenthesesSubpatternOnceBegin:
|
|
outputTermIndexAndNest(idx, nesting++);
|
|
out.print("ParenthesesSubpatternOnceBegin");
|
|
dumpCaptured(term);
|
|
dumpInverted(term);
|
|
dumpInputPosition(term);
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeParenthesesSubpatternOnceEnd:
|
|
outputTermIndexAndNest(idx, --nesting);
|
|
out.print("ParenthesesSubpatternOnceEnd");
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
|
|
outputTermIndexAndNest(idx, nesting++);
|
|
out.print("ParenthesesSubpatternTerminalBegin");
|
|
dumpInverted(term);
|
|
dumpInputPosition(term);
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
|
|
outputTermIndexAndNest(idx, --nesting);
|
|
out.print("ParenthesesSubpatternTerminalEnd");
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeParentheticalAssertionBegin:
|
|
outputTermIndexAndNest(idx, nesting++);
|
|
out.print("ParentheticalAssertionBegin");
|
|
dumpInverted(term);
|
|
dumpInputPosition(term);
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeParentheticalAssertionEnd:
|
|
outputTermIndexAndNest(idx, --nesting);
|
|
out.print("ParentheticalAssertionEnd");
|
|
dumpFrameLocation(term);
|
|
break;
|
|
case ByteTerm::TypeCheckInput:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("CheckInput ", term.checkInputCount);
|
|
break;
|
|
case ByteTerm::TypeUncheckInput:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("UncheckInput ", term.checkInputCount);
|
|
break;
|
|
case ByteTerm::TypeDotStarEnclosure:
|
|
outputTermIndexAndNest(idx, nesting);
|
|
out.print("DotStarEnclosure");
|
|
break;
|
|
}
|
|
if (outputNewline)
|
|
out.print("\n");
|
|
}
|
|
}
|
|
#endif
|
|
|
|
private:
|
|
inline bool isSafeToRecurse() { return m_stackCheck.isSafeToRecurse(); }
|
|
|
|
YarrPattern& m_pattern;
|
|
std::unique_ptr<ByteDisjunction> m_bodyDisjunction;
|
|
StackCheck m_stackCheck;
|
|
unsigned m_currentAlternativeIndex { 0 };
|
|
Vector<ParenthesesStackEntry> m_parenthesesStack;
|
|
Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
|
|
};
|
|
|
|
std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator, ErrorCode& errorCode, ConcurrentJSLock* lock)
|
|
{
|
|
return ByteCompiler(pattern).compile(allocator, lock, errorCode);
|
|
}
|
|
|
|
unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
|
|
{
|
|
SuperSamplerScope superSamplerScope(false);
|
|
if (input.is8Bit())
|
|
return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
|
|
return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
|
|
}
|
|
|
|
unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
|
|
{
|
|
SuperSamplerScope superSamplerScope(false);
|
|
return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
|
|
}
|
|
|
|
unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
|
|
{
|
|
SuperSamplerScope superSamplerScope(false);
|
|
return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
|
|
}
|
|
|
|
// These should be the same for both UChar & LChar.
|
|
COMPILE_ASSERT(sizeof(BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
|
|
COMPILE_ASSERT(sizeof(BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
|
|
COMPILE_ASSERT(sizeof(BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
|
|
COMPILE_ASSERT(sizeof(BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
|
|
COMPILE_ASSERT(sizeof(BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
|
|
COMPILE_ASSERT(sizeof(BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
|
|
COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) <= (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
|
|
|
|
|
|
} }
|