darling-JavaScriptCore/yarr/YarrPattern.cpp

1461 lines
57 KiB
C++
Raw Permalink Normal View History

2017-08-12 16:48:01 +00:00
/*
* Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
* Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "YarrPattern.h"
2020-08-29 13:27:11 +00:00
#include "Options.h"
2017-08-12 16:48:01 +00:00
#include "Yarr.h"
#include "YarrCanonicalize.h"
#include "YarrParser.h"
2020-08-29 13:27:11 +00:00
#include <wtf/DataLog.h>
#include <wtf/Optional.h>
#include <wtf/StackCheck.h>
2017-08-12 16:48:01 +00:00
#include <wtf/Vector.h>
namespace JSC { namespace Yarr {
#include "RegExpJitTables.h"
class CharacterClassConstructor {
public:
CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
: m_isCaseInsensitive(isCaseInsensitive)
2020-08-29 13:27:11 +00:00
, m_anyCharacter(false)
, m_characterWidths(CharacterClassWidths::Unknown)
2017-08-12 16:48:01 +00:00
, m_canonicalMode(canonicalMode)
{
}
void reset()
{
m_matches.clear();
m_ranges.clear();
m_matchesUnicode.clear();
m_rangesUnicode.clear();
2020-08-29 13:27:11 +00:00
m_anyCharacter = false;
m_characterWidths = CharacterClassWidths::Unknown;
2017-08-12 16:48:01 +00:00
}
void append(const CharacterClass* other)
{
for (size_t i = 0; i < other->m_matches.size(); ++i)
addSorted(m_matches, other->m_matches[i]);
for (size_t i = 0; i < other->m_ranges.size(); ++i)
addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
}
2020-08-29 13:27:11 +00:00
void appendInverted(const CharacterClass* other)
{
auto addSortedInverted = [&](UChar32 min, UChar32 max,
const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges,
Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) {
auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) {
if (lo < hiPlusOne) {
if (lo + 1 == hiPlusOne)
addSorted(destMatches, lo);
else
addSortedRange(destRanges, lo, hiPlusOne - 1);
}
};
UChar32 lo = min;
size_t matchesIndex = 0;
size_t rangesIndex = 0;
bool matchesRemaining = matchesIndex < srcMatches.size();
bool rangesRemaining = rangesIndex < srcRanges.size();
if (!matchesRemaining && !rangesRemaining) {
addSortedMatchOrRange(min, max + 1);
return;
}
while (matchesRemaining || rangesRemaining) {
UChar32 hiPlusOne;
UChar32 nextLo;
if (matchesRemaining
&& (!rangesRemaining || srcMatches[matchesIndex] < srcRanges[rangesIndex].begin)) {
hiPlusOne = srcMatches[matchesIndex];
nextLo = hiPlusOne + 1;
++matchesIndex;
matchesRemaining = matchesIndex < srcMatches.size();
} else {
hiPlusOne = srcRanges[rangesIndex].begin;
nextLo = srcRanges[rangesIndex].end + 1;
++rangesIndex;
rangesRemaining = rangesIndex < srcRanges.size();
}
addSortedMatchOrRange(lo, hiPlusOne);
lo = nextLo;
}
addSortedMatchOrRange(lo, max + 1);
};
addSortedInverted(0, 0x7f, other->m_matches, other->m_ranges, m_matches, m_ranges);
addSortedInverted(0x80, 0x10ffff, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode);
}
2017-08-12 16:48:01 +00:00
void putChar(UChar32 ch)
{
if (!m_isCaseInsensitive) {
addSorted(ch);
return;
}
if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
// Handle ASCII cases.
if (isASCIIAlpha(ch)) {
addSorted(m_matches, toASCIIUpper(ch));
addSorted(m_matches, toASCIILower(ch));
} else
addSorted(m_matches, ch);
return;
}
// Add multiple matches, if necessary.
const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode);
if (info->type == CanonicalizeUnique)
addSorted(ch);
else
putUnicodeIgnoreCase(ch, info);
}
void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
{
ASSERT(m_isCaseInsensitive);
ASSERT(ch >= info->begin && ch <= info->end);
ASSERT(info->type != CanonicalizeUnique);
if (info->type == CanonicalizeSet) {
for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
addSorted(ch);
} else {
addSorted(ch);
addSorted(getCanonicalPair(info, ch));
}
}
void putRange(UChar32 lo, UChar32 hi)
{
if (isASCII(lo)) {
char asciiLo = lo;
char asciiHi = std::min(hi, (UChar32)0x7f);
addSortedRange(m_ranges, lo, asciiHi);
if (m_isCaseInsensitive) {
if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
if ((asciiLo <= 'z') && (asciiHi >= 'a'))
addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
}
}
if (isASCII(hi))
return;
lo = std::max(lo, (UChar32)0x80);
addSortedRange(m_rangesUnicode, lo, hi);
if (!m_isCaseInsensitive)
return;
const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode);
while (true) {
// Handle the range [lo .. end]
UChar32 end = std::min<UChar32>(info->end, hi);
switch (info->type) {
case CanonicalizeUnique:
// Nothing to do - no canonical equivalents.
break;
case CanonicalizeSet: {
UChar ch;
for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
addSorted(m_matchesUnicode, ch);
break;
}
case CanonicalizeRangeLo:
addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
break;
case CanonicalizeRangeHi:
addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
break;
case CanonicalizeAlternatingAligned:
// Use addSortedRange since there is likely an abutting range to combine with.
if (lo & 1)
addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
if (!(end & 1))
addSortedRange(m_rangesUnicode, end + 1, end + 1);
break;
case CanonicalizeAlternatingUnaligned:
// Use addSortedRange since there is likely an abutting range to combine with.
if (!(lo & 1))
addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
if (end & 1)
addSortedRange(m_rangesUnicode, end + 1, end + 1);
break;
}
if (hi == end)
return;
++info;
lo = info->begin;
};
}
std::unique_ptr<CharacterClass> charClass()
{
2020-08-29 13:27:11 +00:00
coalesceTables();
auto characterClass = makeUnique<CharacterClass>();
2017-08-12 16:48:01 +00:00
characterClass->m_matches.swap(m_matches);
characterClass->m_ranges.swap(m_ranges);
characterClass->m_matchesUnicode.swap(m_matchesUnicode);
characterClass->m_rangesUnicode.swap(m_rangesUnicode);
2020-08-29 13:27:11 +00:00
characterClass->m_anyCharacter = anyCharacter();
characterClass->m_characterWidths = characterWidths();
m_anyCharacter = false;
m_characterWidths = CharacterClassWidths::Unknown;
2017-08-12 16:48:01 +00:00
return characterClass;
}
private:
void addSorted(UChar32 ch)
{
addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch);
}
void addSorted(Vector<UChar32>& matches, UChar32 ch)
{
unsigned pos = 0;
unsigned range = matches.size();
2020-08-29 13:27:11 +00:00
m_characterWidths |= (U_IS_BMP(ch) ? CharacterClassWidths::HasBMPChars : CharacterClassWidths::HasNonBMPChars);
2017-08-12 16:48:01 +00:00
// binary chop, find position to insert char.
while (range) {
unsigned index = range >> 1;
int val = matches[pos+index] - ch;
if (!val)
return;
2020-08-29 13:27:11 +00:00
else if (val > 0) {
if (val == 1) {
UChar32 lo = ch;
UChar32 hi = ch + 1;
matches.remove(pos + index);
if (pos + index > 0 && matches[pos + index - 1] == ch - 1) {
lo = ch - 1;
matches.remove(pos + index - 1);
}
addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
return;
}
2017-08-12 16:48:01 +00:00
range = index;
2020-08-29 13:27:11 +00:00
} else {
if (val == -1) {
UChar32 lo = ch - 1;
UChar32 hi = ch;
matches.remove(pos + index);
if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) {
hi = ch + 1;
matches.remove(pos + index + 1);
}
addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
return;
}
2017-08-12 16:48:01 +00:00
pos += (index+1);
range -= (index+1);
}
}
if (pos == matches.size())
matches.append(ch);
else
matches.insert(pos, ch);
}
void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
{
2020-08-29 13:27:11 +00:00
size_t end = ranges.size();
if (U_IS_BMP(lo))
m_characterWidths |= CharacterClassWidths::HasBMPChars;
if (!U_IS_BMP(hi))
m_characterWidths |= CharacterClassWidths::HasNonBMPChars;
2017-08-12 16:48:01 +00:00
// Simple linear scan - I doubt there are that many ranges anyway...
// feel free to fix this with something faster (eg binary chop).
2020-08-29 13:27:11 +00:00
for (size_t i = 0; i < end; ++i) {
2017-08-12 16:48:01 +00:00
// does the new range fall before the current position in the array
if (hi < ranges[i].begin) {
2020-08-29 13:27:11 +00:00
// Concatenate appending ranges.
2017-08-12 16:48:01 +00:00
if (hi == (ranges[i].begin - 1)) {
ranges[i].begin = lo;
return;
}
ranges.insert(i, CharacterRange(lo, hi));
return;
}
// Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
// end of the last range they concatenate, which is just as good.
if (lo <= (ranges[i].end + 1)) {
// found an intersect! we'll replace this entry in the array.
ranges[i].begin = std::min(ranges[i].begin, lo);
ranges[i].end = std::max(ranges[i].end, hi);
2020-08-29 13:27:11 +00:00
mergeRangesFrom(ranges, i);
2017-08-12 16:48:01 +00:00
return;
}
}
// CharacterRange comes after all existing ranges.
ranges.append(CharacterRange(lo, hi));
}
2020-08-29 13:27:11 +00:00
void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
{
unsigned next = index + 1;
// each iteration of the loop we will either remove something from the list, or break out of the loop.
while (next < ranges.size()) {
if (ranges[next].begin <= (ranges[index].end + 1)) {
// the next entry now overlaps / concatenates with this one.
ranges[index].end = std::max(ranges[index].end, ranges[next].end);
ranges.remove(next);
} else
break;
}
}
void coalesceTables()
{
auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
size_t matchesIndex = 0;
size_t rangesIndex = 0;
while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1)
matchesIndex++;
if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) {
ranges[rangesIndex].begin = matches[matchesIndex];
matches.remove(matchesIndex);
}
while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1)
matchesIndex++;
if (matchesIndex < matches.size()) {
if (matches[matchesIndex] == ranges[rangesIndex].end + 1) {
ranges[rangesIndex].end = matches[matchesIndex];
matches.remove(matchesIndex);
mergeRangesFrom(ranges, rangesIndex);
} else
matchesIndex++;
}
}
};
coalesceMatchesAndRanges(m_matches, m_ranges);
coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
if (!m_matches.size() && !m_matchesUnicode.size()
&& m_ranges.size() == 1 && m_rangesUnicode.size() == 1
&& m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f
&& m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff)
m_anyCharacter = true;
}
bool hasNonBMPCharacters()
{
return m_characterWidths & CharacterClassWidths::HasNonBMPChars;
}
CharacterClassWidths characterWidths()
{
return m_characterWidths;
}
bool anyCharacter()
{
return m_anyCharacter;
}
bool m_isCaseInsensitive : 1;
bool m_anyCharacter : 1;
CharacterClassWidths m_characterWidths;
2017-08-12 16:48:01 +00:00
CanonicalMode m_canonicalMode;
Vector<UChar32> m_matches;
Vector<CharacterRange> m_ranges;
Vector<UChar32> m_matchesUnicode;
Vector<CharacterRange> m_rangesUnicode;
};
class YarrPatternConstructor {
public:
YarrPatternConstructor(YarrPattern& pattern)
2017-08-12 16:48:01 +00:00
: m_pattern(pattern)
, m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
{
auto body = makeUnique<PatternDisjunction>();
2017-08-12 16:48:01 +00:00
m_pattern.m_body = body.get();
m_alternative = body->addNewAlternative();
m_pattern.m_disjunctions.append(WTFMove(body));
}
~YarrPatternConstructor()
{
}
2020-08-29 13:27:11 +00:00
void resetForReparsing()
2017-08-12 16:48:01 +00:00
{
2020-08-29 13:27:11 +00:00
m_pattern.resetForReparsing();
2017-08-12 16:48:01 +00:00
m_characterClassConstructor.reset();
auto body = makeUnique<PatternDisjunction>();
2017-08-12 16:48:01 +00:00
m_pattern.m_body = body.get();
m_alternative = body->addNewAlternative();
m_pattern.m_disjunctions.append(WTFMove(body));
}
2020-08-29 13:27:11 +00:00
2017-08-12 16:48:01 +00:00
void assertionBOL()
{
if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
m_alternative->m_startsWithBOL = true;
m_alternative->m_containsBOL = true;
m_pattern.m_containsBOL = true;
}
m_alternative->m_terms.append(PatternTerm::BOL());
}
void assertionEOL()
{
m_alternative->m_terms.append(PatternTerm::EOL());
}
void assertionWordBoundary(bool invert)
{
m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
}
void atomPatternCharacter(UChar32 ch)
{
// We handle case-insensitive checking of unicode characters which do have both
// cases by handling them as if they were defined using a CharacterClass.
if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) {
m_alternative->m_terms.append(PatternTerm(ch));
return;
}
const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
if (info->type == CanonicalizeUnique) {
m_alternative->m_terms.append(PatternTerm(ch));
return;
}
m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
auto newCharacterClass = m_characterClassConstructor.charClass();
m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false));
m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
}
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
{
switch (classID) {
2020-08-29 13:27:11 +00:00
case BuiltInCharacterClassID::DigitClassID:
2017-08-12 16:48:01 +00:00
m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
break;
2020-08-29 13:27:11 +00:00
case BuiltInCharacterClassID::SpaceClassID:
2017-08-12 16:48:01 +00:00
m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
break;
2020-08-29 13:27:11 +00:00
case BuiltInCharacterClassID::WordClassID:
2017-08-12 16:48:01 +00:00
if (m_pattern.unicode() && m_pattern.ignoreCase())
m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
else
m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
break;
2020-08-29 13:27:11 +00:00
case BuiltInCharacterClassID::DotClassID:
ASSERT(!invert);
if (m_pattern.dotAll())
m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
else
m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), true));
break;
default:
m_alternative->m_terms.append(PatternTerm(m_pattern.unicodeCharacterClassFor(classID), invert));
2017-08-12 16:48:01 +00:00
break;
}
}
void atomCharacterClassBegin(bool invert = false)
{
m_invertCharacterClass = invert;
}
void atomCharacterClassAtom(UChar32 ch)
{
m_characterClassConstructor.putChar(ch);
}
void atomCharacterClassRange(UChar32 begin, UChar32 end)
{
m_characterClassConstructor.putRange(begin, end);
}
void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
{
2020-08-29 13:27:11 +00:00
ASSERT(classID != BuiltInCharacterClassID::DotClassID);
2017-08-12 16:48:01 +00:00
switch (classID) {
2020-08-29 13:27:11 +00:00
case BuiltInCharacterClassID::DigitClassID:
2017-08-12 16:48:01 +00:00
m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
break;
2020-08-29 13:27:11 +00:00
case BuiltInCharacterClassID::SpaceClassID:
2017-08-12 16:48:01 +00:00
m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
break;
2020-08-29 13:27:11 +00:00
case BuiltInCharacterClassID::WordClassID:
2017-08-12 16:48:01 +00:00
if (m_pattern.unicode() && m_pattern.ignoreCase())
m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
else
m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
break;
default:
2020-08-29 13:27:11 +00:00
if (!invert)
m_characterClassConstructor.append(m_pattern.unicodeCharacterClassFor(classID));
else
m_characterClassConstructor.appendInverted(m_pattern.unicodeCharacterClassFor(classID));
2017-08-12 16:48:01 +00:00
}
}
void atomCharacterClassEnd()
{
auto newCharacterClass = m_characterClassConstructor.charClass();
2020-08-29 13:27:11 +00:00
if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
return;
}
2017-08-12 16:48:01 +00:00
m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
}
2020-08-29 13:27:11 +00:00
void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> optGroupName = WTF::nullopt)
2017-08-12 16:48:01 +00:00
{
unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
2020-08-29 13:27:11 +00:00
if (capture) {
2017-08-12 16:48:01 +00:00
m_pattern.m_numSubpatterns++;
2020-08-29 13:27:11 +00:00
if (optGroupName) {
while (m_pattern.m_captureGroupNames.size() < subpatternId)
m_pattern.m_captureGroupNames.append(String());
m_pattern.m_captureGroupNames.append(optGroupName.value());
m_pattern.m_namedGroupToParenIndex.add(optGroupName.value(), subpatternId);
}
} else
ASSERT(!optGroupName);
2017-08-12 16:48:01 +00:00
auto parenthesesDisjunction = makeUnique<PatternDisjunction>(m_alternative);
2017-08-12 16:48:01 +00:00
m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
m_alternative = parenthesesDisjunction->addNewAlternative();
m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
}
void atomParentheticalAssertionBegin(bool invert = false)
{
auto parenthesesDisjunction = makeUnique<PatternDisjunction>(m_alternative);
2017-08-12 16:48:01 +00:00
m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert));
m_alternative = parenthesesDisjunction->addNewAlternative();
m_invertParentheticalAssertion = invert;
m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
}
void atomParenthesesEnd()
{
ASSERT(m_alternative->m_parent);
ASSERT(m_alternative->m_parent->m_parent);
PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
m_alternative = m_alternative->m_parent->m_parent;
PatternTerm& lastTerm = m_alternative->lastTerm();
unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
unsigned numBOLAnchoredAlts = 0;
for (unsigned i = 0; i < numParenAlternatives; i++) {
// Bubble up BOL flags
if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
numBOLAnchoredAlts++;
}
if (numBOLAnchoredAlts) {
m_alternative->m_containsBOL = true;
// If all the alternatives in parens start with BOL, then so does this one
if (numBOLAnchoredAlts == numParenAlternatives)
m_alternative->m_startsWithBOL = true;
}
lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
m_invertParentheticalAssertion = false;
}
void atomBackReference(unsigned subpatternId)
{
ASSERT(subpatternId);
m_pattern.m_containsBackreferences = true;
if (subpatternId > m_pattern.m_numSubpatterns) {
m_alternative->m_terms.append(PatternTerm::ForwardReference());
return;
}
PatternAlternative* currentAlternative = m_alternative;
ASSERT(currentAlternative);
// Note to self: if we waited until the AST was baked, we could also remove forwards refs
while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
PatternTerm& term = currentAlternative->lastTerm();
ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
m_alternative->m_terms.append(PatternTerm::ForwardReference());
return;
}
}
m_alternative->m_terms.append(PatternTerm(subpatternId));
}
2020-08-29 13:27:11 +00:00
void atomNamedBackReference(const String& subpatternName)
{
ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end());
atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName));
}
void atomNamedForwardReference(const String&)
2020-08-29 13:27:11 +00:00
{
m_alternative->m_terms.append(PatternTerm::ForwardReference());
}
// deep copy the argument disjunction. If filterStartsWithBOL is true,
2017-08-12 16:48:01 +00:00
// skip alternatives with m_startsWithBOL set true.
PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
{
2020-08-29 13:27:11 +00:00
if (UNLIKELY(!isSafeToRecurse())) {
m_error = ErrorCode::PatternTooLarge;
return nullptr;
2020-08-29 13:27:11 +00:00
}
2017-08-12 16:48:01 +00:00
std::unique_ptr<PatternDisjunction> newDisjunction;
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
if (!newDisjunction) {
newDisjunction = makeUnique<PatternDisjunction>();
2017-08-12 16:48:01 +00:00
newDisjunction->m_parent = disjunction->m_parent;
}
PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size());
for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
}
}
2020-08-29 13:27:11 +00:00
if (hasError(error())) {
newDisjunction = nullptr;
return nullptr;
2020-08-29 13:27:11 +00:00
}
2017-08-12 16:48:01 +00:00
if (!newDisjunction)
return nullptr;
2017-08-12 16:48:01 +00:00
PatternDisjunction* copiedDisjunction = newDisjunction.get();
m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
return copiedDisjunction;
}
PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
{
2020-08-29 13:27:11 +00:00
if (UNLIKELY(!isSafeToRecurse())) {
m_error = ErrorCode::PatternTooLarge;
return PatternTerm(term);
}
2017-08-12 16:48:01 +00:00
if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
return PatternTerm(term);
PatternTerm termCopy = term;
termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
m_pattern.m_hasCopiedParenSubexpressions = true;
return termCopy;
}
void quantifyAtom(unsigned min, unsigned max, bool greedy)
{
ASSERT(min <= max);
ASSERT(m_alternative->m_terms.size());
if (!max) {
m_alternative->removeLastTerm();
return;
}
PatternTerm& term = m_alternative->lastTerm();
ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount);
if (term.type == PatternTerm::TypeParentheticalAssertion) {
// If an assertion is quantified with a minimum count of zero, it can simply be removed.
// This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
// results in any input being consumed, however the continuation passed to the assertion
// (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
// reject all zero length matches (see step 2.1). A match from the continuation of the
// expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
// this is that matches from the assertion are not required, and won't be accepted anyway,
// so no need to ever run it.
if (!min)
m_alternative->removeLastTerm();
// We never need to run an assertion more than once. Subsequent interations will be run
// with the same start index (since assertions are non-capturing) and the same captures
// (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
// same result and captures. If the first match succeeds then the subsequent (min - 1)
// matches will too. Any additional optional matches will fail (on the same basis as the
// minimum zero quantified assertions, above), but this will still result in a match.
return;
}
if (min == max)
term.quantify(min, max, QuantifierFixedCount);
else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
else {
term.quantify(min, min, QuantifierFixedCount);
m_alternative->m_terms.append(copyTerm(term));
// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
m_alternative->lastTerm().parentheses.isCopy = true;
}
}
void disjunction()
{
m_alternative = m_alternative->m_parent->addNewAlternative();
}
2020-08-29 13:27:11 +00:00
ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
2017-08-12 16:48:01 +00:00
{
if (UNLIKELY(!isSafeToRecurse()))
2020-08-29 13:27:11 +00:00
return ErrorCode::TooManyDisjunctions;
2017-08-12 16:48:01 +00:00
2020-08-29 13:27:11 +00:00
ErrorCode error = ErrorCode::NoError;
2017-08-12 16:48:01 +00:00
alternative->m_hasFixedSize = true;
Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
PatternTerm& term = alternative->m_terms[i];
switch (term.type) {
case PatternTerm::TypeAssertionBOL:
case PatternTerm::TypeAssertionEOL:
case PatternTerm::TypeAssertionWordBoundary:
term.inputPosition = currentInputPosition.unsafeGet();
break;
case PatternTerm::TypeBackReference:
term.inputPosition = currentInputPosition.unsafeGet();
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
alternative->m_hasFixedSize = false;
break;
case PatternTerm::TypeForwardReference:
break;
case PatternTerm::TypePatternCharacter:
term.inputPosition = currentInputPosition.unsafeGet();
if (term.quantityType != QuantifierFixedCount) {
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
alternative->m_hasFixedSize = false;
} else if (m_pattern.unicode()) {
2020-08-29 13:27:11 +00:00
Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
tempCount *= U16_LENGTH(term.patternCharacter);
if (tempCount.hasOverflowed())
return ErrorCode::OffsetTooLarge;
currentInputPosition += tempCount;
2017-08-12 16:48:01 +00:00
} else
currentInputPosition += term.quantityMaxCount;
break;
case PatternTerm::TypeCharacterClass:
term.inputPosition = currentInputPosition.unsafeGet();
if (term.quantityType != QuantifierFixedCount) {
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
alternative->m_hasFixedSize = false;
} else if (m_pattern.unicode()) {
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
2020-08-29 13:27:11 +00:00
if (term.characterClass->hasOneCharacterSize() && !term.invert()) {
Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
tempCount *= term.characterClass->hasNonBMPCharacters() ? 2 : 1;
if (tempCount.hasOverflowed())
return ErrorCode::OffsetTooLarge;
currentInputPosition += tempCount;
} else {
currentInputPosition += term.quantityMaxCount;
alternative->m_hasFixedSize = false;
}
2017-08-12 16:48:01 +00:00
} else
currentInputPosition += term.quantityMaxCount;
break;
case PatternTerm::TypeParenthesesSubpattern:
// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
term.frameLocation = currentCallFrameSize;
if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
2020-08-29 13:27:11 +00:00
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
2017-08-12 16:48:01 +00:00
error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
2020-08-29 13:27:11 +00:00
if (hasError(error))
2017-08-12 16:48:01 +00:00
return error;
// If quantity is fixed, then pre-check its minimum size.
if (term.quantityType == QuantifierFixedCount)
currentInputPosition += term.parentheses.disjunction->m_minimumSize;
term.inputPosition = currentInputPosition.unsafeGet();
} else if (term.parentheses.isTerminal) {
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
2020-08-29 13:27:11 +00:00
if (hasError(error))
2017-08-12 16:48:01 +00:00
return error;
term.inputPosition = currentInputPosition.unsafeGet();
} else {
term.inputPosition = currentInputPosition.unsafeGet();
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
2020-08-29 13:27:11 +00:00
error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
if (hasError(error))
return error;
2017-08-12 16:48:01 +00:00
}
// Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
alternative->m_hasFixedSize = false;
break;
case PatternTerm::TypeParentheticalAssertion:
term.inputPosition = currentInputPosition.unsafeGet();
term.frameLocation = currentCallFrameSize;
error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize);
2020-08-29 13:27:11 +00:00
if (hasError(error))
2017-08-12 16:48:01 +00:00
return error;
break;
case PatternTerm::TypeDotStarEnclosure:
2018-01-03 05:16:05 +00:00
ASSERT(!m_pattern.m_saveInitialStartValue);
2017-08-12 16:48:01 +00:00
alternative->m_hasFixedSize = false;
term.inputPosition = initialInputPosition;
2018-01-03 05:16:05 +00:00
m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForDotStarEnclosure;
m_pattern.m_saveInitialStartValue = true;
2017-08-12 16:48:01 +00:00
break;
}
if (currentInputPosition.hasOverflowed())
2020-08-29 13:27:11 +00:00
return ErrorCode::OffsetTooLarge;
2017-08-12 16:48:01 +00:00
}
alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
newCallFrameSize = currentCallFrameSize;
return error;
}
2020-08-29 13:27:11 +00:00
ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
2017-08-12 16:48:01 +00:00
{
if (UNLIKELY(!isSafeToRecurse()))
2020-08-29 13:27:11 +00:00
return ErrorCode::TooManyDisjunctions;
2017-08-12 16:48:01 +00:00
if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
unsigned minimumInputSize = UINT_MAX;
unsigned maximumCallFrameSize = 0;
bool hasFixedSize = true;
2020-08-29 13:27:11 +00:00
ErrorCode error = ErrorCode::NoError;
2017-08-12 16:48:01 +00:00
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
unsigned currentAlternativeCallFrameSize;
error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize);
2020-08-29 13:27:11 +00:00
if (hasError(error))
2017-08-12 16:48:01 +00:00
return error;
minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
hasFixedSize &= alternative->m_hasFixedSize;
if (alternative->m_minimumSize > INT_MAX)
m_pattern.m_containsUnsignedLengthPattern = true;
}
ASSERT(minimumInputSize != UINT_MAX);
ASSERT(maximumCallFrameSize >= initialCallFrameSize);
disjunction->m_hasFixedSize = hasFixedSize;
disjunction->m_minimumSize = minimumInputSize;
disjunction->m_callFrameSize = maximumCallFrameSize;
callFrameSize = maximumCallFrameSize;
return error;
}
2020-08-29 13:27:11 +00:00
ErrorCode setupOffsets()
2017-08-12 16:48:01 +00:00
{
// FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
unsigned ignoredCallFrameSize;
2020-08-29 13:27:11 +00:00
return setupDisjunctionOffsets(m_pattern.m_body, 0, 0, ignoredCallFrameSize);
2017-08-12 16:48:01 +00:00
}
// This optimization identifies sets of parentheses that we will never need to backtrack.
// In these cases we do not need to store state from prior iterations.
// We can presently avoid backtracking for:
// * where the parens are at the end of the regular expression (last term in any of the
// alternatives of the main body disjunction).
// * where the parens are non-capturing, and quantified unbounded greedy (*).
// * where the parens do not contain any capturing subpatterns.
void checkForTerminalParentheses()
{
// This check is much too crude; should be just checking whether the candidate
// node contains nested capturing subpatterns, not the whole expression!
if (m_pattern.m_numSubpatterns)
return;
Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
for (size_t i = 0; i < alternatives.size(); ++i) {
Vector<PatternTerm>& terms = alternatives[i]->m_terms;
if (terms.size()) {
PatternTerm& term = terms.last();
if (term.type == PatternTerm::TypeParenthesesSubpattern
&& term.quantityType == QuantifierGreedy
&& term.quantityMinCount == 0
&& term.quantityMaxCount == quantifyInfinite
&& !term.capture())
term.parentheses.isTerminal = true;
}
}
}
void optimizeBOL()
{
// Look for expressions containing beginning of line (^) anchoring and unroll them.
// e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
// This code relies on the parsing code tagging alternatives with m_containsBOL and
// m_startsWithBOL and rolling those up to containing alternatives.
// At this point, this is only valid for non-multiline expressions.
PatternDisjunction* disjunction = m_pattern.m_body;
if (!m_pattern.m_containsBOL || m_pattern.multiline())
return;
PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
// Set alternatives in disjunction to "onceThrough"
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
disjunction->m_alternatives[alt]->setOnceThrough();
if (loopDisjunction) {
// Move alternatives from loopDisjunction to disjunction
for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release());
loopDisjunction->m_alternatives.clear();
}
}
bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
{
Vector<PatternTerm>& terms = alternative->m_terms;
ASSERT(endIndex <= terms.size());
for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
PatternTerm& term = terms[termIndex];
if (term.m_capture)
return true;
if (term.type == PatternTerm::TypeParenthesesSubpattern) {
PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size()))
return true;
}
}
}
return false;
}
// This optimization identifies alternatives in the form of
// [^].*[?]<expression>.*[$] for expressions that don't have any
// capturing terms. The alternative is changed to <expression>
// followed by processing of the dot stars to find and adjust the
// beginning and the end of the match.
void optimizeDotStarWrappedExpressions()
{
Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
if (alternatives.size() != 1)
return;
2020-08-29 13:27:11 +00:00
CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass();
2017-08-12 16:48:01 +00:00
PatternAlternative* alternative = alternatives[0].get();
Vector<PatternTerm>& terms = alternative->m_terms;
if (terms.size() >= 3) {
bool startsWithBOL = false;
bool endsWithEOL = false;
size_t termIndex, firstExpressionTerm;
termIndex = 0;
if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
startsWithBOL = true;
++termIndex;
}
PatternTerm& firstNonAnchorTerm = terms[termIndex];
2020-08-29 13:27:11 +00:00
if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass
|| firstNonAnchorTerm.characterClass != dotCharacterClass
|| firstNonAnchorTerm.quantityMinCount
|| firstNonAnchorTerm.quantityMaxCount != quantifyInfinite)
2017-08-12 16:48:01 +00:00
return;
firstExpressionTerm = termIndex + 1;
termIndex = terms.size() - 1;
if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
endsWithEOL = true;
--termIndex;
}
PatternTerm& lastNonAnchorTerm = terms[termIndex];
2020-08-29 13:27:11 +00:00
if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass
|| lastNonAnchorTerm.characterClass != dotCharacterClass
|| lastNonAnchorTerm.quantityType != QuantifierGreedy
|| lastNonAnchorTerm.quantityMinCount
|| lastNonAnchorTerm.quantityMaxCount != quantifyInfinite)
2017-08-12 16:48:01 +00:00
return;
size_t endIndex = termIndex;
if (firstExpressionTerm >= endIndex)
return;
if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) {
for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex)
terms.remove(termIndex);
for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
terms.remove(termIndex - 1);
terms.append(PatternTerm(startsWithBOL, endsWithEOL));
m_pattern.m_containsBOL = false;
}
}
}
2020-08-29 13:27:11 +00:00
ErrorCode error() { return m_error; }
2017-08-12 16:48:01 +00:00
private:
inline bool isSafeToRecurse() { return m_stackCheck.isSafeToRecurse(); }
2017-08-12 16:48:01 +00:00
YarrPattern& m_pattern;
PatternAlternative* m_alternative;
CharacterClassConstructor m_characterClassConstructor;
StackCheck m_stackCheck;
2020-08-29 13:27:11 +00:00
ErrorCode m_error { ErrorCode::NoError };
2017-08-12 16:48:01 +00:00
bool m_invertCharacterClass;
2020-08-29 13:27:11 +00:00
bool m_invertParentheticalAssertion { false };
2017-08-12 16:48:01 +00:00
};
ErrorCode YarrPattern::compile(const String& patternString)
2017-08-12 16:48:01 +00:00
{
YarrPatternConstructor constructor(*this);
2017-08-12 16:48:01 +00:00
2020-08-29 13:27:11 +00:00
{
ErrorCode error = parse(constructor, patternString, unicode());
if (hasError(error))
return error;
}
2017-08-12 16:48:01 +00:00
constructor.checkForTerminalParentheses();
constructor.optimizeDotStarWrappedExpressions();
constructor.optimizeBOL();
2020-08-29 13:27:11 +00:00
if (hasError(constructor.error()))
return constructor.error();
{
ErrorCode error = constructor.setupOffsets();
if (hasError(error))
return error;
}
if (UNLIKELY(Options::dumpCompiledRegExpPatterns()))
2020-08-29 13:27:11 +00:00
dumpPattern(patternString);
return ErrorCode::NoError;
2017-08-12 16:48:01 +00:00
}
YarrPattern::YarrPattern(const String& pattern, OptionSet<Flags> flags, ErrorCode& error)
2017-08-12 16:48:01 +00:00
: m_containsBackreferences(false)
, m_containsBOL(false)
, m_containsUnsignedLengthPattern(false)
, m_hasCopiedParenSubexpressions(false)
2018-01-03 05:16:05 +00:00
, m_saveInitialStartValue(false)
2017-08-12 16:48:01 +00:00
, m_flags(flags)
{
2020-08-29 13:27:11 +00:00
ASSERT(m_flags != Flags::DeletedValue);
error = compile(pattern);
2020-08-29 13:27:11 +00:00
}
void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
{
out.print(" ");
for (; nestingDepth; --nestingDepth)
out.print(" ");
}
void dumpUChar32(PrintStream& out, UChar32 c)
{
if (c >= ' '&& c <= 0xff)
out.printf("'%c'", static_cast<char>(c));
else
out.printf("0x%04x", c);
}
void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass)
{
if (characterClass == pattern->anyCharacterClass())
out.print("<any character>");
else if (characterClass == pattern->newlineCharacterClass())
out.print("<newline>");
else if (characterClass == pattern->digitsCharacterClass())
out.print("<digits>");
else if (characterClass == pattern->spacesCharacterClass())
out.print("<whitespace>");
else if (characterClass == pattern->wordcharCharacterClass())
out.print("<word>");
else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass())
out.print("<unicode word ignore case>");
else if (characterClass == pattern->nondigitsCharacterClass())
out.print("<non-digits>");
else if (characterClass == pattern->nonspacesCharacterClass())
out.print("<non-whitespace>");
else if (characterClass == pattern->nonwordcharCharacterClass())
out.print("<non-word>");
else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
out.print("<unicode non-word ignore case>");
else {
bool needMatchesRangesSeperator = false;
auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
size_t matchesSize = matches.size();
if (matchesSize) {
if (needMatchesRangesSeperator)
out.print(",");
needMatchesRangesSeperator = true;
out.print(prefix, ":(");
for (size_t i = 0; i < matchesSize; ++i) {
if (i)
out.print(",");
dumpUChar32(out, matches[i]);
}
out.print(")");
}
};
auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
size_t rangeSize = ranges.size();
if (rangeSize) {
if (needMatchesRangesSeperator)
out.print(",");
needMatchesRangesSeperator = true;
out.print(prefix, " ranges:(");
for (size_t i = 0; i < rangeSize; ++i) {
if (i)
out.print(",");
CharacterRange range = ranges[i];
out.print("(");
dumpUChar32(out, range.begin);
out.print("..");
dumpUChar32(out, range.end);
out.print(")");
}
out.print(")");
}
};
out.print("[");
dumpMatches("ASCII", characterClass->m_matches);
dumpRanges("ASCII", characterClass->m_ranges);
dumpMatches("Unicode", characterClass->m_matchesUnicode);
dumpRanges("Unicode", characterClass->m_rangesUnicode);
out.print("]");
}
}
void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
{
out.print("minimum size: ", m_minimumSize);
if (m_hasFixedSize)
out.print(",fixed size");
if (m_onceThrough)
out.print(",once through");
if (m_startsWithBOL)
out.print(",starts with ^");
if (m_containsBOL)
out.print(",contains ^");
out.print("\n");
for (size_t i = 0; i < m_terms.size(); ++i)
m_terms[i].dump(out, thisPattern, nestingDepth);
}
void PatternTerm::dumpQuantifier(PrintStream& out)
{
if (quantityType == QuantifierFixedCount && quantityMinCount == 1 && quantityMaxCount == 1)
return;
out.print(" {", quantityMinCount.unsafeGet());
if (quantityMinCount != quantityMaxCount) {
if (quantityMaxCount == UINT_MAX)
out.print(",...");
else
out.print(",", quantityMaxCount.unsafeGet());
}
out.print("}");
if (quantityType == QuantifierGreedy)
out.print(" greedy");
else if (quantityType == QuantifierNonGreedy)
out.print(" non-greedy");
}
void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
{
indentForNestingLevel(out, nestingDepth);
if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) {
if (invert())
out.print("not ");
}
switch (type) {
case TypeAssertionBOL:
out.println("BOL");
break;
case TypeAssertionEOL:
out.println("EOL");
break;
case TypeAssertionWordBoundary:
out.println("word boundary");
break;
case TypePatternCharacter:
out.printf("character ");
out.printf("inputPosition %u ", inputPosition);
if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
dumpUChar32(out, toASCIIUpper(patternCharacter));
out.print("/");
dumpUChar32(out, toASCIILower(patternCharacter));
} else
dumpUChar32(out, patternCharacter);
dumpQuantifier(out);
if (quantityType != QuantifierFixedCount)
out.print(",frame location ", frameLocation);
out.println();
break;
case TypeCharacterClass:
out.print("character class ");
out.printf("inputPosition %u ", inputPosition);
dumpCharacterClass(out, thisPattern, characterClass);
dumpQuantifier(out);
if (quantityType != QuantifierFixedCount || thisPattern->unicode())
out.print(",frame location ", frameLocation);
out.println();
break;
case TypeBackReference:
out.print("back reference to subpattern #", backReferenceSubpatternId);
out.println(",frame location ", frameLocation);
break;
case TypeForwardReference:
out.println("forward reference");
break;
case TypeParenthesesSubpattern:
if (m_capture)
out.print("captured ");
else
out.print("non-captured ");
FALLTHROUGH;
case TypeParentheticalAssertion:
if (m_invert)
out.print("inverted ");
if (type == TypeParenthesesSubpattern)
out.print("subpattern");
else if (type == TypeParentheticalAssertion)
out.print("assertion");
if (m_capture)
out.print(" #", parentheses.subpatternId);
dumpQuantifier(out);
if (parentheses.isCopy)
out.print(",copy");
if (parentheses.isTerminal)
out.print(",terminal");
out.println(",frame location ", frameLocation);
if (parentheses.disjunction->m_alternatives.size() > 1) {
indentForNestingLevel(out, nestingDepth + 1);
unsigned alternativeFrameLocation = frameLocation;
if (quantityMaxCount == 1 && !parentheses.isCopy)
alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
else if (parentheses.isTerminal)
alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
else
alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses;
out.println("alternative list,frame location ", alternativeFrameLocation);
}
parentheses.disjunction->dump(out, thisPattern, nestingDepth + 1);
break;
case TypeDotStarEnclosure:
out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation);
break;
}
}
void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = 0)
{
unsigned alternativeCount = m_alternatives.size();
for (unsigned i = 0; i < alternativeCount; ++i) {
indentForNestingLevel(out, nestingDepth);
if (alternativeCount > 1)
out.print("alternative #", i, ": ");
m_alternatives[i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > 1));
}
}
void YarrPattern::dumpPatternString(PrintStream& out, const String& patternString)
{
out.print("/", patternString, "/");
if (global())
out.print("g");
if (ignoreCase())
out.print("i");
if (multiline())
out.print("m");
if (unicode())
out.print("u");
if (sticky())
out.print("y");
}
void YarrPattern::dumpPattern(const String& patternString)
{
dumpPattern(WTF::dataFile(), patternString);
}
void YarrPattern::dumpPattern(PrintStream& out, const String& patternString)
{
out.print("RegExp pattern for ");
dumpPatternString(out, patternString);
if (m_flags) {
bool printSeparator = false;
out.print(" (");
if (global()) {
out.print("global");
printSeparator = true;
}
if (ignoreCase()) {
if (printSeparator)
out.print("|");
out.print("ignore case");
printSeparator = true;
}
if (multiline()) {
if (printSeparator)
out.print("|");
out.print("multiline");
printSeparator = true;
}
if (unicode()) {
if (printSeparator)
out.print("|");
out.print("unicode");
printSeparator = true;
}
if (sticky()) {
if (printSeparator)
out.print("|");
out.print("sticky");
}
out.print(")");
}
out.print(":\n");
if (m_body->m_callFrameSize)
out.print(" callframe size: ", m_body->m_callFrameSize, "\n");
m_body->dump(out, this);
}
std::unique_ptr<CharacterClass> anycharCreate()
{
auto characterClass = makeUnique<CharacterClass>();
2020-08-29 13:27:11 +00:00
characterClass->m_ranges.append(CharacterRange(0x00, 0x7f));
characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
characterClass->m_characterWidths = CharacterClassWidths::HasBothBMPAndNonBMP;
characterClass->m_anyCharacter = true;
return characterClass;
2017-08-12 16:48:01 +00:00
}
2020-08-29 13:27:11 +00:00
} } // namespace JSC::Yarr