gecko-dev/xpcom/ds/Tokenizer.cpp

806 lines
21 KiB
C++

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "Tokenizer.h"
#include "nsUnicharUtils.h"
#include <algorithm>
namespace mozilla {
// Default whitespace set for the 8-bit tokenizer: space and horizontal tab.
// The list is zero-terminated so it can be scanned up to the terminator; the
// TokenizerBase constructor falls back to it when no custom whitespace list
// is supplied.
template <>
char const TokenizerBase<char>::sWhitespaces[] = {' ', '\t', 0};
// Same default whitespace set (space, tab, zero terminator) for the 16-bit
// tokenizer.
template <>
char16_t const TokenizerBase<char16_t>::sWhitespaces[3] = {' ', '\t', 0};
// Tells whether `needle` occurs in the zero-terminated character list `list`.
//
// The scan stops at the first zero element, so the terminator itself is never
// matched: searching for a zero `needle` always yields false.  `list` must be
// non-null and zero-terminated.
template <typename TChar>
static bool contains(TChar const* const list, TChar const needle) {
  TChar const* cursor = list;
  while (*cursor) {
    if (*cursor == needle) {
      return true;
    }
    ++cursor;
  }
  return false;
}
// Creates a tokenizer that reads the whole of `aSource`.
//
// `aWhitespaces` and `aAdditionalWordChars` are forwarded unchanged to the
// TokenizerBase constructor.  The tokenizer only keeps iterators into
// `aSource`; it does not copy the characters.
template <typename TChar>
TTokenizer<TChar>::TTokenizer(const typename base::TAString& aSource,
                              const TChar* aWhitespaces,
                              const TChar* aAdditionalWordChars)
    : TokenizerBase<TChar>(aWhitespaces, aAdditionalWordChars) {
  // All input is available up front, so Parse() may report EOF as soon as
  // the cursor reaches the end.
  base::mInputFinished = true;

  // Delimit the range to tokenize.
  aSource.EndReading(base::mEnd);
  aSource.BeginReading(base::mCursor);

  // Nothing has been consumed or recorded yet: both marks start at the
  // beginning of the input.
  mRollback = base::mCursor;
  mRecord = base::mCursor;
}
// Convenience constructor for a raw character pointer: wraps `aSource` in a
// dependent string and delegates to the string-based constructor, passing
// `aWhitespaces` and `aAdditionalWordChars` through unchanged.
// NOTE(review): presumably `aSource` must be zero-terminated and must outlive
// the tokenizer, since only a dependent view is created — confirm against the
// string class documentation.
template <typename TChar>
TTokenizer<TChar>::TTokenizer(const TChar* aSource, const TChar* aWhitespaces,
const TChar* aAdditionalWordChars)
: TTokenizer(typename base::TDependentString(aSource), aWhitespaces,
aAdditionalWordChars) {}
// Reads the next token, whatever its type, and moves the cursor past it.
//
// Returns false (and raises the failure flag) when the tokenizer has already
// read past the end of input.  Otherwise `aToken` receives the parsed token
// together with the input fragment it spans, the rollback mark is set to
// where the token started, and true is returned.
template <typename TChar>
bool TTokenizer<TChar>::Next(typename base::Token& aToken) {
  const bool exhausted = !base::HasInput();
  if (exhausted) {
    base::mHasFailed = true;
    return false;
  }

  const auto tokenStart = base::mCursor;
  const auto tokenEnd = base::Parse(aToken);

  mRollback = tokenStart;
  base::mCursor = tokenEnd;
  base::AssignFragment(aToken, tokenStart, tokenEnd);

  base::mHasFailed = false;
  // Having consumed EOF means there is nothing more to read.
  base::mPastEof = (aToken.Type() == base::TOKEN_EOF);
  return true;
}
// Consumes the next token only if it is of type `aTokenType`.
//
// On a match the cursor moves past the token, the rollback mark is set to the
// token start, `aResult` carries the token and its input fragment, and true
// is returned.  On a mismatch, or when input is exhausted, the cursor stays
// put, the failure flag is raised and false is returned.  Note that `aResult`
// is handed to Parse() before the type is compared, so it may have been
// overwritten even when the check fails.
template <typename TChar>
bool TTokenizer<TChar>::Check(const typename base::TokenType aTokenType,
                              typename base::Token& aResult) {
  if (base::HasInput()) {
    const auto afterToken = base::Parse(aResult);
    if (aResult.Type() == aTokenType) {
      mRollback = base::mCursor;
      base::mCursor = afterToken;
      base::AssignFragment(aResult, mRollback, afterToken);
      base::mHasFailed = false;
      base::mPastEof = (aResult.Type() == base::TOKEN_EOF);
      return true;
    }
  }

  base::mHasFailed = true;
  return false;
}
// Consumes the next token only if it equals `aToken` (per Token::Equals).
//
// On a match the cursor moves past the token, the rollback mark is set to the
// token start and true is returned.  Otherwise, including when input is
// exhausted, the cursor is left untouched, the failure flag is raised and
// false is returned.  Debug builds first validate `aToken`.
template <typename TChar>
bool TTokenizer<TChar>::Check(const typename base::Token& aToken) {
#ifdef DEBUG
  base::Validate(aToken);
#endif

  if (base::HasInput()) {
    typename base::Token candidate;
    const auto afterToken = base::Parse(candidate);
    if (aToken.Equals(candidate)) {
      mRollback = base::mCursor;
      base::mCursor = afterToken;
      base::mHasFailed = false;
      base::mPastEof = (candidate.Type() == base::TOKEN_EOF);
      return true;
    }
  }

  base::mHasFailed = true;
  return false;
}
// Skips a run of whitespace tokens and, when `aIncludeNewLines` is
// INCLUDE_NEW_LINE, end-of-line tokens as well.
//
// If nothing could be skipped the function returns right away, leaving
// whatever state the failed check(s) produced.  Otherwise the failure flag is
// cleared and the rollback mark is restored to the value it had right after
// the first skipped token was consumed, so a later Rollback() undoes the
// whole run rather than only its last token.
template <typename TChar>
void TTokenizer<TChar>::SkipWhites(WhiteSkipping aIncludeNewLines) {
  // Try to consume the first token of the run.
  const bool skippedFirst =
      CheckWhite() ||
      (aIncludeNewLines != DONT_INCLUDE_NEW_LINE && CheckEOL());
  if (!skippedFirst) {
    return;
  }

  // Each successful check below overwrites mRollback; remember the mark set
  // by the first one.
  const typename base::TAString::const_char_iterator runStart = mRollback;

  for (;;) {
    if (CheckWhite()) {
      continue;
    }
    if (aIncludeNewLines == INCLUDE_NEW_LINE && CheckEOL()) {
      continue;
    }
    break;
  }

  // The check that ended the loop failed by design; that is not an error.
  base::mHasFailed = false;
  mRollback = runStart;
}
// Advances the cursor up to, but not past, the first token equal to `aToken`,
// or up to EOF when no such token is found.
//
// The stopping token is un-read via Rollback(), so the next read sees it
// again.  Afterwards the rollback mark points at the position the cursor had
// on entry, so a following Rollback() undoes the whole skip.
template <typename TChar>
void TTokenizer<TChar>::SkipUntil(typename base::Token const& aToken) {
  const typename base::TAString::const_char_iterator startedAt = base::mCursor;
  const typename base::Token endOfFile = base::Token::EndOfFile();

  typename base::Token current;
  for (;;) {
    if (!Next(current)) {
      break;
    }
    const bool stopHere =
        aToken.Equals(current) || endOfFile.Equals(current);
    if (stopHere) {
      // Leave the stopping token unread.
      Rollback();
      break;
    }
  }

  mRollback = startedAt;
}
// Consumes exactly one character if `aClassifier` accepts it.
//
// A null classifier is a caller error: it asserts in debug builds and returns
// false without touching the tokenizer state.  When input is exhausted, the
// cursor sits at the end, or the classifier rejects the character, the
// failure flag is raised and false is returned.  On success the rollback mark
// points at the consumed character and the cursor moves one position forward.
template <typename TChar>
bool TTokenizer<TChar>::CheckChar(bool (*aClassifier)(const TChar aChar)) {
  if (!aClassifier) {
    MOZ_ASSERT(false);
    return false;
  }

  const bool haveChar = base::HasInput() && base::mCursor != base::mEnd;
  if (haveChar && aClassifier(*base::mCursor)) {
    mRollback = base::mCursor;
    ++base::mCursor;
    base::mHasFailed = false;
    return true;
  }

  base::mHasFailed = true;
  return false;
}
// Checks whether the input at the cursor starts with the token sequence that
// `aPhrase` tokenizes to, and consumes that sequence if so.
//
// `aPhrase` is tokenized by a temporary tokenizer constructed with default
// settings.  Tokens from the input and from the phrase are compared pairwise
// by type and by the input fragment they span.
//
// Returns true when the whole phrase matched; the cursor then sits right
// after the phrase and the rollback mark at the position where it started.
// Returns false and raises the failure flag otherwise, restoring the cursor
// and the rollback mark to their values on entry.  The failure flag is also
// raised when the tokenizer has already read past the end of input, in line
// with Next(), Check() and CheckChar().
template <typename TChar>
bool TTokenizer<TChar>::CheckPhrase(const typename base::TAString& aPhrase) {
  if (!base::HasInput()) {
    // Without this, HasFailed() would keep reporting the outcome of an
    // earlier call after this check failed.
    base::mHasFailed = true;
    return false;
  }

  typedef typename base::TAString::const_char_iterator Cursor;

  TTokenizer<TChar> pattern(aPhrase);
  MOZ_ASSERT(!pattern.CheckEOF(),
             "This will return true but won't shift the Tokenizer's cursor");

  // State to restore if the phrase turns out not to match.
  const Cursor cursor = base::mCursor;
  const Cursor rollback = mRollback;

  // Walk both token streams in lockstep until the phrase is exhausted.
  while (!pattern.CheckEOF()) {
    typename base::Token t1, t2;
    // A failed Next() leaves its token with a type that cannot match the
    // phrase token, so the comparison below handles it.
    Unused << Next(t1);
    Unused << pattern.Next(t2);

    const bool same =
        t1.Type() == t2.Type() && t1.Fragment().Equals(t2.Fragment());
    if (!same) {
      // Mismatch: undo everything consumed so far.
      base::mHasFailed = true;
      base::mPastEof = false;
      base::mCursor = cursor;
      mRollback = rollback;
      return false;
    }
  }

  // Whole phrase matched; a Rollback() now rewinds to where it started.
  base::mHasFailed = false;
  mRollback = cursor;
  return true;
}
// Reads a single character token and stores its character in `*aValue`.
//
// `aValue` must not be null (release-asserted).  Returns false, leaving
// `*aValue` untouched, when the next token is not a character token.
template <typename TChar>
bool TTokenizer<TChar>::ReadChar(TChar* aValue) {
  MOZ_RELEASE_ASSERT(aValue);

  typename base::Token charToken;
  const bool matched = Check(base::TOKEN_CHAR, charToken);
  if (matched) {
    *aValue = charToken.AsChar();
  }
  return matched;
}
// Reads one raw character accepted by `aClassifier` into `*aValue`.
//
// `aValue` must not be null (release-asserted).  Returns false, leaving
// `*aValue` untouched, when CheckChar() rejects the character at the cursor.
template <typename TChar>
bool TTokenizer<TChar>::ReadChar(bool (*aClassifier)(const TChar aChar),
                                 TChar* aValue) {
  MOZ_RELEASE_ASSERT(aValue);

  const bool accepted = CheckChar(aClassifier);
  if (accepted) {
    // CheckChar() left the rollback mark on the character it consumed.
    *aValue = *mRollback;
  }
  return accepted;
}
// Reads a word token and copies its text into `aValue`.
//
// Returns false, leaving `aValue` untouched, when the next token is not a
// word.
template <typename TChar>
bool TTokenizer<TChar>::ReadWord(typename base::TAString& aValue) {
  typename base::Token wordToken;
  const bool isWord = Check(base::TOKEN_WORD, wordToken);
  if (isWord) {
    aValue.Assign(wordToken.AsString());
  }
  return isWord;
}
// Reads a word token and rebinds `aValue` to the word's characters without
// copying them.
//
// Returns false, leaving `aValue` untouched, when the next token is not a
// word.
template <typename TChar>
bool TTokenizer<TChar>::ReadWord(typename base::TDependentSubstring& aValue) {
  typename base::Token wordToken;
  const bool isWord = Check(base::TOKEN_WORD, wordToken);
  if (isWord) {
    const typename base::TDependentSubstring word = wordToken.AsString();
    aValue.Rebind(word.BeginReading(), word.Length());
  }
  return isWord;
}
// Copying flavour of ReadUntil(): collects the input up to `aToken` through
// the dependent-substring overload and then copies it into `aResult`.
//
// `aResult` is assigned whether or not `aToken` was found; the return value
// tells which.
template <typename TChar>
bool TTokenizer<TChar>::ReadUntil(typename base::Token const& aToken,
                                  typename base::TAString& aResult,
                                  ClaimInclusion aInclude) {
  typename base::TDependentSubstring view;
  const bool found = ReadUntil(aToken, view, aInclude);
  aResult.Assign(view);
  return found;
}
// Reads tokens until one equal to `aToken` has been consumed or EOF is
// reached, and binds `aResult` to the input collected on the way.
//
// The EOF token is not consumed (it is rolled back).  `aInclude` is passed to
// Claim() and decides whether the last consumed token is part of `aResult`.
// On return the caller's record mark is restored and the rollback mark points
// at the position the cursor had on entry.  Returns true iff `aToken` was
// found.
template <typename TChar>
bool TTokenizer<TChar>::ReadUntil(typename base::Token const& aToken,
                                  typename base::TDependentSubstring& aResult,
                                  ClaimInclusion aInclude) {
  // Borrow the record mark for the duration of this call.
  const typename base::TAString::const_char_iterator savedRecord = mRecord;
  Record();

  mRollback = base::mCursor;
  const typename base::TAString::const_char_iterator savedRollback = mRollback;

  bool found = false;
  typename base::Token current;
  for (;;) {
    if (!Next(current)) {
      break;
    }
    if (aToken.Equals(current)) {
      found = true;
      break;
    }
    if (current.Equals(base::Token::EndOfFile())) {
      // We don't want to eat it.
      Rollback();
      break;
    }
  }

  Claim(aResult, aInclude);

  mRollback = savedRollback;
  mRecord = savedRecord;
  return found;
}
// Moves the cursor back to the rollback mark, i.e. undoes the most recent
// successful read, and clears both the past-EOF and the failure flags.
//
// Debug builds assert that there is something to undo: the cursor is ahead of
// the mark or EOF has been consumed.
template <typename TChar>
void TTokenizer<TChar>::Rollback() {
  MOZ_ASSERT(base::mCursor > mRollback || base::mPastEof, "TODO!!!");

  base::mCursor = mRollback;
  base::mHasFailed = false;
  base::mPastEof = false;
}
// Sets the record mark, i.e. the start of the range a later Claim() returns.
//
// With INCLUDE_LAST the range starts at the rollback mark (the beginning of
// the last consumed token); otherwise it starts at the current cursor.
template <typename TChar>
void TTokenizer<TChar>::Record(ClaimInclusion aInclude) {
  if (aInclude == INCLUDE_LAST) {
    mRecord = mRollback;
  } else {
    mRecord = base::mCursor;
  }
}
// Copies the input between the record mark and the current position into
// `aResult`.
//
// With EXCLUDE_LAST the range ends at the rollback mark (before the last
// consumed token); otherwise it ends at the cursor.
template <typename TChar>
void TTokenizer<TChar>::Claim(typename base::TAString& aResult,
                              ClaimInclusion aInclusion) {
  typename base::TAString::const_char_iterator close = base::mCursor;
  if (aInclusion == EXCLUDE_LAST) {
    close = mRollback;
  }
  aResult.Assign(Substring(mRecord, close));
}
// Binds `aResult` to the input between the record mark and the current
// position, without copying.
//
// With EXCLUDE_LAST the range ends at the rollback mark (before the last
// consumed token); otherwise it ends at the cursor.  A range end that lies
// before the record mark trips a release assertion.
template <typename TChar>
void TTokenizer<TChar>::Claim(typename base::TDependentSubstring& aResult,
                              ClaimInclusion aInclusion) {
  typename base::TAString::const_char_iterator close = base::mCursor;
  if (aInclusion == EXCLUDE_LAST) {
    close = mRollback;
  }
  MOZ_RELEASE_ASSERT(close >= mRecord, "Overflow!");
  aResult.Rebind(mRecord, close - mRecord);
}
// TokenizerBase
// Initializes the shared tokenizer state.
//
// aWhitespaces: zero-terminated list of characters treated as whitespace;
//   when null the built-in sWhitespaces list is used.
// aAdditionalWordChars: optional zero-terminated list of extra characters
//   accepted inside words; may be null.
//
// The tokenizer starts with no input range (null cursor and end), in FULL
// mode, with the input marked as finished and both status flags cleared.
template <typename TChar>
TokenizerBase<TChar>::TokenizerBase(const TChar* aWhitespaces,
const TChar* aAdditionalWordChars)
: mPastEof(false),
mHasFailed(false),
mInputFinished(true),
mMode(Mode::FULL),
// Threshold Parse() compares the available data against before it delivers
// a partial RAW token in CUSTOM_ONLY mode while more input is pending.
mMinRawDelivery(1024),
mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces),
mAdditionalWordChars(aAdditionalWordChars),
mCursor(nullptr),
mEnd(nullptr),
// AddCustomToken() pre-increments this, so custom token types are always
// greater than TOKEN_CUSTOM0.
mNextCustomTokenID(TOKEN_CUSTOM0) {}
// Registers a new custom token matching the literal text `aValue`.
//
// `aCaseInsensitivity` selects how the text is compared against the input and
// `aEnabled` whether the token is recognized right away.  `aValue` must not
// be empty (debug-asserted).  Returns a copy of the registered token, which
// carries the freshly allocated custom type ID identifying the registration.
template <typename TChar>
auto TokenizerBase<TChar>::AddCustomToken(const TAString& aValue,
                                          ECaseSensitivity aCaseInsensitivity,
                                          bool aEnabled) -> Token {
  MOZ_ASSERT(!aValue.IsEmpty());

  UniquePtr<Token>& slot = *mCustomTokens.AppendElement();
  slot = MakeUnique<Token>();

  Token& custom = *slot;
  custom.mCustom.Assign(aValue);
  custom.mCustomEnabled = aEnabled;
  custom.mCustomCaseInsensitivity = aCaseInsensitivity;
  custom.mType = static_cast<TokenType>(++mNextCustomTokenID);
  return custom;
}
// Unregisters the custom token whose type ID matches `aToken`.
//
// On success the registered instance is removed from the list and `aToken`
// is marked TOKEN_UNKNOWN, so a repeated removal is a silent no-op.  Asking
// to remove a token that is not registered asserts in debug builds.
template <typename TChar>
void TokenizerBase<TChar>::RemoveCustomToken(Token& aToken) {
  const bool alreadyRemoved = aToken.mType == TOKEN_UNKNOWN;
  if (alreadyRemoved) {
    return;
  }

  for (UniquePtr<Token> const& registered : mCustomTokens) {
    if (registered->mType != aToken.mType) {
      continue;
    }
    // The list is modified here, so iteration must stop right after.
    mCustomTokens.RemoveElement(registered);
    aToken.mType = TOKEN_UNKNOWN;
    return;
  }

  MOZ_ASSERT(false, "Token to remove not found");
}
// Switches recognition of a registered custom token on or off.
//
// The registration is looked up by the type ID of `aToken`; only the enabled
// flag of the registered instance changes, `aToken` itself is not modified.
// A token already marked TOKEN_UNKNOWN (removed) is silently ignored; a token
// that is not registered asserts in debug builds.
template <typename TChar>
void TokenizerBase<TChar>::EnableCustomToken(Token const& aToken,
                                             bool aEnabled) {
  const bool alreadyRemoved = aToken.mType == TOKEN_UNKNOWN;
  if (alreadyRemoved) {
    return;
  }

  for (UniquePtr<Token> const& registered : mCustomTokens) {
    if (registered->Type() != aToken.Type()) {
      continue;
    }
    // Only flips the flag; the registration stays in the list.
    registered->mCustomEnabled = aEnabled;
    return;
  }

  MOZ_ASSERT(false, "Token to change not found");
}
// Selects the tokenizing mode used by subsequent Parse() calls.  In
// Mode::CUSTOM_ONLY Parse() recognizes only custom tokens and returns the
// input between them as RAW tokens.
template <typename TChar>
void TokenizerBase<TChar>::SetTokenizingMode(Mode aMode) {
mMode = aMode;
}
// Returns the failure flag: true after a read/check call that failed, false
// after one that succeeded.
template <typename TChar>
bool TokenizerBase<TChar>::HasFailed() const {
return mHasFailed;
}
// Returns true until the EOF token has been consumed.  Note that this stays
// true while the cursor sits at the end of input but EOF itself has not been
// read yet.
template <typename TChar>
bool TokenizerBase<TChar>::HasInput() const {
return !mPastEof;
}
// Recognizes one token starting at mCursor without modifying the tokenizer
// (the method is const; callers move the cursor themselves).
//
// aToken: receives the recognized token.  It is left unassigned on the paths
//   that return mCursor because more input is needed.
// Returns: the position right after the recognized token, or mCursor itself
//   when the available data does not suffice to decide and more input may
//   still arrive (mInputFinished is false).
//
// Recognition order: EOF, then enabled custom tokens (in registration order),
// then - in CUSTOM_ONLY mode - a RAW run up to the next custom token, and
// otherwise the built-in word / integer / whitespace / newline / char tokens.
template <typename TChar>
auto TokenizerBase<TChar>::Parse(Token& aToken) const ->
typename TAString::const_char_iterator {
if (mCursor == mEnd) {
if (!mInputFinished) {
// More data may come; report "no progress" and leave aToken alone.
return mCursor;
}
aToken = Token::EndOfFile();
return mEnd;
}
MOZ_RELEASE_ASSERT(mEnd >= mCursor, "Overflow!");
typename TAString::size_type available = mEnd - mCursor;
// Custom tokens take precedence over the built-in ones.  IsCustom() also
// accumulates in longestCustom the length of the longest enabled custom
// token among those examined.
uint32_t longestCustom = 0;
for (UniquePtr<Token> const& custom : mCustomTokens) {
if (IsCustom(mCursor, *custom, &longestCustom)) {
aToken = *custom;
return mCursor + custom->mCustom.Length();
}
}
if (!mInputFinished && available < longestCustom) {
// Not enough data to deterministically decide.
return mCursor;
}
typename TAString::const_char_iterator next = mCursor;
if (mMode == Mode::CUSTOM_ONLY) {
// We have to do a brute-force search for all of the enabled custom
// tokens.
while (next < mEnd) {
// The position at mCursor was already tested above, so start one further.
++next;
for (UniquePtr<Token> const& custom : mCustomTokens) {
if (IsCustom(next, *custom)) {
// Everything before the custom token found at `next` is delivered as RAW.
aToken = Token::Raw();
return next;
}
}
}
if (mInputFinished) {
// End of the data reached.
aToken = Token::Raw();
return next;
}
if (longestCustom < available && available > mMinRawDelivery) {
// We can return some data w/o waiting for either a custom token
// or call to FinishData() when we leave the tail where all the
// custom tokens potentially fit, so we can't lose only partially
// delivered tokens. This preserves reasonable granularity.
aToken = Token::Raw();
return mEnd - longestCustom + 1;
}
// Not enough data to deterministically decide.
return mCursor;
}
// FULL mode: the first character alone decides which kind of token follows.
enum State {
PARSE_INTEGER,
PARSE_WORD,
PARSE_CRLF,
PARSE_LF,
PARSE_WS,
PARSE_CHAR,
} state;
if (IsWordFirst(*next)) {
state = PARSE_WORD;
} else if (IsNumber(*next)) {
state = PARSE_INTEGER;
} else if (contains(mWhitespaces, *next)) { // not UTF-8 friendly?
state = PARSE_WS;
} else if (*next == '\r') {
state = PARSE_CRLF;
} else if (*next == '\n') {
state = PARSE_LF;
} else {
state = PARSE_CHAR;
}
// Checked arithmetic: an overflowing integer literal yields an ERROR token
// instead of a wrapped value.
mozilla::CheckedUint64 resultingNumber = 0;
while (next < mEnd) {
switch (state) {
case PARSE_INTEGER:
// Keep it simple for now
resultingNumber *= 10;
resultingNumber += static_cast<uint64_t>(*next - '0');
++next;
if (IsPending(next)) {
// Ran out of data with more expected; the loop exits and mCursor is
// returned below.
break;
}
if (IsEnd(next) || !IsNumber(*next)) {
if (!resultingNumber.isValid()) {
aToken = Token::Error();
} else {
aToken = Token::Number(resultingNumber.value());
}
return next;
}
break;
case PARSE_WORD:
++next;
if (IsPending(next)) {
break;
}
if (IsEnd(next) || !IsWord(*next)) {
// The word token refers to the input characters; nothing is copied.
aToken = Token::Word(Substring(mCursor, next));
return next;
}
break;
case PARSE_CRLF:
++next;
if (IsPending(next)) {
// Can't tell yet whether an LF follows the CR.
break;
}
if (!IsEnd(next) && *next == '\n') { // LF is optional
++next;
}
aToken = Token::NewLine();
return next;
case PARSE_LF:
++next;
aToken = Token::NewLine();
return next;
case PARSE_WS:
// Each whitespace character is a separate token.
++next;
aToken = Token::Whitespace();
return next;
case PARSE_CHAR:
++next;
aToken = Token::Char(*mCursor);
return next;
} // switch (state)
} // while (next < end)
// Only reachable via the IsPending() breaks above, i.e. when the input is
// not finished yet: request more data by reporting no progress.
MOZ_ASSERT(!mInputFinished);
return mCursor;
}
// Returns true when `caret` points at the end of the currently available
// input.
template <typename TChar>
bool TokenizerBase<TChar>::IsEnd(
const typename TAString::const_char_iterator& caret) const {
return caret == mEnd;
}
// Returns true when `caret` has reached the end of the available input while
// the input is not marked as finished, i.e. the token under construction may
// continue in data that has not arrived yet.
template <typename TChar>
bool TokenizerBase<TChar>::IsPending(
const typename TAString::const_char_iterator& caret) const {
return IsEnd(caret) && !mInputFinished;
}
// Tells whether `aInput` may start a word.
//
// A character qualifies when it is a cased letter (its lower-case and
// upper-case forms differ), an underscore, or one of the additional word
// characters configured at construction.
template <typename TChar>
bool TokenizerBase<TChar>::IsWordFirst(const TChar aInput) const {
  // TODO: make this fully work with unicode
  const uint32_t codeUnit = static_cast<uint32_t>(aInput);
  if (ToLowerCase(codeUnit) != ToUpperCase(codeUnit)) {
    return true;
  }
  if ('_' == aInput) {
    return true;
  }
  return mAdditionalWordChars && contains(mAdditionalWordChars, aInput);
}
// Tells whether `aInput` may appear inside a word after its first character:
// anything that may start a word, plus the decimal digits.
template <typename TChar>
bool TokenizerBase<TChar>::IsWord(const TChar aInput) const {
return IsWordFirst(aInput) || IsNumber(aInput);
}
// Tells whether `aInput` is an ASCII decimal digit ('0' through '9').
template <typename TChar>
bool TokenizerBase<TChar>::IsNumber(const TChar aInput) const {
// TODO: are there unicode numbers?
return aInput >= '0' && aInput <= '9';
}
// Tells whether the input at `caret` starts with the text of the custom token
// `aCustomToken`.
//
// Disabled tokens never match.  When `aLongest` is given it is raised to the
// length of this token's text if that is larger; this happens for every
// enabled token, matching or not.  The comparison honours the token's case
// sensitivity setting.
template <typename TChar>
bool TokenizerBase<TChar>::IsCustom(
    const typename TAString::const_char_iterator& caret,
    const Token& aCustomToken, uint32_t* aLongest) const {
  MOZ_ASSERT(aCustomToken.mType > TOKEN_CUSTOM0);

  if (!aCustomToken.mCustomEnabled) {
    return false;
  }

  const auto& pattern = aCustomToken.mCustom;

  if (aLongest) {
    *aLongest = std::max<uint32_t>(*aLongest, pattern.Length());
  }

  // This is not very likely to happen according to how we call this method
  // and since it's on a hot path, it's just a diagnostic assert,
  // not a release assert.
  MOZ_DIAGNOSTIC_ASSERT(mEnd >= caret, "Overflow?");

  const uint32_t remaining = mEnd - caret;
  if (remaining < pattern.Length()) {
    // The rest of the input is too short to hold the token.
    return false;
  }

  TDependentSubstring candidate(caret, pattern.Length());

  if (aCustomToken.mCustomCaseInsensitivity != CASE_INSENSITIVE) {
    return candidate.Equals(pattern);
  }

  // The case-insensitive comparator depends on the character width.
  if constexpr (std::is_same_v<TChar, char>) {
    return candidate.Equals(pattern, nsCaseInsensitiveUTF8StringComparator);
  } else {
    return candidate.Equals(pattern, nsCaseInsensitiveStringComparator);
  }
}
// Forwards to Token::AssignFragment() to record on `aToken` the input range
// [begin, end) it was parsed from.
// NOTE(review): presumably this wrapper exists so derived tokenizers can
// reach Token's non-public AssignFragment() - confirm against the header.
template <typename TChar>
void TokenizerBase<TChar>::AssignFragment(
Token& aToken, typename TAString::const_char_iterator begin,
typename TAString::const_char_iterator end) {
aToken.AssignFragment(begin, end);
}
#ifdef DEBUG
// Debug-only sanity check of a caller-supplied token.
//
// For a word token, asserts that its first character may start a word and
// that every following character may appear inside one.  Tokens of other
// types and empty words pass unchecked.
template <typename TChar>
void TokenizerBase<TChar>::Validate(Token const& aToken) {
  if (aToken.Type() != TOKEN_WORD) {
    return;
  }

  const TDependentSubstring word = aToken.AsString();
  const typename TAString::const_char_iterator first = word.BeginReading();
  const typename TAString::const_char_iterator last = word.EndReading();

  for (typename TAString::const_char_iterator it = first; it < last; ++it) {
    if (it == first) {
      MOZ_ASSERT(IsWordFirst(*it));
    } else {
      MOZ_ASSERT(IsWord(*it));
    }
  }
}
#endif
// TokenizerBase::Token
// Creates an empty token of type TOKEN_UNKNOWN: zero character and integer
// values, case-sensitive matching, custom matching disabled.  The static
// factory functions start from this state and fill in what their token type
// needs.
template <typename TChar>
TokenizerBase<TChar>::Token::Token()
: mType(TOKEN_UNKNOWN),
mChar(0),
mInteger(0),
mCustomCaseInsensitivity(CASE_SENSITIVE),
mCustomEnabled(false) {}
// Copy constructor.  Copies the type, the custom token text and settings, and
// the character and integer values.  mFragment is not copied here.
template <typename TChar>
TokenizerBase<TChar>::Token::Token(const Token& aOther)
: mType(aOther.mType),
mCustom(aOther.mCustom),
mChar(aOther.mChar),
mInteger(aOther.mInteger),
mCustomCaseInsensitivity(aOther.mCustomCaseInsensitivity),
mCustomEnabled(aOther.mCustomEnabled) {
// Only word and custom tokens get their word view rebound; the copy then
// refers to the same characters as aOther's view, nothing is duplicated.
if (mType == TOKEN_WORD || mType > TOKEN_CUSTOM0) {
mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
}
}
// Copy assignment.  Takes over the type, the scalar values, the custom token
// text and settings of `aOther`, and rebinds the word view so that it refers
// to the same characters as `aOther`'s.  The fragment of this token is left
// as it was.
template <typename TChar>
auto TokenizerBase<TChar>::Token::operator=(const Token& aOther) -> Token& {
  // Kind and scalar payload.
  mType = aOther.mType;
  mChar = aOther.mChar;
  mInteger = aOther.mInteger;

  // Custom token definition.
  mCustom = aOther.mCustom;
  mCustomEnabled = aOther.mCustomEnabled;
  mCustomCaseInsensitivity = aOther.mCustomCaseInsensitivity;

  // Word payload: a view, not a copy.
  mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());

  return *this;
}
// Points the token's fragment at the input range [begin, end) without
// copying.  `end` must not precede `begin` (release-asserted).
template <typename TChar>
void TokenizerBase<TChar>::Token::AssignFragment(
    typename TAString::const_char_iterator begin,
    typename TAString::const_char_iterator end) {
  MOZ_RELEASE_ASSERT(end >= begin, "Overflow!");

  const auto fragmentLength = end - begin;
  mFragment.Rebind(begin, fragmentLength);
}
// static
// Makes a TOKEN_RAW token, which Parse() returns in CUSTOM_ONLY mode for the
// input between custom tokens.  The token itself carries no value.
template <typename TChar>
auto TokenizerBase<TChar>::Token::Raw() -> Token {
  Token rawToken;
  rawToken.mType = TOKEN_RAW;
  return rawToken;
}
// static
// Makes a TOKEN_WORD token whose text is a view of `aValue`.  The characters
// are not copied, so the token refers to the buffer behind `aValue`.
template <typename TChar>
auto TokenizerBase<TChar>::Token::Word(TAString const& aValue) -> Token {
  Token wordToken;
  wordToken.mType = TOKEN_WORD;
  wordToken.mWord.Rebind(aValue.BeginReading(), aValue.Length());
  return wordToken;
}
// static
// Makes a TOKEN_CHAR token carrying the single character `aValue`.
template <typename TChar>
auto TokenizerBase<TChar>::Token::Char(TChar const aValue) -> Token {
  Token charToken;
  charToken.mType = TOKEN_CHAR;
  charToken.mChar = aValue;
  return charToken;
}
// static
// Makes a TOKEN_INTEGER token carrying the unsigned value `aValue`.
template <typename TChar>
auto TokenizerBase<TChar>::Token::Number(uint64_t const aValue) -> Token {
  Token numberToken;
  numberToken.mType = TOKEN_INTEGER;
  numberToken.mInteger = aValue;
  return numberToken;
}
// static
// Makes a TOKEN_WS token.  Its character value is zero: the token does not
// record which whitespace character was seen.
template <typename TChar>
auto TokenizerBase<TChar>::Token::Whitespace() -> Token {
  Token wsToken;
  wsToken.mType = TOKEN_WS;
  wsToken.mChar = '\0';
  return wsToken;
}
// static
// Makes a TOKEN_EOL token, which Parse() returns for CR, LF and CR LF alike.
template <typename TChar>
auto TokenizerBase<TChar>::Token::NewLine() -> Token {
  Token eolToken;
  eolToken.mType = TOKEN_EOL;
  return eolToken;
}
// static
// Makes a TOKEN_EOF token, which marks the end of the input.
template <typename TChar>
auto TokenizerBase<TChar>::Token::EndOfFile() -> Token {
  Token eofToken;
  eofToken.mType = TOKEN_EOF;
  return eofToken;
}
// static
// Makes a TOKEN_ERROR token, which Parse() returns for an integer literal
// that overflows 64 bits.
template <typename TChar>
auto TokenizerBase<TChar>::Token::Error() -> Token {
  Token errorToken;
  errorToken.mType = TOKEN_ERROR;
  return errorToken;
}
// Compares two tokens.
//
// Tokens of different types are never equal.  Integer, word and char tokens
// are additionally compared by value; for every other type (including custom
// tokens) having the same type is sufficient.
template <typename TChar>
bool TokenizerBase<TChar>::Token::Equals(const Token& aOther) const {
  if (mType != aOther.mType) {
    return false;
  }

  if (mType == TOKEN_INTEGER) {
    return AsInteger() == aOther.AsInteger();
  }
  if (mType == TOKEN_WORD) {
    return AsString() == aOther.AsString();
  }
  if (mType == TOKEN_CHAR) {
    return AsChar() == aOther.AsChar();
  }

  // Value-less token kinds: equal types mean equal tokens.
  return true;
}
// Returns the character value of the token.  Only meaningful for char and
// whitespace tokens (debug-asserted); whitespace tokens built by
// Whitespace() carry a zero character.
template <typename TChar>
TChar TokenizerBase<TChar>::Token::AsChar() const {
MOZ_ASSERT(mType == TOKEN_CHAR || mType == TOKEN_WS);
return mChar;
}
// Returns the word text of the token as a dependent substring, i.e. a view
// that does not own the characters.  Only meaningful for word tokens
// (debug-asserted).
template <typename TChar>
auto TokenizerBase<TChar>::Token::AsString() const -> TDependentSubstring {
MOZ_ASSERT(mType == TOKEN_WORD);
return mWord;
}
// Returns the numeric value of the token.  Only meaningful for integer
// tokens (debug-asserted).
template <typename TChar>
uint64_t TokenizerBase<TChar>::Token::AsInteger() const {
MOZ_ASSERT(mType == TOKEN_INTEGER);
return mInteger;
}
// Explicit instantiations.  The member templates are defined in this
// translation unit, so the two supported character widths (8-bit `char` and
// 16-bit `char16_t`) are instantiated here.
template class TokenizerBase<char>;
template class TokenizerBase<char16_t>;
template class TTokenizer<char>;
template class TTokenizer<char16_t>;
} // namespace mozilla