mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-12-13 18:27:35 +00:00
63c6b08058
The main change is that once we discover we have a bad-url-token we consume everything up to, but not including, the next ')' character. While we do this we can cross line boundaries and don't bother about matching braces or quotes. We just keep going until we find the ')' or hit EOF.
411 lines
15 KiB
C++
411 lines
15 KiB
C++
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/* tokenization of CSS style sheets */
|
|
|
|
#ifndef nsCSSScanner_h___
|
|
#define nsCSSScanner_h___
|
|
|
|
#include "nsString.h"
|
|
|
|
// Forward declaration only: this header refers to the error reporter
// solely through pointers, so its full definition is not needed here.
namespace mozilla {
namespace css {
class ErrorReporter;
} // namespace css
} // namespace mozilla
|
|
|
|
// Token types; in close but not perfect correspondence to the token
// categorization in section 4.1.1 of CSS2.1.  (The deviations are all
// the fault of css3-selectors, which has requirements that can only be
// met by changing the generic tokenization.)  The comment on each line
// illustrates the form of each identifier.

enum nsCSSTokenType {
  // White space of any kind.  No value fields are used.  Note that
  // comments do *not* count as white space; they have their own token
  // type (eCSSToken_Comment, below).
  eCSSToken_Whitespace,     //
  // A comment.
  eCSSToken_Comment,        // /*...*/

  // Identifier-like tokens.  mIdent is the text of the identifier.
  // The difference between ID and Hash is: if the text after the #
  // would have been a valid Ident if the # hadn't been there, the
  // scanner produces an ID token.  Otherwise it produces a Hash token.
  // (This distinction is required by css3-selectors.)
  eCSSToken_Ident,          // word
  eCSSToken_Function,       // word(
  eCSSToken_AtKeyword,      // @word
  eCSSToken_ID,             // #word
  eCSSToken_Hash,           // #0word

  // Numeric tokens.  mNumber is the floating-point value of the
  // number, and mHasSign indicates whether there was an explicit sign
  // (+ or -) in front of the number.  If mIntegerValid is true, the
  // number had the lexical form of an integer, and mInteger is its
  // integer value.  Lexically integer values outside the range of a
  // 32-bit signed number are clamped to the maximum values; mNumber
  // will indicate a 'truer' value in that case.  Percentage tokens
  // are always considered not to be integers, even if their numeric
  // value is integral (100% => mNumber = 1.0).  For Dimension
  // tokens, mIdent holds the text of the unit.
  eCSSToken_Number,         // 1 -5 +2e3 3.14159 7.297352e-3
  eCSSToken_Dimension,      // 24px 8.5in
  eCSSToken_Percentage,     // 85% 1280.4%

  // String-like tokens.  In all cases, mIdent holds the text
  // belonging to the string, and mSymbol holds the delimiter
  // character, which may be ', ", or zero (only for unquoted URLs).
  // Bad_String and Bad_URL tokens are emitted when the closing
  // delimiter was missing.  Bad_URL is also emitted if there was trailing
  // garbage after the string or unquoted url value.
  eCSSToken_String,         // 'foo bar' "foo bar"
  eCSSToken_Bad_String,     // 'foo bar
  eCSSToken_URL,            // url(foobar) url("foo bar")
  // For Bad_URL tokens, we need to keep track of the following state:
  // (1) Was there a quoted string?  If so, was it a String or Bad_String?
  // (2) Was there trailing garbage, and if so what was it?
  // We keep track of whether there was a quoted string by setting mSymbol as
  // described above.  If that's nonzero, then mInteger2 indicates whether we
  // have a String or Bad_String by taking on the values 0 and 1 respectively.
  // mInteger indicates the start of trailing garbage in mIdent (and is set to
  // mIdent.Length() when there is no trailing garbage).
  eCSSToken_Bad_URL,        // url(foo') url('foo'a) url('foo

  // Any one-character symbol.  mSymbol holds the character.
  eCSSToken_Symbol,         // . ; { } ! *

  // Match operators.  These are single tokens rather than pairs of
  // Symbol tokens because css3-selectors forbids the presence of
  // comments between the two characters.  No value fields are used;
  // the token type indicates which operator.
  eCSSToken_Includes,       // ~=
  eCSSToken_Dashmatch,      // |=
  eCSSToken_Beginsmatch,    // ^=
  eCSSToken_Endsmatch,      // $=
  eCSSToken_Containsmatch,  // *=

  // Unicode-range token: currently used only in @font-face.
  // The lexical rule for this token includes several forms that are
  // semantically invalid.  Therefore, mIdent always holds the
  // complete original text of the token (so we can print it
  // accurately in diagnostics), and mIntegerValid is true iff the
  // token is semantically valid.  In that case, mInteger holds the
  // lowest value included in the range, and mInteger2 holds the
  // highest value included in the range.
  eCSSToken_URange,         // U+007e U+01?? U+2000-206F

  // HTML comment delimiters, ignored as a unit when they appear at
  // the top level of a style sheet, for compatibility with websites
  // written for compatibility with pre-CSS browsers.  This token type
  // subsumes the css2.1 CDO and CDC tokens, which are always treated
  // the same by the parser.  mIdent holds the text of the token, for
  // diagnostics.
  eCSSToken_HTMLComment,    // <!-- -->
};
|
|
|
|
// Classification of tokens used to determine if a "/**/" string must be
// inserted if pasting token streams together when serializing.  We include
// values corresponding to eCSSToken_Dashmatch and eCSSToken_Containsmatch,
// as css-syntax does not treat these as whole tokens, but we will still
// need to insert a "/**/" string between a '|' delim and a '|=' dashmatch
// and between a '/' delim and a '*=' containsmatch.
//
// https://drafts.csswg.org/css-syntax/#serialization
enum nsCSSTokenSerializationType {
  eCSSTokenSerialization_Nothing,
  eCSSTokenSerialization_Whitespace,
  eCSSTokenSerialization_AtKeyword_or_Hash,
  eCSSTokenSerialization_Number,
  eCSSTokenSerialization_Dimension,
  eCSSTokenSerialization_Percentage,
  eCSSTokenSerialization_URange,
  eCSSTokenSerialization_URL_or_BadURL,
  eCSSTokenSerialization_Function,
  eCSSTokenSerialization_Ident,
  eCSSTokenSerialization_CDC,
  eCSSTokenSerialization_DashMatch,
  eCSSTokenSerialization_ContainsMatch,
  eCSSTokenSerialization_Symbol_Hash,         // '#'
  eCSSTokenSerialization_Symbol_At,           // '@'
  eCSSTokenSerialization_Symbol_Dot_or_Plus,  // '.', '+'
  eCSSTokenSerialization_Symbol_Minus,        // '-'
  eCSSTokenSerialization_Symbol_OpenParen,    // '('
  eCSSTokenSerialization_Symbol_Question,     // '?'
  eCSSTokenSerialization_Symbol_Assorted,     // '$', '^', '~'
  eCSSTokenSerialization_Symbol_Equals,       // '='
  eCSSTokenSerialization_Symbol_Bar,          // '|'
  eCSSTokenSerialization_Symbol_Slash,        // '/'
  eCSSTokenSerialization_Symbol_Asterisk,     // '*'
  eCSSTokenSerialization_Other                // anything else
};
|
|
|
|
// A single token returned from the scanner. mType is always
|
|
// meaningful; comments above describe which other fields are
|
|
// meaningful for which token types.
|
|
struct nsCSSToken {
|
|
nsAutoString mIdent;
|
|
float mNumber;
|
|
int32_t mInteger;
|
|
int32_t mInteger2;
|
|
nsCSSTokenType mType;
|
|
char16_t mSymbol;
|
|
bool mIntegerValid;
|
|
bool mHasSign;
|
|
|
|
nsCSSToken()
|
|
: mNumber(0), mInteger(0), mInteger2(0), mType(eCSSToken_Whitespace),
|
|
mSymbol('\0'), mIntegerValid(false), mHasSign(false)
|
|
{}
|
|
|
|
bool IsSymbol(char16_t aSymbol) const {
|
|
return mType == eCSSToken_Symbol && mSymbol == aSymbol;
|
|
}
|
|
|
|
void AppendToString(nsString& aBuffer) const;
|
|
};
|
|
|
|
// Represents an nsCSSScanner's saved position in the input buffer.
|
|
class nsCSSScannerPosition {
|
|
friend class nsCSSScanner;
|
|
public:
|
|
nsCSSScannerPosition() : mInitialized(false) { }
|
|
|
|
uint32_t LineNumber() {
|
|
MOZ_ASSERT(mInitialized);
|
|
return mLineNumber;
|
|
}
|
|
|
|
uint32_t LineOffset() {
|
|
MOZ_ASSERT(mInitialized);
|
|
return mLineOffset;
|
|
}
|
|
|
|
private:
|
|
uint32_t mOffset;
|
|
uint32_t mLineNumber;
|
|
uint32_t mLineOffset;
|
|
uint32_t mTokenLineNumber;
|
|
uint32_t mTokenLineOffset;
|
|
uint32_t mTokenOffset;
|
|
bool mInitialized;
|
|
};
|
|
|
|
// Selects which token kinds nsCSSScanner::Next filters out.
enum nsCSSScannerExclude {
  // Return all tokens, including whitespace and comments.
  eCSSScannerExclude_None,
  // Include whitespace but exclude comments.
  eCSSScannerExclude_Comments,
  // Exclude whitespace and comments.
  eCSSScannerExclude_WhitespaceAndComments
};
|
|
|
|
// nsCSSScanner tokenizes an input stream using the CSS2.1 forward
|
|
// compatible tokenization rules. Used internally by nsCSSParser;
|
|
// not available for use by other code.
|
|
class nsCSSScanner {
|
|
public:
|
|
// |aLineNumber == 1| is the beginning of a file, use |aLineNumber == 0|
|
|
// when the line number is unknown. The scanner does not take
|
|
// ownership of |aBuffer|, so the caller must be sure to keep it
|
|
// alive for the lifetime of the scanner.
|
|
nsCSSScanner(const nsAString& aBuffer, uint32_t aLineNumber);
|
|
~nsCSSScanner();
|
|
|
|
void SetErrorReporter(mozilla::css::ErrorReporter* aReporter) {
|
|
mReporter = aReporter;
|
|
}
|
|
// Set whether or not we are processing SVG
|
|
void SetSVGMode(bool aSVGMode) {
|
|
mSVGMode = aSVGMode;
|
|
}
|
|
bool IsSVGMode() const {
|
|
return mSVGMode;
|
|
}
|
|
|
|
// Reset or check whether a BAD_URL or BAD_STRING token has been seen.
|
|
void ClearSeenBadToken() { mSeenBadToken = false; }
|
|
bool SeenBadToken() const { return mSeenBadToken; }
|
|
|
|
// Reset or check whether a "var(" FUNCTION token has been seen.
|
|
void ClearSeenVariableReference() { mSeenVariableReference = false; }
|
|
bool SeenVariableReference() const { return mSeenVariableReference; }
|
|
|
|
// Get the 1-based line number of the last character of
|
|
// the most recently processed token.
|
|
uint32_t GetLineNumber() const { return mTokenLineNumber; }
|
|
|
|
// Get the 0-based column number of the first character of
|
|
// the most recently processed token.
|
|
uint32_t GetColumnNumber() const
|
|
{ return mTokenOffset - mTokenLineOffset; }
|
|
|
|
uint32_t GetTokenOffset() const
|
|
{ return mTokenOffset; }
|
|
|
|
uint32_t GetTokenEndOffset() const
|
|
{ return mOffset; }
|
|
|
|
// Get the text of the line containing the first character of
|
|
// the most recently processed token.
|
|
nsDependentSubstring GetCurrentLine() const;
|
|
|
|
// Get the next token. Return false on EOF. aTokenResult is filled
|
|
// in with the data for the token. aSkip controls whether
|
|
// whitespace and/or comment tokens are ever returned.
|
|
bool Next(nsCSSToken& aTokenResult, nsCSSScannerExclude aSkip);
|
|
|
|
// Get the body of an URL token (everything after the 'url(').
|
|
// This is exposed for use by nsCSSParser::ParseMozDocumentRule,
|
|
// which, for historical reasons, must make additional function
|
|
// tokens behave like url(). Please do not add new uses to the
|
|
// parser.
|
|
void NextURL(nsCSSToken& aTokenResult);
|
|
|
|
// Implement the "consume the remnants of a bad url" algorithm from CSS3
|
|
// Syntax, except we don't consume the ')'.
|
|
void ConsumeBadURLRemnants(nsCSSToken& aToken);
|
|
|
|
// This is exposed for use by nsCSSParser::ParsePseudoClassWithNthPairArg,
|
|
// because "2n-1" is a single DIMENSION token, and "n-1" is a single
|
|
// IDENT token, but the :nth() selector syntax wants to interpret
|
|
// them the same as "2n -1" and "n -1" respectively. Please do not
|
|
// add new uses to the parser.
|
|
//
|
|
// Note: this function may not be used to back up over a line boundary.
|
|
void Backup(uint32_t n);
|
|
|
|
// Starts recording the input stream from the current position.
|
|
void StartRecording();
|
|
|
|
// Abandons recording of the input stream.
|
|
void StopRecording();
|
|
|
|
// Stops recording of the input stream and appends the recorded
|
|
// input to aBuffer.
|
|
void StopRecording(nsString& aBuffer);
|
|
|
|
// Returns the length of the current recording.
|
|
uint32_t RecordingLength() const;
|
|
|
|
#ifdef DEBUG
|
|
bool IsRecording() const;
|
|
#endif
|
|
|
|
// Stores the current scanner offset into the specified object.
|
|
void SavePosition(nsCSSScannerPosition& aState);
|
|
|
|
// Resets the scanner offset to a position saved by SavePosition.
|
|
void RestoreSavedPosition(const nsCSSScannerPosition& aState);
|
|
|
|
enum EOFCharacters {
|
|
eEOFCharacters_None = 0x0000,
|
|
|
|
// to handle \<EOF> inside strings
|
|
eEOFCharacters_DropBackslash = 0x0001,
|
|
|
|
// to handle \<EOF> outside strings
|
|
eEOFCharacters_ReplacementChar = 0x0002,
|
|
|
|
// to close comments
|
|
eEOFCharacters_Asterisk = 0x0004,
|
|
eEOFCharacters_Slash = 0x0008,
|
|
|
|
// to close double-quoted strings
|
|
eEOFCharacters_DoubleQuote = 0x0010,
|
|
|
|
// to close single-quoted strings
|
|
eEOFCharacters_SingleQuote = 0x0020,
|
|
|
|
// to close URLs
|
|
eEOFCharacters_CloseParen = 0x0040,
|
|
};
|
|
|
|
// Appends any characters to the specified string the input stream to make the
|
|
// last token not rely on special EOF handling behavior.
|
|
//
|
|
// If eEOFCharacters_DropBackslash is in aEOFCharacters, it is ignored.
|
|
static void AppendImpliedEOFCharacters(EOFCharacters aEOFCharacters,
|
|
nsAString& aString);
|
|
|
|
EOFCharacters GetEOFCharacters() const {
|
|
#ifdef DEBUG
|
|
AssertEOFCharactersValid(mEOFCharacters);
|
|
#endif
|
|
return mEOFCharacters;
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
static void AssertEOFCharactersValid(uint32_t c);
|
|
#endif
|
|
|
|
protected:
|
|
int32_t Peek(uint32_t n = 0);
|
|
void Advance(uint32_t n = 1);
|
|
void AdvanceLine();
|
|
|
|
void SkipWhitespace();
|
|
void SkipComment();
|
|
|
|
bool GatherEscape(nsString& aOutput, bool aInString);
|
|
bool GatherText(uint8_t aClass, nsString& aIdent);
|
|
|
|
bool ScanIdent(nsCSSToken& aResult);
|
|
bool ScanAtKeyword(nsCSSToken& aResult);
|
|
bool ScanHash(nsCSSToken& aResult);
|
|
bool ScanNumber(nsCSSToken& aResult);
|
|
bool ScanString(nsCSSToken& aResult);
|
|
bool ScanURange(nsCSSToken& aResult);
|
|
|
|
void SetEOFCharacters(uint32_t aEOFCharacters);
|
|
void AddEOFCharacters(uint32_t aEOFCharacters);
|
|
|
|
const char16_t *mBuffer;
|
|
uint32_t mOffset;
|
|
uint32_t mCount;
|
|
|
|
uint32_t mLineNumber;
|
|
uint32_t mLineOffset;
|
|
|
|
uint32_t mTokenLineNumber;
|
|
uint32_t mTokenLineOffset;
|
|
uint32_t mTokenOffset;
|
|
|
|
uint32_t mRecordStartOffset;
|
|
EOFCharacters mEOFCharacters;
|
|
|
|
mozilla::css::ErrorReporter *mReporter;
|
|
|
|
// True if we are in SVG mode; false in "normal" CSS
|
|
bool mSVGMode;
|
|
bool mRecording;
|
|
bool mSeenBadToken;
|
|
bool mSeenVariableReference;
|
|
};
|
|
|
|
// Token for the grid-template-areas micro-syntax
|
|
// http://dev.w3.org/csswg/css-grid/#propdef-grid-template-areas
|
|
struct MOZ_STACK_CLASS nsCSSGridTemplateAreaToken {
|
|
nsAutoString mName; // Empty for a null cell, non-empty for a named cell
|
|
bool isTrash; // True for a trash token, mName is ignored in this case.
|
|
};
|
|
|
|
// Scanner for the grid-template-areas micro-syntax
|
|
class nsCSSGridTemplateAreaScanner {
|
|
public:
|
|
explicit nsCSSGridTemplateAreaScanner(const nsAString& aBuffer);
|
|
|
|
// Get the next token. Return false on EOF.
|
|
// aTokenResult is filled in with the data for the token.
|
|
bool Next(nsCSSGridTemplateAreaToken& aTokenResult);
|
|
|
|
private:
|
|
const char16_t *mBuffer;
|
|
uint32_t mOffset;
|
|
uint32_t mCount;
|
|
};
|
|
|
|
#endif /* nsCSSScanner_h___ */
|