gecko-dev/dom/webidl/CSSLexer.webidl
Boris Zbarsky 63c6b08058 Bug 790997. Align our tokenization of CSS bad-url-token with the CSS Syntax Level 3 CR. r=heycam,tromey
The main change is that once we discover we have a bad-url-token we consume
everything up to, but not including, the next ')' character.  While we do this
we can cross line boundaries and don't bother about matching braces or quotes.
We just keep going until we find the ')' or hit EOF.
2016-07-13 22:54:20 -04:00

171 lines
5.9 KiB
Plaintext

/* -*- Mode: IDL; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// The possible values for CSSToken.tokenType. Each value names one kind
// of token the lexer can return. Where a comment below mentions |text|,
// it refers to the |text| member of the CSSToken dictionary.
enum CSSTokenType {
// Whitespace.
"whitespace",
// A CSS comment. The whole comment is lexed as a single unit.
"comment",
// An identifier. |text| holds the identifier text.
"ident",
// A function token. |text| holds the function name. Note that the
// function token includes (i.e., consumes) the "(" -- but this is
// not included in |text|.
"function",
// "@word". |text| holds "word", without the "@".
"at",
// "#word", in the case where "word" would have been a valid "ident"
// token without the "#". |text| holds "word", without the "#".
"id",
// "#word", in the case where "word" would NOT have been a valid
// "ident" token without the "#" (otherwise "id" is used instead).
// |text| holds "word", without the "#".
"hash",
// A number.
"number",
// A dimensioned number.
"dimension",
// A percentage.
"percentage",
// A string.
"string",
// A "bad string". This can only be returned when a string is
// unterminated at EOF. (However, currently the lexer returns
// ordinary "string" tokens in this situation.)
"bad_string",
// A URL. |text| holds the URL.
"url",
// A "bad URL". This is a URL that either contains a bad_string or contains
// garbage after the string or unquoted URL text. |text| holds the URL and
// potentially whatever garbage came after it, up to but not including the
// following ')'.
"bad_url",
// A "symbol" is any one-character symbol. This corresponds to the
// DELIM token in the CSS specification.
"symbol",
// The "~=" token.
"includes",
// The "|=" token.
"dashmatch",
// The "^=" token.
"beginsmatch",
// The "$=" token.
"endsmatch",
// The "*=" token.
"containsmatch",
// A unicode-range token. This is currently not fully represented
// by CSSToken.
"urange",
// HTML comment delimiters, either "<!--" or "-->". Note that each
// is emitted as a separate token, and the intervening text is lexed
// as normal; whereas ordinary CSS comments are lexed as a unit.
"htmlcomment"
};
// A single token produced by the lexer. |tokenType|, |startOffset| and
// |endOffset| have declared defaults; the remaining members are only
// present for the token types noted on each member.
dictionary CSSToken {
// The token type. See CSSTokenType for the possible values.
CSSTokenType tokenType = "whitespace";
// Offset of the first character of the token.
unsigned long startOffset = 0;
// Offset of the character after the final character of the token
// (i.e. an exclusive end). This is chosen so that the two offsets can
// be passed to |substring| to yield the exact contents of the token.
unsigned long endOffset = 0;
// If the token is a number, percentage, or dimension, this holds
// the value. This is not present for other token types.
double number;
// If the token is a number, percentage, or dimension, this is true
// iff the number had an explicit sign. This is not present for
// other token types.
boolean hasSign;
// If the token is a number, percentage, or dimension, this is true
// iff the number was specified as an integer. This is not present
// for other token types.
boolean isInteger;
// Text associated with the token. This is not present for all
// token types. In particular it is:
//
//   Token type    Meaning
//   ==========================================================
//   ident         The identifier.
//   function      The function name. Note that the "(" is part
//                 of the token but is not present in |text|.
//   at            The word (without the leading "@").
//   id            The word (without the leading "#").
//   hash          The word (without the leading "#").
//   dimension     The dimension.
//   string        The string contents after escape processing.
//   bad_string    The string contents after escape processing.
//   url           The URL after escape processing.
//   bad_url       The URL after escape processing, potentially
//                 followed by whatever garbage came after it, up
//                 to but not including the following ')'.
//   symbol        The symbol text.
DOMString text;
};
/**
 * CSSLexer is an interface to the CSS lexer. It tokenizes an
 * input stream and returns CSS tokens one at a time via nextToken().
 *
 * This interface is marked [ChromeOnly].
 *
 * @see inIDOMUtils.getCSSLexer to create an instance of the lexer.
 */
[ChromeOnly]
interface CSSLexer
{
/**
 * The line number of the most recently returned token. Line
 * numbers are 0-based.
 */
readonly attribute unsigned long lineNumber;
/**
 * The column number of the most recently returned token. Column
 * numbers are 0-based.
 */
readonly attribute unsigned long columnNumber;
/**
 * When EOF is reached, the last token might be unterminated in some
 * ways. This method takes an input string and returns it with the
 * needed terminators appended. In particular:
 *
 * 1. If EOF occurs mid-string, this will append the correct quote.
 * 2. If EOF occurs in a url token, this will append the close paren.
 * 3. If EOF occurs in a comment this will append the comment closer.
 *
 * A trailing backslash might also have been present in the input
 * string. This is handled in different ways, depending on the
 * context and on the preserveBackslash argument:
 *
 * - If preserveBackslash is true, then the existing backslash at the
 *   end of inputString is preserved, and a new backslash is appended.
 *   That is, the input |\| is transformed to |\\|, and the
 *   input |'\| is transformed to |'\\'|.
 *
 * - If preserveBackslash is false:
 *   - If the backslash appears in a string context, then the trailing
 *     backslash is dropped from inputString. That is, |"\| is
 *     transformed to |""|.
 *   - If the backslash appears outside of a string context, then
 *     U+FFFD is appended. That is, |\| is transformed to a string
 *     with two characters: backslash followed by U+FFFD.
 *
 * Passing false for preserveBackslash makes the result conform to
 * the CSS Syntax specification. However, passing true may give
 * somewhat more intuitive behavior.
 *
 * @param inputString the input string
 * @param preserveBackslash how to handle trailing backslashes
 *        (see the two cases described above)
 * @return the input string with the termination characters appended
 */
DOMString performEOFFixup(DOMString inputString, boolean preserveBackslash);
/**
 * Return the next token as a CSSToken dictionary, or null at EOF.
 */
CSSToken? nextToken();
};