gecko-dev/parser/html/javasrc/Tokenizer.java
2016-10-19 10:38:20 +03:00

7068 lines
309 KiB
Java

/*
* Copyright (c) 2005-2007 Henri Sivonen
* Copyright (c) 2007-2015 Mozilla Foundation
* Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
* Foundation, and Opera Software ASA.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
* The comments following this one that use the same comment syntax as this
* comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
* amended as of June 18 2008 and May 31 2010.
* That document came with this statement:
* "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
* Opera Software ASA. You are granted a license to use, reproduce and
* create derivative works of this document."
*/
package nu.validator.htmlparser.impl;
import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import nu.validator.htmlparser.annotation.Auto;
import nu.validator.htmlparser.annotation.CharacterName;
import nu.validator.htmlparser.annotation.Const;
import nu.validator.htmlparser.annotation.Inline;
import nu.validator.htmlparser.annotation.Local;
import nu.validator.htmlparser.annotation.NoLength;
import nu.validator.htmlparser.common.EncodingDeclarationHandler;
import nu.validator.htmlparser.common.Interner;
import nu.validator.htmlparser.common.TokenHandler;
import nu.validator.htmlparser.common.XmlViolationPolicy;
/**
* An implementation of
* https://html.spec.whatwg.org/multipage/syntax.html#tokenization
*
* This class implements the <code>Locator</code> interface. This is not an
* incidental implementation detail: Users of this class are encouraged to make
* use of the <code>Locator</code> nature.
*
* By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
* can be configured to treat these conditions as fatal or to coerce the infoset
* to something that XML 1.0 allows.
*
* @version $Id$
* @author hsivonen
*/
public class Tokenizer implements Locator {
// Mask that clears the lowest bit of a state value. DATA (0) and RCDATA (1)
// are the only states below for which (state & DATA_AND_RCDATA_MASK) == 0,
// so one masked comparison distinguishes those two states from all others.
private static final int DATA_AND_RCDATA_MASK = ~1;
// Tokenizer state identifiers (values 0 through 74). These are the values
// held in stateSave and returnStateSave between tokenizeBuffer() calls.
public static final int DATA = 0;
public static final int RCDATA = 1;
public static final int SCRIPT_DATA = 2;
public static final int RAWTEXT = 3;
public static final int SCRIPT_DATA_ESCAPED = 4;
public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
public static final int PLAINTEXT = 8;
public static final int TAG_OPEN = 9;
public static final int CLOSE_TAG_OPEN = 10;
public static final int TAG_NAME = 11;
public static final int BEFORE_ATTRIBUTE_NAME = 12;
public static final int ATTRIBUTE_NAME = 13;
public static final int AFTER_ATTRIBUTE_NAME = 14;
public static final int BEFORE_ATTRIBUTE_VALUE = 15;
public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
public static final int BOGUS_COMMENT = 17;
public static final int MARKUP_DECLARATION_OPEN = 18;
public static final int DOCTYPE = 19;
public static final int BEFORE_DOCTYPE_NAME = 20;
public static final int DOCTYPE_NAME = 21;
public static final int AFTER_DOCTYPE_NAME = 22;
public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
public static final int BOGUS_DOCTYPE = 31;
public static final int COMMENT_START = 32;
public static final int COMMENT_START_DASH = 33;
public static final int COMMENT = 34;
public static final int COMMENT_END_DASH = 35;
public static final int COMMENT_END = 36;
public static final int COMMENT_END_BANG = 37;
public static final int NON_DATA_END_TAG_NAME = 38;
public static final int MARKUP_DECLARATION_HYPHEN = 39;
public static final int MARKUP_DECLARATION_OCTYPE = 40;
public static final int DOCTYPE_UBLIC = 41;
public static final int DOCTYPE_YSTEM = 42;
public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
public static final int CONSUME_CHARACTER_REFERENCE = 46;
public static final int CONSUME_NCR = 47;
public static final int CHARACTER_REFERENCE_TAIL = 48;
public static final int HEX_NCR_LOOP = 49;
// NOTE(review): "NRC" looks like a transposition of "NCR" (compare
// HEX_NCR_LOOP), but the name is public and is kept for compatibility.
public static final int DECIMAL_NRC_LOOP = 50;
public static final int HANDLE_NCR_VALUE = 51;
public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
public static final int SELF_CLOSING_START_TAG = 54;
public static final int CDATA_START = 55;
public static final int CDATA_SECTION = 56;
public static final int CDATA_RSQB = 57;
public static final int CDATA_RSQB_RSQB = 58;
public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
public static final int SCRIPT_DATA_ESCAPE_START = 60;
public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
public static final int BOGUS_COMMENT_HYPHEN = 64;
public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
public static final int PROCESSING_INSTRUCTION = 73;
public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
/**
* Magic value for UTF-16 operations.
*/
private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
/**
* UTF-16 code unit array containing less than and greater than for emitting
* those characters on certain parse errors.
*/
private static final @NoLength char[] LT_GT = { '<', '>' };
/**
* UTF-16 code unit array containing less than and solidus for emitting
* those characters on certain parse errors.
*/
private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
/**
* UTF-16 code unit array containing ]] for emitting those characters on
* state transitions.
*/
private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
/**
* Array version of U+FFFD.
*/
private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
// [NOCPP[
/**
* Array version of space.
*/
private static final @NoLength char[] SPACE = { ' ' };
// ]NOCPP]
/**
* Array version of line feed.
*/
private static final @NoLength char[] LF = { '\n' };
/**
* "CDATA[" as <code>char[]</code>
*/
private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
'A', '[' };
/**
* "octype" as <code>char[]</code>
*/
private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
'e' };
/**
* "ublic" as <code>char[]</code>
*/
private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };
/**
* "ystem" as <code>char[]</code>
*/
private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };
private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
'e', 'x', 't' };
private static final char[] XMP_ARR = { 'x', 'm', 'p' };
private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
'e', 'a' };
private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
'd' };
private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
'p', 't' };
private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
'e', 's' };
/**
* The token handler.
*/
protected final TokenHandler tokenHandler;
protected EncodingDeclarationHandler encodingDeclarationHandler;
// [NOCPP[
/**
* The error handler.
*/
protected ErrorHandler errorHandler;
// ]NOCPP]
/**
* Whether the previous char read was CR.
*/
protected boolean lastCR;
protected int stateSave;
private int returnStateSave;
protected int index;
private boolean forceQuirks;
private char additional;
private int entCol;
private int firstCharKey;
private int lo;
private int hi;
private int candidate;
private int charRefBufMark;
protected int value;
private boolean seenDigits;
protected int cstart;
/**
* The SAX public id for the resource being tokenized. (Only passed back
* as part of locator data.)
*/
private String publicId;
/**
* The SAX system id for the resource being tokenized. (Only passed back
* as part of locator data.)
*/
private String systemId;
/**
* Buffer for bufferable things other than those that fit the description
* of <code>charRefBuf</code>.
*/
private @Auto char[] strBuf;
/**
* Number of significant <code>char</code>s in <code>strBuf</code>.
*/
private int strBufLen;
/**
* Buffer for characters that might form a character reference but may
* end up not forming one.
*/
private final @Auto char[] charRefBuf;
/**
* Number of significant <code>char</code>s in <code>charRefBuf</code>.
*/
private int charRefBufLen;
/**
* Buffer for expanding NCRs falling into the Basic Multilingual Plane.
*/
private final @Auto char[] bmpChar;
/**
* Buffer for expanding astral NCRs.
*/
private final @Auto char[] astralChar;
/**
* The element whose end tag closes the current CDATA or RCDATA element.
*/
protected ElementName endTagExpectation = null;
private char[] endTagExpectationAsArray; // not @Auto!
/**
* <code>true</code> if tokenizing an end tag
*/
protected boolean endTag;
/**
* The current tag token name.
*/
private ElementName tagName = null;
/**
* The current attribute name.
*/
protected AttributeName attributeName = null;
// [NOCPP[
/**
* Whether comment tokens are emitted.
*/
private boolean wantsComments = false;
/**
* <code>true</code> when HTML4-specific additional errors are requested.
*/
protected boolean html4;
/**
* Whether the stream is past the first 1024 bytes.
*/
private boolean metaBoundaryPassed;
// ]NOCPP]
/**
* The name of the current doctype token.
*/
private @Local String doctypeName;
/**
* The public id of the current doctype token.
*/
private String publicIdentifier;
/**
* The system id of the current doctype token.
*/
private String systemIdentifier;
/**
* The attribute holder.
*/
private HtmlAttributes attributes;
// [NOCPP[
/**
* The policy for vertical tab and form feed.
*/
private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
/**
* The policy for comments.
*/
private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
private boolean html4ModeCompatibleWithXhtml1Schemata;
private int mappingLangToXmlLang;
// ]NOCPP]
private final boolean newAttributesEachTime;
private boolean shouldSuspend;
protected boolean confident;
private int line;
/*
* The line number of the current attribute. First set to the line of the
* attribute name and if there is a value, set to the line the value
* started on.
*/
// CPPONLY: private int attributeLine;
private Interner interner;
// CPPONLY: private boolean viewingXmlSource;
// [NOCPP[
protected LocatorImpl ampersandLocation;
/**
 * Creates a tokenizer with an explicit attribute-holder allocation policy.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 * @param newAttributesEachTime
 *            when <code>true</code>, the attribute holder is dropped after
 *            each tag token instead of being cleared for reuse, and
 *            <code>emptyAttributes()</code> hands out a fresh holder
 */
public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
    // Caller-supplied collaborators and configuration.
    this.newAttributesEachTime = newAttributesEachTime;
    this.tokenHandler = tokenHandler;
    this.encodingDeclarationHandler = null;

    // No token is in progress yet.
    this.attributes = null;
    this.systemIdentifier = null;
    this.publicIdentifier = null;
    this.doctypeName = null;
    this.attributeName = null;
    this.tagName = null;

    // Fixed-size scratch buffers for expanded numeric character references.
    this.astralChar = new char[2];
    this.bmpChar = new char[1];
    // &CounterClockwiseContourIntegral; is the longest valid char ref and
    // the semicolon never gets appended to the buffer.
    this.charRefBuf = new char[32];
}
// ]NOCPP]
/**
* Creates a tokenizer that delivers tokens to the given handler.
*
* In the Java build this sets <code>newAttributesEachTime</code> to
* <code>false</code> and leaves the attribute holder unallocated. The
* <code>CPPONLY</code> lines give the C++ build an extra
* <code>viewingXmlSource</code> parameter and derive the attribute-holder
* policy from the token handler instead.
*
* @param tokenHandler
* the handler for receiving tokens
*/
public Tokenizer(TokenHandler tokenHandler
// CPPONLY: , boolean viewingXmlSource
) {
this.tokenHandler = tokenHandler;
this.encodingDeclarationHandler = null;
// [NOCPP[
this.newAttributesEachTime = false;
// ]NOCPP]
// &CounterClockwiseContourIntegral; is the longest valid char ref and
// the semicolon never gets appended to the buffer.
this.charRefBuf = new char[32];
this.bmpChar = new char[1];
this.astralChar = new char[2];
this.tagName = null;
this.attributeName = null;
this.doctypeName = null;
this.publicIdentifier = null;
this.systemIdentifier = null;
// [NOCPP[
this.attributes = null;
// ]NOCPP]
// CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
// CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
// CPPONLY: this.viewingXmlSource = viewingXmlSource;
}
/**
* Sets the interner that this tokenizer passes along when it builds element
* names, attribute names and doctype names from its buffer.
*
* @param interner
* the interner to use
*/
public void setInterner(Interner interner) {
this.interner = interner;
}
/**
 * Records the identifiers that this tokenizer reports through its
 * <code>Locator</code> methods <code>getPublicId()</code> and
 * <code>getSystemId()</code>.
 *
 * @param newPublicId
 *            the SAX public id of the resource being tokenized
 * @param newSystemId
 *            the SAX system id of the resource being tokenized
 */
public void initLocation(String newPublicId, String newSystemId) {
    this.publicId = newPublicId;
    this.systemId = newSystemId;
}
// CPPONLY: boolean isViewingXmlSource() {
// CPPONLY: return viewingXmlSource;
// CPPONLY: }
// [NOCPP[
/**
* Tells whether <code>lang</code>-to-<code>xml:lang</code> mapping is on.
* The setting is stored as an <code>AttributeName</code> mode constant; this
* method reports whether that constant is <code>HTML_LANG</code>.
*
* @return <code>true</code> if the stored mode is
* <code>AttributeName.HTML_LANG</code>
*/
public boolean isMappingLangToXmlLang() {
return mappingLangToXmlLang == AttributeName.HTML_LANG;
}
/**
 * Turns <code>lang</code>-to-<code>xml:lang</code> mapping on or off. The
 * choice is stored as an <code>AttributeName</code> mode constant, which is
 * what the attribute holder is later created and cleared with.
 *
 * @param mappingLangToXmlLang
 *            <code>true</code> to store <code>AttributeName.HTML_LANG</code>,
 *            <code>false</code> to store <code>AttributeName.HTML</code>
 */
public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
    if (mappingLangToXmlLang) {
        this.mappingLangToXmlLang = AttributeName.HTML_LANG;
    } else {
        this.mappingLangToXmlLang = AttributeName.HTML;
    }
}
/**
* Sets the error handler that receives this tokenizer's warnings, errors
* and fatal errors. With a <code>null</code> handler, <code>warn</code> and
* <code>err</code> do nothing, while <code>fatal</code> still throws.
*
* @param eh
* the error handler, or <code>null</code> for none
* @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
*/
public void setErrorHandler(ErrorHandler eh) {
this.errorHandler = eh;
}
/**
* Returns the error handler.
*
* @return the handler last passed to <code>setErrorHandler</code>, or
* <code>null</code> if none is set
*/
public ErrorHandler getErrorHandler() {
return this.errorHandler;
}
/**
* Sets the policy applied to comment content that cannot be mapped to XML
* 1.0 (consecutive hyphens or a trailing hyphen). In the comment helpers of
* this class, <code>ALTER_INFOSET</code> inserts a space and warns,
* <code>ALLOW</code> only warns, and <code>FATAL</code> raises a fatal
* error.
*
* @param commentPolicy
* the commentPolicy to set
*/
public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
this.commentPolicy = commentPolicy;
}
/**
 * Accepts only <code>XmlViolationPolicy.ALLOW</code>. This class stores
 * nothing for the setting; any other value is rejected with a message that
 * points to <code>ErrorReportingTokenizer</code>.
 *
 * @param contentNonXmlCharPolicy
 *            the contentNonXmlCharPolicy to set; must be <code>ALLOW</code>
 * @throws IllegalArgumentException
 *             if the policy is anything other than <code>ALLOW</code>
 */
public void setContentNonXmlCharPolicy(
        XmlViolationPolicy contentNonXmlCharPolicy) {
    if (contentNonXmlCharPolicy == XmlViolationPolicy.ALLOW) {
        // Nothing to record: ALLOW is the only behavior offered here.
        return;
    }
    throw new IllegalArgumentException(
            "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
}
/**
* Sets the contentSpacePolicy, i.e. the policy for vertical tab and form
* feed. This method only stores the value.
*
* @param contentSpacePolicy
* the contentSpacePolicy to set
*/
public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
this.contentSpacePolicy = contentSpacePolicy;
}
/**
 * Sets the policy that is handed to the attribute holder whenever an
 * attribute is added. <code>FATAL</code> is not accepted.
 *
 * @param xmlnsPolicy
 *            the xmlnsPolicy to set; anything except <code>FATAL</code>
 * @throws IllegalArgumentException
 *             if <code>xmlnsPolicy</code> is <code>FATAL</code>
 */
public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
    if (xmlnsPolicy != XmlViolationPolicy.FATAL) {
        this.xmlnsPolicy = xmlnsPolicy;
    } else {
        throw new IllegalArgumentException("Can't use FATAL here.");
    }
}
/**
* Sets the name policy. When an attribute name is completed, the tokenizer
* passes <code>namePolicy != XmlViolationPolicy.ALLOW</code> as a flag to
* the attribute-name lookup.
*
* @param namePolicy
* the namePolicy to set
*/
public void setNamePolicy(XmlViolationPolicy namePolicy) {
this.namePolicy = namePolicy;
}
/**
* Sets the html4ModeCompatibleWithXhtml1Schemata flag. The flag only has an
* effect while HTML4-specific errors are turned on: valueless boolean
* attributes then get their own local name as the value, and values of
* case-folded attributes on start tags are ASCII-lowercased.
*
* @param html4ModeCompatibleWithXhtml1Schemata
* the html4ModeCompatibleWithXhtml1Schemata to set
*/
public void setHtml4ModeCompatibleWithXhtml1Schemata(
boolean html4ModeCompatibleWithXhtml1Schemata) {
this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
}
// ]NOCPP]
// For the token handler to call
/**
 * Sets the tokenizer state and, unless that state is <code>DATA</code>,
 * the element name whose end tag is expected. This should only ever be
 * used to put the tokenizer into one of the states that have a special end
 * tag expectation.
 *
 * @param specialTokenizerState
 *            the tokenizer state to set
 * @param endTagExpectation
 *            the expected end tag for transitioning back to normal; not
 *            consulted when the state is <code>DATA</code>
 */
public void setStateAndEndTagExpectation(int specialTokenizerState,
        @Local String endTagExpectation) {
    this.stateSave = specialTokenizerState;
    if (specialTokenizerState != Tokenizer.DATA) {
        // Resolve the name to an ElementName, then cache its char[] form.
        @Auto char[] nameChars = Portability.newCharArrayFromLocal(endTagExpectation);
        this.endTagExpectation = ElementName.elementNameByBuffer(nameChars, 0,
                nameChars.length, interner);
        endTagExpectationToArray();
    }
}
/**
 * Sets the tokenizer state and the element whose end tag is expected. This
 * should only ever be used to put the tokenizer into one of the states
 * that have a special end tag expectation. Unlike the <code>String</code>
 * overload, this one always records the expectation, so the element must
 * belong to one of the groups that <code>endTagExpectationToArray()</code>
 * knows.
 *
 * @param specialTokenizerState
 *            the tokenizer state to set
 * @param endTagExpectation
 *            the expected end tag for transitioning back to normal
 */
public void setStateAndEndTagExpectation(int specialTokenizerState,
        ElementName endTagExpectation) {
    this.endTagExpectation = endTagExpectation;
    this.stateSave = specialTokenizerState;
    // Keep the char[] form in step with the element recorded above.
    endTagExpectationToArray();
}
/**
 * Points <code>endTagExpectationAsArray</code> at the shared lowercase name
 * array that matches the group of the current <code>endTagExpectation</code>.
 * An element from any other group trips an assertion and leaves the array
 * untouched.
 */
private void endTagExpectationToArray() {
    switch (endTagExpectation.getGroup()) {
        case TreeBuilder.TITLE:
            endTagExpectationAsArray = TITLE_ARR;
            break;
        case TreeBuilder.SCRIPT:
            endTagExpectationAsArray = SCRIPT_ARR;
            break;
        case TreeBuilder.STYLE:
            endTagExpectationAsArray = STYLE_ARR;
            break;
        case TreeBuilder.PLAINTEXT:
            endTagExpectationAsArray = PLAINTEXT_ARR;
            break;
        case TreeBuilder.XMP:
            endTagExpectationAsArray = XMP_ARR;
            break;
        case TreeBuilder.TEXTAREA:
            endTagExpectationAsArray = TEXTAREA_ARR;
            break;
        case TreeBuilder.IFRAME:
            endTagExpectationAsArray = IFRAME_ARR;
            break;
        case TreeBuilder.NOEMBED:
            endTagExpectationAsArray = NOEMBED_ARR;
            break;
        case TreeBuilder.NOSCRIPT:
            endTagExpectationAsArray = NOSCRIPT_ARR;
            break;
        case TreeBuilder.NOFRAMES:
            endTagExpectationAsArray = NOFRAMES_ARR;
            break;
        default:
            assert false: "Bad end tag expectation.";
            break;
    }
}
/**
* For C++ use only. Overwrites the current line number reported by
* <code>getLineNumber()</code>; the <code>CPPONLY</code> line also resets
* the attribute line in the C++ build.
*
* @param line
* the line number to set
*/
public void setLineNumber(int line) {
// CPPONLY: this.attributeLine = line; // XXX is this needed?
this.line = line;
}
// start Locator impl
/**
* Returns the current value of the tokenizer's line counter.
*
* @return the current line number
* @see org.xml.sax.Locator#getLineNumber()
*/
@Inline public int getLineNumber() {
return line;
}
// [NOCPP[
/**
* This class does not track columns, so the result is always
* <code>-1</code>.
*
* @return <code>-1</code>
* @see org.xml.sax.Locator#getColumnNumber()
*/
@Inline public int getColumnNumber() {
return -1;
}
/**
* Returns the public id given to <code>initLocation</code>.
*
* @return the public id, or <code>null</code> if none has been set
* @see org.xml.sax.Locator#getPublicId()
*/
public String getPublicId() {
return publicId;
}
/**
* Returns the system id given to <code>initLocation</code>.
*
* @return the system id, or <code>null</code> if none has been set
* @see org.xml.sax.Locator#getSystemId()
*/
public String getSystemId() {
return systemId;
}
// end Locator impl
// end public API
/**
* Tells the tokenizer that the input is past the boundary tracked by
* <code>metaBoundaryPassed</code>. From then on, a <code>charset</code>
* attribute on a <code>meta</code> element is reported as an error.
*/
public void notifyAboutMetaBoundary() {
metaBoundaryPassed = true;
}
/**
* Turns on the HTML4-specific additional errors by setting the
* <code>html4</code> flag. This class has no way to turn them off again
* other than wherever <code>html4</code> is reset.
*/
void turnOnAdditionalHtml4Errors() {
html4 = true;
}
// ]NOCPP]
/**
* Returns an attribute holder with no attributes. In the Java build this is
* a newly allocated holder when <code>newAttributesEachTime</code> is set,
* and the shared <code>HtmlAttributes.EMPTY_ATTRIBUTES</code> otherwise; the
* C++ build (code outside the NOCPP markers) always returns the shared one.
*
* @return an empty attribute holder
*/
HtmlAttributes emptyAttributes() {
// [NOCPP[
if (newAttributesEachTime) {
return new HtmlAttributes(mappingLangToXmlLang);
} else {
// ]NOCPP]
return HtmlAttributes.EMPTY_ATTRIBUTES;
// [NOCPP[
}
// ]NOCPP]
}
/**
* Appends one code unit to <code>charRefBuf</code>. The Java build does no
* bounds check of its own; the <code>CPPONLY</code> lines add an assertion
* for the C++ build.
*
* @param c
* the UTF-16 code unit to append
*/
@Inline private void appendCharRefBuf(char c) {
// CPPONLY: assert charRefBufLen < charRefBuf.length:
// CPPONLY: "RELEASE: Attempted to overrun charRefBuf!";
charRefBuf[charRefBufLen++] = c;
}
/**
 * Disposes of the contents of <code>charRefBuf</code> according to the
 * state the tokenizer will return to: for <code>DATA</code> and
 * <code>RCDATA</code> the contents are emitted as character tokens, for any
 * other state they are appended to <code>strBuf</code>. Either way
 * <code>charRefBuf</code> ends up empty.
 *
 * @param returnState
 *            the state to return to after the character reference
 * @throws SAXException
 *             if the token handler threw
 */
private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
    if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
        // DATA or RCDATA: the buffered text is ordinary character data.
        if (charRefBufLen > 0) {
            tokenHandler.characters(charRefBuf, 0, charRefBufLen);
            charRefBufLen = 0;
        }
        return;
    }
    appendCharRefBufToStrBuf();
}
/**
* Marks <code>strBuf</code> as empty once its contents have been consumed.
* Only the length is reset; the array itself is left as it is.
*/
@Inline private void clearStrBufAfterUse() {
strBufLen = 0;
}
/**
* Prepares <code>strBuf</code> for a new use. The buffer is expected to be
* empty already; with assertions enabled a leftover length is reported,
* and the length is reset in any case.
*/
@Inline private void clearStrBufBeforeUse() {
assert strBufLen == 0: "strBufLen not reset after previous use!";
strBufLen = 0; // no-op in the absence of bugs
}
/**
* Empties <code>strBuf</code> in the case where it is expected to hold
* exactly one hyphen. Both expectations are checked by assertions only.
*/
@Inline private void clearStrBufAfterOneHyphen() {
assert strBufLen == 1: "strBufLen length not one!";
assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
strBufLen = 0;
}
/**
* Appends one code unit to <code>strBuf</code>. The Java build writes
* without growing the buffer, so the buffer must already be large enough;
* the <code>CPPONLY</code> lines add a check and a growth attempt for the
* C++ build.
*
* @param c
* the UTF-16 code unit to append
*/
@Inline private void appendStrBuf(char c) {
// CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient.";
// CPPONLY: if (strBufLen == strBuf.length) {
// CPPONLY: if (!EnsureBufferSpace(1)) {
// CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure";
// CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not
// CPPONLY: }
strBuf[strBufLen++] = c;
}
/**
* Returns the contents of <code>strBuf</code> as a string and empties the
* buffer. Used for attribute values as well as for error reporting.
*
* <p>
* C++ memory note: The return value must be released.
*
* @return the buffer as a string
*/
protected String strBufToString() {
String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
// CPPONLY: , tokenHandler
);
clearStrBufAfterUse();
return str;
}
/**
 * Stores the contents of <code>strBuf</code> in <code>doctypeName</code>
 * as a local name and empties the buffer. Nothing is returned; the name
 * is released in emitDoctypeToken().
 */
private void strBufToDoctypeName() {
    doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
            interner);
    // The buffer has been consumed.
    strBufLen = 0;
}
/**
 * Sends the contents of <code>strBuf</code> to the token handler as
 * character tokens and empties the buffer. Does nothing when the buffer is
 * already empty.
 *
 * @throws SAXException
 *             if the token handler threw
 */
private void emitStrBuf() throws SAXException {
    if (strBufLen <= 0) {
        return;
    }
    tokenHandler.characters(strBuf, 0, strBufLen);
    clearStrBufAfterUse();
}
/**
* Appends the second of two consecutive hyphens to a bogus comment in
* <code>strBuf</code>, applying <code>commentPolicy</code> in the Java
* build: <code>ALTER_INFOSET</code> puts a space before the hyphen and
* warns, <code>ALLOW</code> warns and appends the hyphen,
* <code>FATAL</code> raises a fatal error without appending. The C++ build
* (code outside the NOCPP markers) just appends the hyphen.
*
* @throws SAXException
* if the error handler threw or the policy is <code>FATAL</code>
*/
@Inline private void appendSecondHyphenToBogusComment() throws SAXException {
// [NOCPP[
switch (commentPolicy) {
case ALTER_INFOSET:
appendStrBuf(' ');
// FALLTHROUGH
case ALLOW:
warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
// ]NOCPP]
appendStrBuf('-');
// [NOCPP[
break;
case FATAL:
fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
break;
}
// ]NOCPP]
}
// [NOCPP[
/**
 * Applies <code>commentPolicy</code> to a bogus comment that ends in a
 * hyphen: <code>ALTER_INFOSET</code> appends a space to
 * <code>strBuf</code> and warns, <code>ALLOW</code> only warns, and
 * <code>FATAL</code> raises a fatal error.
 *
 * @throws SAXException
 *             if the error handler threw or the policy is <code>FATAL</code>
 */
private void maybeAppendSpaceToBogusComment() throws SAXException {
    switch (commentPolicy) {
        case FATAL:
            fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
            break;
        case ALLOW:
            warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
            break;
        case ALTER_INFOSET:
            // Separate the trailing hyphen from the comment terminator.
            appendStrBuf(' ');
            warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
            break;
    }
}
// ]NOCPP]
/**
* Reports consecutive hyphens in a comment and appends <code>c</code> to
* <code>strBuf</code>, applying <code>commentPolicy</code> in the Java
* build. With <code>ALTER_INFOSET</code> the last buffered code unit is
* dropped and replaced by a space followed by a hyphen before
* <code>c</code> is appended; with <code>ALLOW</code> only a warning is
* added; with <code>FATAL</code> a fatal error is raised and nothing is
* appended. The C++ build (code outside the NOCPP markers) reports the
* error and appends <code>c</code>.
*
* @param c
* the UTF-16 code unit to append
* @throws SAXException
* if the error handler threw or the policy is <code>FATAL</code>
*/
@Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
throws SAXException {
errConsecutiveHyphens();
// [NOCPP[
switch (commentPolicy) {
case ALTER_INFOSET:
strBufLen--;
// WARNING!!! This expands the worst case of the buffer length
// given the length of input!
appendStrBuf(' ');
appendStrBuf('-');
// FALLTHROUGH
case ALLOW:
warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
// ]NOCPP]
appendStrBuf(c);
// [NOCPP[
break;
case FATAL:
fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
break;
}
// ]NOCPP]
}
/**
* Appends a range of code units to <code>strBuf</code>. The Java build
* copies without growing the buffer, so the buffer must already have room
* for <code>length</code> more code units; the <code>CPPONLY</code> lines
* add a check and a growth attempt for the C++ build.
*
* @param buffer
* the array to copy from
* @param offset
* the index of the first code unit to copy
* @param length
* the number of code units to copy
*/
private void appendStrBuf(@NoLength char[] buffer, int offset, int length) {
int newLen = strBufLen + length;
// CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient.";
// CPPONLY: if (strBuf.length < newLen) {
// CPPONLY: if (!EnsureBufferSpace(length)) {
// CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure";
// CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not
// CPPONLY: }
System.arraycopy(buffer, offset, strBuf, strBufLen, length);
strBufLen = newLen;
}
/**
* Append the contents of the char reference buffer to the main one, then
* mark the char reference buffer as empty.
*/
@Inline private void appendCharRefBufToStrBuf() {
appendStrBuf(charRefBuf, 0, charRefBufLen);
charRefBufLen = 0;
}
/**
* Emits the current comment token from <code>strBuf</code>, empties the
* buffer and moves <code>cstart</code> to the position after
* <code>pos</code>. In the Java build the token is only passed to the
* token handler when <code>wantsComments</code> is set.
*
* @param provisionalHyphens
* the number of trailing code units in <code>strBuf</code> to
* leave out of the emitted comment
* @param pos
* the current position in the input buffer; the next run of
* character tokens starts at <code>pos + 1</code>
*
* @throws SAXException
* if the token handler threw
*/
private void emitComment(int provisionalHyphens, int pos)
throws SAXException {
// [NOCPP[
if (wantsComments) {
// ]NOCPP]
tokenHandler.comment(strBuf, 0, strBufLen
- provisionalHyphens);
// [NOCPP[
}
// ]NOCPP]
clearStrBufAfterUse();
cstart = pos + 1;
}
/**
 * Flushes coalesced character tokens: everything from <code>cstart</code>
 * up to, but not including, <code>pos</code> is passed to the token
 * handler, and <code>cstart</code> is then set to
 * <code>Integer.MAX_VALUE</code> to show that no run is in progress.
 *
 * @param buf
 *            the input buffer that holds the run
 * @param pos
 *            the index just past the last character of the run
 *
 * @throws SAXException
 *             if the token handler threw
 */
protected void flushChars(@NoLength char[] buf, int pos)
        throws SAXException {
    if (cstart < pos) {
        int runLength = pos - cstart;
        tokenHandler.characters(buf, cstart, runLength);
    }
    cstart = Integer.MAX_VALUE;
}
/**
 * Reports a condition that would make the infoset incompatible with XML
 * 1.0 as fatal. The error handler, if there is one, is notified first;
 * the exception is then thrown whether or not a handler is set.
 *
 * @param message
 *            the message
 * @throws SAXException
 *             always; a <code>SAXParseException</code> located at this
 *             tokenizer, unless the error handler threw first
 */
public void fatal(String message) throws SAXException {
    final SAXParseException failure = new SAXParseException(message, this);
    if (errorHandler != null) {
        errorHandler.fatalError(failure);
    }
    throw failure;
}
/**
 * Reports a parse error to the error handler. Does nothing when no error
 * handler is set.
 *
 * @param message
 *            the message
 * @throws SAXException
 *             if the error handler threw
 */
public void err(String message) throws SAXException {
    if (errorHandler != null) {
        errorHandler.error(new SAXParseException(message, this));
    }
}
/**
 * Reports an error through the tree builder's error handler when the token
 * handler is a <code>TreeBuilder</code> that has one, and through this
 * tokenizer's own error handler otherwise. Does nothing when neither
 * handler is available.
 *
 * @param message
 *            the message
 * @throws SAXException
 *             if the chosen error handler threw
 */
public void errTreeBuilder(String message) throws SAXException {
    ErrorHandler target = null;
    if (tokenHandler instanceof TreeBuilder<?>) {
        target = ((TreeBuilder<?>) tokenHandler).getErrorHandler();
    }
    if (target == null) {
        // Fall back to the tokenizer's own handler.
        target = errorHandler;
    }
    if (target != null) {
        target.error(new SAXParseException(message, this));
    }
}
/**
 * Reports a warning to the error handler. Does nothing when no error
 * handler is set.
 *
 * @param message
 *            the message
 * @throws SAXException
 *             if the error handler threw
 */
public void warn(String message) throws SAXException {
    if (errorHandler != null) {
        errorHandler.warning(new SAXParseException(message, this));
    }
}
/**
 * Resolves the contents of <code>strBuf</code> to an element name, stores
 * it as the current tag name and empties the buffer.
 */
private void strBufToElementNameString() {
    tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
            interner);
    // The buffer has been consumed.
    strBufLen = 0;
}
/**
* Emits the current tag token to the token handler, as an end tag when
* <code>endTag</code> is set and as a start tag otherwise, then resets the
* tag name and the attribute holder for the next token.
*
* @param selfClosing
* whether the tag was written with a trailing solidus
* @param pos
* the current position in the input buffer; the next run of
* character tokens starts at <code>pos + 1</code>
* @return the state to continue tokenizing in: <code>DATA</code> unless
* the token handler changed <code>stateSave</code> during the call
* @throws SAXException
* if the token handler or the error handler threw
*/
private int emitCurrentTagToken(boolean selfClosing, int pos)
throws SAXException {
cstart = pos + 1;
maybeErrSlashInEndTag(selfClosing);
// Default continuation state; the token handler may override it below.
stateSave = Tokenizer.DATA;
// A tag without attributes is reported with the shared empty holder.
HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
: attributes);
if (endTag) {
/*
* When an end tag token is emitted, the content model flag must be
* switched to the PCDATA state.
*/
maybeErrAttributesOnEndTag(attrs);
// CPPONLY: if (!viewingXmlSource) {
tokenHandler.endTag(tagName);
// CPPONLY: }
// CPPONLY: if (newAttributesEachTime) {
// CPPONLY: Portability.delete(attributes);
// CPPONLY: attributes = null;
// CPPONLY: }
} else {
// CPPONLY: if (viewingXmlSource) {
// CPPONLY: assert newAttributesEachTime;
// CPPONLY: Portability.delete(attributes);
// CPPONLY: attributes = null;
// CPPONLY: } else {
tokenHandler.startTag(tagName, attrs, selfClosing);
// CPPONLY: }
}
tagName.release();
tagName = null;
// NOTE(review): the else branch dereferences attributes without a null
// check although attrs above tolerates null; presumably the holder is
// pre-allocated elsewhere when newAttributesEachTime is false - confirm.
if (newAttributesEachTime) {
attributes = null;
} else {
attributes.clear(mappingLangToXmlLang);
}
/*
* The token handler may have called setStateAndEndTagExpectation
* and changed stateSave since the start of this method.
*/
return stateSave;
}
/**
* Turns the contents of <code>strBuf</code> into the current attribute
* name, empties the buffer and makes sure an attribute holder exists. If
* the holder already contains that name, the duplicate is reported and
* <code>attributeName</code> is set to <code>null</code>, which is how the
* addAttribute methods know to drop the value.
*
* @throws SAXException
* if the error handler threw
*/
private void attributeNameComplete() throws SAXException {
attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
// [NOCPP[
, namePolicy != XmlViolationPolicy.ALLOW
// ]NOCPP]
, interner);
clearStrBufAfterUse();
if (attributes == null) {
attributes = new HtmlAttributes(mappingLangToXmlLang);
}
/*
* When the user agent leaves the attribute name state (and before
* emitting the tag token, if appropriate), the complete attribute's
* name must be compared to the other attributes on the same token; if
* there is already an attribute on the token with the exact same name,
* then this is a parse error and the new attribute must be dropped,
* along with the value that gets associated with it (if any).
*/
if (attributes.contains(attributeName)) {
errDuplicateAttribute();
attributeName.release();
attributeName = null;
}
}
/**
 * Adds the current attribute, which was written without a value, to the
 * attribute holder. A <code>null</code> <code>attributeName</code> marks a
 * duplicate attribute; in that case nothing is added and the buffer is
 * emptied.
 *
 * <p>
 * Java build only: with HTML4 errors on, boolean attributes get either
 * their own local name or the empty string as the value (depending on
 * <code>html4ModeCompatibleWithXhtml1Schemata</code>), other attributes
 * except <code>border</code> are reported and added with an empty value,
 * and <code>border</code> is not added at all. With HTML4 errors off, a
 * valueless <code>src</code> or <code>href</code> draws a warning.
 *
 * @throws SAXException
 *             if the error handler threw
 */
private void addAttributeWithoutValue() throws SAXException {
    noteAttributeWithoutValue();
    // [NOCPP[
    // The boundary tracked by metaBoundaryPassed is the first 1024 bytes of
    // the stream, so the message has to name that figure.
    if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
            && ElementName.META == tagName) {
        err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
    }
    // ]NOCPP]
    if (attributeName != null) {
        // [NOCPP[
        if (html4) {
            if (attributeName.isBoolean()) {
                if (html4ModeCompatibleWithXhtml1Schemata) {
                    attributes.addAttribute(attributeName,
                            attributeName.getLocal(AttributeName.HTML),
                            xmlnsPolicy);
                } else {
                    attributes.addAttribute(attributeName, "", xmlnsPolicy);
                }
            } else {
                if (AttributeName.BORDER != attributeName) {
                    err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
                    attributes.addAttribute(attributeName, "", xmlnsPolicy);
                }
            }
        } else {
            if (AttributeName.SRC == attributeName
                    || AttributeName.HREF == attributeName) {
                warn("Attribute \u201C"
                        + attributeName.getLocal(AttributeName.HTML)
                        + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
            }
            // ]NOCPP]
            attributes.addAttribute(attributeName,
                    Portability.newEmptyString()
                    // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            // CPPONLY: , attributeLine
            );
            // [NOCPP[
        }
        // ]NOCPP]
        attributeName = null; // attributeName has been adopted by the
        // |attributes| object
    } else {
        // Duplicate attribute: there is nothing to add.
        clearStrBufAfterUse();
    }
}
/**
 * Adds the current attribute to the attribute holder, taking the value
 * from <code>strBuf</code>. A <code>null</code> <code>attributeName</code>
 * marks a duplicate attribute; in that case the buffered value is thrown
 * away.
 *
 * <p>
 * Java build only: on start tags, with HTML4 errors on and
 * <code>html4ModeCompatibleWithXhtml1Schemata</code> set, the value of a
 * case-folded attribute is ASCII-lowercased before it is added.
 *
 * @throws SAXException
 *             if the error handler threw
 */
private void addAttributeWithValue() throws SAXException {
    // [NOCPP[
    // The boundary tracked by metaBoundaryPassed is the first 1024 bytes of
    // the stream, so the message has to name that figure.
    if (metaBoundaryPassed && ElementName.META == tagName
            && AttributeName.CHARSET == attributeName) {
        err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
    }
    // ]NOCPP]
    if (attributeName != null) {
        String val = strBufToString(); // Ownership transferred to
        // HtmlAttributes
        // CPPONLY: if (mViewSource) {
        // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
        // CPPONLY: }
        // [NOCPP[
        if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
                && attributeName.isCaseFolded()) {
            val = newAsciiLowerCaseStringFromString(val);
        }
        // ]NOCPP]
        attributes.addAttribute(attributeName, val
        // [NOCPP[
                , xmlnsPolicy
        // ]NOCPP]
        // CPPONLY: , attributeLine
        );
        attributeName = null; // attributeName has been adopted by the
        // |attributes| object
    } else {
        // We have a duplicate attribute. Explicitly discard its value.
        clearStrBufAfterUse();
    }
}
// [NOCPP[
/**
 * Returns a copy of the string in which the ASCII letters A-Z are replaced
 * by a-z. Every other code unit, non-ASCII letters included, is copied
 * unchanged.
 *
 * @param str
 *            the string to convert; may be <code>null</code>
 * @return a new lowercased string, or <code>null</code> if
 *         <code>str</code> is <code>null</code>
 */
private static String newAsciiLowerCaseStringFromString(String str) {
    if (str == null) {
        return null;
    }
    final char[] chars = str.toCharArray();
    for (int i = 0; i < chars.length; i++) {
        final char unit = chars[i];
        if ('A' <= unit && unit <= 'Z') {
            // Upper and lower case ASCII letters are 0x20 apart.
            chars[i] = (char) (unit + 0x20);
        }
    }
    return new String(chars);
}
/**
* Hook called by <code>start()</code> after the token handler has been
* told that tokenization starts. Does nothing in this class; being
* protected, it can be overridden by subclasses.
*
* @throws SAXException
* declared for overriding implementations
*/
protected void startErrorReporting() throws SAXException {
}
// ]NOCPP]
/**
* Starts tokenization: initializes the tokenizer, tells the token handler
* that tokenization starts and, in the Java build, calls the
* <code>startErrorReporting()</code> hook.
*
* @throws SAXException
* if initialization, the token handler or the hook threw
*/
public void start() throws SAXException {
initializeWithoutStarting();
tokenHandler.startTokenization(this);
// [NOCPP[
startErrorReporting();
// ]NOCPP]
}
/**
* Runs the state loop over the unread part of the buffer, starting from
* the saved state, and then moves the start of the buffer past what the
* loop consumed.
*
* @param buffer
* the buffer to tokenize from its start to its end
* @return the value of <code>lastCR</code> after the state loop, i.e.
* whether the previous char read was CR
* @throws SAXException
* if the token handler or the error handler threw
*/
public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
int state = stateSave;
int returnState = returnStateSave;
char c = '\u0000';
shouldSuspend = false;
lastCR = false;
int start = buffer.getStart();
int end = buffer.getEnd();
// In C++, the caller of tokenizeBuffer needs to do this explicitly.
// [NOCPP[
ensureBufferSpace(end - start);
// ]NOCPP]
/**
* The index of the last <code>char</code> read from <code>buf</code>.
*/
int pos = start - 1;
/**
* The index of the first <code>char</code> in <code>buf</code> that is
* part of a coalesced run of character tokens or
* <code>Integer.MAX_VALUE</code> if there is not a current run being
* coalesced.
*/
// The comment above describes cstart. For the states listed below a
// run of character tokens begins at the first unread char; for every
// other state there is no run in progress.
switch (state) {
case DATA:
case RCDATA:
case SCRIPT_DATA:
case PLAINTEXT:
case RAWTEXT:
case CDATA_SECTION:
case SCRIPT_DATA_ESCAPED:
case SCRIPT_DATA_ESCAPE_START:
case SCRIPT_DATA_ESCAPE_START_DASH:
case SCRIPT_DATA_ESCAPED_DASH:
case SCRIPT_DATA_ESCAPED_DASH_DASH:
case SCRIPT_DATA_DOUBLE_ESCAPE_START:
case SCRIPT_DATA_DOUBLE_ESCAPED:
case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
case SCRIPT_DATA_DOUBLE_ESCAPE_END:
cstart = start;
break;
default:
cstart = Integer.MAX_VALUE;
break;
}
/**
* The number of <code>char</code>s in <code>buf</code> that have
* meaning. (The rest of the array is garbage and should not be
* examined.)
*/
// CPPONLY: if (mViewSource) {
// CPPONLY: mViewSource.SetBuffer(buffer);
// CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
// CPPONLY: } else {
// CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: }
// [NOCPP[
pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
end);
// ]NOCPP]
if (pos == end) {
// exiting due to end of buffer
buffer.setStart(pos);
} else {
// The loop stopped early at pos; resume after that char next time.
buffer.setStart(pos + 1);
}
return lastCR;
}
// [NOCPP[
/**
 * Makes sure that both the token handler's buffer and this tokenizer's
 * <code>strBuf</code> are large enough for the worst case that tokenizing
 * <code>inputLength</code> more input chars can produce.
 *
 * @param inputLength
 *            the number of input chars about to be tokenized
 * @throws SAXException
 *             propagated from the token handler
 */
private void ensureBufferSpace(int inputLength) throws SAXException {
    // The extra 2 accounts for emissions of LT_GT, LT_SOLIDUS and
    // RSQB_RSQB. It is added to the general worst case rather than only to
    // the TreeBuilder-exposed one so that a future unification of the
    // tokenizer and tree builder buffers does not re-introduce a bug.
    int required = strBufLen + inputLength + charRefBufLen + 2;
    tokenHandler.ensureBufferSpace(required);

    // When altering the infoset, each hyphen in a run of consecutive
    // hyphens in a comment also generates a space. That content is never
    // emitted as characters() to the tokenHandler, hence the doubling
    // happens only after the handler has been sized.
    if (XmlViolationPolicy.ALTER_INFOSET == commentPolicy) {
        required *= 2;
    }

    if (strBuf == null) {
        // First allocation: pad by an arbitrary small amount to avoid an
        // immediate reallocation once a few characters are buffered.
        strBuf = new char[required + 128];
        return;
    }
    if (strBuf.length >= required) {
        // Already big enough.
        return;
    }

    // Grow to exactly the required size. HotSpot reportedly allocates
    // memory with 8-byte accuracy, so no rounding math is done to avoid
    // slop; adding a small constant here is left undone pending profiling.
    // In C++ with jemalloc, the corresponding method should round up.
    char[] grown = new char[required];
    System.arraycopy(strBuf, 0, grown, 0, strBufLen);
    strBuf = grown;
}
// ]NOCPP]
@SuppressWarnings("unused") private int stateLoop(int state, char c,
int pos, @NoLength char[] buf, boolean reconsume, int returnState,
int endPos) throws SAXException {
/*
* Idioms used in this code:
*
*
* Consuming the next input character
*
* To consume the next input character, the code does this: if (++pos ==
* endPos) { break stateloop; } c = checkChar(buf, pos);
*
*
* Staying in a state
*
* When there's a state that the tokenizer may stay in over multiple
* input characters, the state has a wrapper |for(;;)| loop and staying
* in the state continues the loop.
*
*
* Switching to another state
*
* To switch to another state, the code sets the state variable to the
* magic number of the new state. Then it either continues stateloop or
* breaks out of the state's own wrapper loop if the target state is
* right after the current state in source order. (This is a partial
* workaround for Java's lack of goto.)
*
*
* Reconsume support
*
* The spec sometimes says that an input character is reconsumed in
* another state. If a state can ever be entered so that an input
* character can be reconsumed in it, the state's code starts with an
* |if (reconsume)| that sets reconsume to false and skips over the
* normal code for consuming a new character.
*
* To reconsume the current character in another state, the code sets
* |reconsume| to true and then switches to the other state.
*
*
* Emitting character tokens
*
* This method emits character tokens lazily. Whenever a new range of
* character tokens starts, the field cstart must be set to the start
* index of the range. The flushChars() method must be called at the end
* of a range to flush it.
*
*
* U+0000 handling
*
* The various states have to handle the replacement of U+0000 with
* U+FFFD. However, if U+0000 would be reconsumed in another state, the
* replacement doesn't need to happen, because it's handled by the
* reconsuming state.
*
*
* LF handling
*
* Every state needs to increment the line number upon LF unless the LF
* gets reconsumed by another state which increments the line number.
*
*
* CR handling
*
* Every state needs to handle CR unless the CR gets reconsumed and is
* handled by the reconsuming state. The CR needs to be handled as if it
* were an LF, the lastCR field must be set to true and then this
* method must return. The IO driver will then swallow the next
* character if it is an LF to coalesce CRLF.
*/
stateloop: for (;;) {
switch (state) {
case DATA:
dataloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
switch (c) {
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in data state.
*/
flushChars(buf, pos);
assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
appendCharRefBuf(c);
setAdditionalAndRememberAmpersandLocation('\u0000');
returnState = state;
state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
continue stateloop;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Switch to the tag
* open state.
*/
flushChars(buf, pos);
state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
break dataloop; // FALL THROUGH continue
// stateloop;
case '\u0000':
emitReplacementCharacter(buf, pos);
continue;
case '\r':
emitCarriageReturn(buf, pos);
break stateloop;
case '\n':
silentLineFeed();
default:
/*
* Anything else Emit the input character as a
* character token.
*
* Stay in the data state.
*/
continue;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case TAG_OPEN:
tagopenloop: for (;;) {
/*
* The behavior of this state depends on the content
* model flag.
*/
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* If the content model flag is set to the PCDATA state
* Consume the next input character:
*/
if (c >= 'A' && c <= 'Z') {
/*
* U+0041 LATIN CAPITAL LETTER A through to U+005A
* LATIN CAPITAL LETTER Z Create a new start tag
* token,
*/
endTag = false;
/*
* set its tag name to the lowercase version of the
* input character (add 0x0020 to the character's
* code point),
*/
clearStrBufBeforeUse();
appendStrBuf((char) (c + 0x20));
/* then switch to the tag name state. */
state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
/*
* (Don't emit the token yet; further details will
* be filled in before it is emitted.)
*/
break tagopenloop;
// continue stateloop;
} else if (c >= 'a' && c <= 'z') {
/*
* U+0061 LATIN SMALL LETTER A through to U+007A
* LATIN SMALL LETTER Z Create a new start tag
* token,
*/
endTag = false;
/*
* set its tag name to the input character,
*/
clearStrBufBeforeUse();
appendStrBuf(c);
/* then switch to the tag name state. */
state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
/*
* (Don't emit the token yet; further details will
* be filled in before it is emitted.)
*/
break tagopenloop;
// continue stateloop;
}
switch (c) {
case '!':
/*
* U+0021 EXCLAMATION MARK (!) Switch to the
* markup declaration open state.
*/
state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
continue stateloop;
case '/':
/*
* U+002F SOLIDUS (/) Switch to the close tag
* open state.
*/
state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
continue stateloop;
case '?':
// CPPONLY: if (viewingXmlSource) {
// CPPONLY: state = transition(state,
// CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
// CPPONLY: reconsume,
// CPPONLY: pos);
// CPPONLY: continue stateloop;
// CPPONLY: }
/*
* U+003F QUESTION MARK (?) Parse error.
*/
errProcessingInstruction();
/*
* Switch to the bogus comment state.
*/
clearStrBufBeforeUse();
appendStrBuf(c);
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Parse error.
*/
errLtGt();
/*
* Emit a U+003C LESS-THAN SIGN character token
* and a U+003E GREATER-THAN SIGN character
* token.
*/
tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
/* Switch to the data state. */
cstart = pos + 1;
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
default:
/*
* Anything else Parse error.
*/
errBadCharAfterLt(c);
/*
* Emit a U+003C LESS-THAN SIGN character token
*/
tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
/*
* and reconsume the current input character in
* the data state.
*/
cstart = pos;
reconsume = true;
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
}
}
// FALL THROUGH DON'T REORDER
case TAG_NAME:
tagnameloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
strBufToElementNameString();
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* Switch to the before attribute name state.
*/
strBufToElementNameString();
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
break tagnameloop;
// continue stateloop;
case '/':
/*
* U+002F SOLIDUS (/) Switch to the self-closing
* start tag state.
*/
strBufToElementNameString();
state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* tag token.
*/
strBufToElementNameString();
state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
if (shouldSuspend) {
break stateloop;
}
/*
* Switch to the data state.
*/
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
if (c >= 'A' && c <= 'Z') {
/*
* U+0041 LATIN CAPITAL LETTER A through to
* U+005A LATIN CAPITAL LETTER Z Append the
* lowercase version of the current input
* character (add 0x0020 to the character's
* code point) to the current tag token's
* tag name.
*/
c += 0x20;
}
/*
* Anything else Append the current input
* character to the current tag token's tag
* name.
*/
appendStrBuf(c);
/*
* Stay in the tag name state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case BEFORE_ATTRIBUTE_NAME:
beforeattributenameloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
* in the before attribute name state.
*/
continue;
case '/':
/*
* U+002F SOLIDUS (/) Switch to the self-closing
* start tag state.
*/
state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* tag token.
*/
state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
if (shouldSuspend) {
break stateloop;
}
/*
* Switch to the data state.
*/
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
case '\"':
case '\'':
case '<':
case '=':
/*
* U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
* (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
* SIGN (=) Parse error.
*/
errBadCharBeforeAttributeNameOrNull(c);
/*
* Treat it as per the "anything else" entry
* below.
*/
default:
/*
* Anything else Start a new attribute in the
* current tag token.
*/
if (c >= 'A' && c <= 'Z') {
/*
* U+0041 LATIN CAPITAL LETTER A through to
* U+005A LATIN CAPITAL LETTER Z Set that
* attribute's name to the lowercase version
* of the current input character (add
* 0x0020 to the character's code point)
*/
c += 0x20;
}
// CPPONLY: attributeLine = line;
/*
* Set that attribute's name to the current
* input character,
*/
clearStrBufBeforeUse();
appendStrBuf(c);
/*
* and its value to the empty string.
*/
// Will do later.
/*
* Switch to the attribute name state.
*/
state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
break beforeattributenameloop;
// continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case ATTRIBUTE_NAME:
attributenameloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
attributeNameComplete();
state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* Switch to the after attribute name state.
*/
attributeNameComplete();
state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
continue stateloop;
case '/':
/*
* U+002F SOLIDUS (/) Switch to the self-closing
* start tag state.
*/
attributeNameComplete();
addAttributeWithoutValue();
state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
continue stateloop;
case '=':
/*
* U+003D EQUALS SIGN (=) Switch to the before
* attribute value state.
*/
attributeNameComplete();
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
break attributenameloop;
// continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* tag token.
*/
attributeNameComplete();
addAttributeWithoutValue();
state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
if (shouldSuspend) {
break stateloop;
}
/*
* Switch to the data state.
*/
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
case '\"':
case '\'':
case '<':
/*
* U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
* (') U+003C LESS-THAN SIGN (<) Parse error.
*/
errQuoteOrLtInAttributeNameOrNull(c);
/*
* Treat it as per the "anything else" entry
* below.
*/
default:
if (c >= 'A' && c <= 'Z') {
/*
* U+0041 LATIN CAPITAL LETTER A through to
* U+005A LATIN CAPITAL LETTER Z Append the
* lowercase version of the current input
* character (add 0x0020 to the character's
* code point) to the current attribute's
* name.
*/
c += 0x20;
}
/*
* Anything else Append the current input
* character to the current attribute's name.
*/
appendStrBuf(c);
/*
* Stay in the attribute name state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case BEFORE_ATTRIBUTE_VALUE:
beforeattributevalueloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
* in the before attribute value state.
*/
continue;
case '"':
/*
* U+0022 QUOTATION MARK (") Switch to the
* attribute value (double-quoted) state.
*/
// CPPONLY: attributeLine = line;
clearStrBufBeforeUse();
state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
break beforeattributevalueloop;
// continue stateloop;
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the attribute
* value (unquoted) state and reconsume this
* input character.
*/
// CPPONLY: attributeLine = line;
clearStrBufBeforeUse();
reconsume = true;
state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
noteUnquotedAttributeValue();
continue stateloop;
case '\'':
/*
* U+0027 APOSTROPHE (') Switch to the attribute
* value (single-quoted) state.
*/
// CPPONLY: attributeLine = line;
clearStrBufBeforeUse();
state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Parse error.
*/
errAttributeValueMissing();
/*
* Emit the current tag token.
*/
addAttributeWithoutValue();
state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
if (shouldSuspend) {
break stateloop;
}
/*
* Switch to the data state.
*/
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
case '<':
case '=':
case '`':
/*
* U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
* (=) U+0060 GRAVE ACCENT (`)
*/
errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
/*
* Treat it as per the "anything else" entry
* below.
*/
default:
// [NOCPP[
errHtml4NonNameInUnquotedAttribute(c);
// ]NOCPP]
/*
* Anything else Append the current input
* character to the current attribute's value.
*/
// CPPONLY: attributeLine = line;
clearStrBufBeforeUse();
appendStrBuf(c);
/*
* Switch to the attribute value (unquoted)
* state.
*/
state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
noteUnquotedAttributeValue();
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
attributevaluedoublequotedloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '"':
/*
* U+0022 QUOTATION MARK (") Switch to the after
* attribute value (quoted) state.
*/
addAttributeWithValue();
state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
break attributevaluedoublequotedloop;
// continue stateloop;
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in attribute value state, with the
* additional allowed character being U+0022
* QUOTATION MARK (").
*/
assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
appendCharRefBuf(c);
setAdditionalAndRememberAmpersandLocation('\"');
returnState = state;
state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
/*
* Anything else Append the current input
* character to the current attribute's value.
*/
appendStrBuf(c);
/*
* Stay in the attribute value (double-quoted)
* state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case AFTER_ATTRIBUTE_VALUE_QUOTED:
afterattributevaluequotedloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* Switch to the before attribute name state.
*/
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
continue stateloop;
case '/':
/*
* U+002F SOLIDUS (/) Switch to the self-closing
* start tag state.
*/
state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
break afterattributevaluequotedloop;
// continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* tag token.
*/
state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
if (shouldSuspend) {
break stateloop;
}
/*
* Switch to the data state.
*/
continue stateloop;
default:
/*
* Anything else Parse error.
*/
errNoSpaceBetweenAttributes();
/*
* Reconsume the character in the before
* attribute name state.
*/
reconsume = true;
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case SELF_CLOSING_START_TAG:
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Set the self-closing
* flag of the current tag token. Emit the current
* tag token.
*/
// [NOCPP[
errHtml4XmlVoidSyntax();
// ]NOCPP]
state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
if (shouldSuspend) {
break stateloop;
}
/*
* Switch to the data state.
*/
continue stateloop;
default:
/* Anything else Parse error. */
errSlashNotFollowedByGt();
/*
* Reconsume the character in the before attribute
* name state.
*/
reconsume = true;
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
continue stateloop;
}
// XXX reorder point
case ATTRIBUTE_VALUE_UNQUOTED:
for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
addAttributeWithValue();
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* Switch to the before attribute name state.
*/
addAttributeWithValue();
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
continue stateloop;
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in attribute value state, with the
* additional allowed character being U+003E
* GREATER-THAN SIGN (>)
*/
assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
appendCharRefBuf(c);
setAdditionalAndRememberAmpersandLocation('>');
returnState = state;
state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* tag token.
*/
addAttributeWithValue();
state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
if (shouldSuspend) {
break stateloop;
}
/*
* Switch to the data state.
*/
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
case '<':
case '\"':
case '\'':
case '=':
case '`':
/*
* U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
* (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
* SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
*/
errUnquotedAttributeValOrNull(c);
/*
* Treat it as per the "anything else" entry
* below.
*/
// fall through
default:
// [NOCPP]
errHtml4NonNameInUnquotedAttribute(c);
// ]NOCPP]
/*
* Anything else Append the current input
* character to the current attribute's value.
*/
appendStrBuf(c);
/*
* Stay in the attribute value (unquoted) state.
*/
continue;
}
}
// XXX reorder point
case AFTER_ATTRIBUTE_NAME:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
* in the after attribute name state.
*/
continue;
case '/':
/*
* U+002F SOLIDUS (/) Switch to the self-closing
* start tag state.
*/
addAttributeWithoutValue();
state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
continue stateloop;
case '=':
/*
* U+003D EQUALS SIGN (=) Switch to the before
* attribute value state.
*/
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* tag token.
*/
addAttributeWithoutValue();
state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
if (shouldSuspend) {
break stateloop;
}
/*
* Switch to the data state.
*/
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
case '\"':
case '\'':
case '<':
errQuoteOrLtInAttributeNameOrNull(c);
/*
* Treat it as per the "anything else" entry
* below.
*/
default:
addAttributeWithoutValue();
/*
* Anything else Start a new attribute in the
* current tag token.
*/
if (c >= 'A' && c <= 'Z') {
/*
* U+0041 LATIN CAPITAL LETTER A through to
* U+005A LATIN CAPITAL LETTER Z Set that
* attribute's name to the lowercase version
* of the current input character (add
* 0x0020 to the character's code point)
*/
c += 0x20;
}
/*
* Set that attribute's name to the current
* input character,
*/
clearStrBufBeforeUse();
appendStrBuf(c);
/*
* and its value to the empty string.
*/
// Will do later.
/*
* Switch to the attribute name state.
*/
state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
continue stateloop;
}
}
// XXX reorder point
case MARKUP_DECLARATION_OPEN:
markupdeclarationopenloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* If the next two characters are both U+002D
* HYPHEN-MINUS characters (-), consume those two
* characters, create a comment token whose data is the
* empty string, and switch to the comment start state.
*
* Otherwise, if the next seven characters are an ASCII
* case-insensitive match for the word "DOCTYPE", then
* consume those characters and switch to the DOCTYPE
* state.
*
* Otherwise, if the insertion mode is
* "in foreign content" and the current node is not an
* element in the HTML namespace and the next seven
* characters are an case-sensitive match for the string
* "[CDATA[" (the five uppercase letters "CDATA" with a
* U+005B LEFT SQUARE BRACKET character before and
* after), then consume those characters and switch to
* the CDATA section state.
*
* Otherwise, is is a parse error. Switch to the bogus
* comment state. The next character that is consumed,
* if any, is the first character that will be in the
* comment.
*/
switch (c) {
case '-':
clearStrBufBeforeUse();
appendStrBuf(c);
state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
break markupdeclarationopenloop;
// continue stateloop;
case 'd':
case 'D':
clearStrBufBeforeUse();
appendStrBuf(c);
index = 0;
state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
continue stateloop;
case '[':
if (tokenHandler.cdataSectionAllowed()) {
clearStrBufBeforeUse();
appendStrBuf(c);
index = 0;
state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
continue stateloop;
}
// else fall through
default:
errBogusComment();
clearStrBufBeforeUse();
reconsume = true;
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case MARKUP_DECLARATION_HYPHEN:
markupdeclarationhyphenloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case '\u0000':
break stateloop;
case '-':
clearStrBufAfterOneHyphen();
state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
break markupdeclarationhyphenloop;
// continue stateloop;
default:
errBogusComment();
reconsume = true;
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case COMMENT_START:
commentstartloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Comment start state
*
*
* Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Switch to the comment
* start dash state.
*/
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Parse error.
*/
errPrematureEndOfComment();
/* Emit the comment token. */
emitComment(0, pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
break stateloop;
case '\n':
appendStrBufLineFeed();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
break commentstartloop;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
/*
* Anything else Append the input character to
* the comment token's data.
*/
appendStrBuf(c);
/*
* Switch to the comment state.
*/
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
break commentstartloop;
// continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case COMMENT:
commentloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Comment state Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Switch to the comment
* end dash state
*/
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
break commentloop;
// continue stateloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
/*
* Anything else Append the input character to
* the comment token's data.
*/
appendStrBuf(c);
/*
* Stay in the comment state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case COMMENT_END_DASH:
commentenddashloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Comment end dash state Consume the next input
* character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Switch to the comment
* end state
*/
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
break commentenddashloop;
// continue stateloop;
case '\r':
appendStrBufCarriageReturn();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
break stateloop;
case '\n':
appendStrBufLineFeed();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
/*
* Anything else Append a U+002D HYPHEN-MINUS
* (-) character and the input character to the
* comment token's data.
*/
appendStrBuf(c);
/*
* Switch to the comment state.
*/
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case COMMENT_END:
commentendloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Comment end dash state Consume the next input
* character:
*/
switch (c) {
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the comment
* token.
*/
emitComment(2, pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '-':
/* U+002D HYPHEN-MINUS (-) Parse error. */
/*
* Append a U+002D HYPHEN-MINUS (-) character to
* the comment token's data.
*/
adjustDoubleHyphenAndAppendToStrBufAndErr(c);
/*
* Stay in the comment end state.
*/
continue;
case '\r':
adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
break stateloop;
case '\n':
adjustDoubleHyphenAndAppendToStrBufLineFeed();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
continue stateloop;
case '!':
errHyphenHyphenBang();
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
/*
* Append two U+002D HYPHEN-MINUS (-) characters
* and the input character to the comment
* token's data.
*/
adjustDoubleHyphenAndAppendToStrBufAndErr(c);
/*
* Switch to the comment state.
*/
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
continue stateloop;
}
}
// XXX reorder point
case COMMENT_END_BANG:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Comment end bang state
*
* Consume the next input character:
*/
switch (c) {
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the comment
* token.
*/
emitComment(3, pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '-':
/*
* Append two U+002D HYPHEN-MINUS (-) characters
* and a U+0021 EXCLAMATION MARK (!) character
* to the comment token's data.
*/
appendStrBuf(c);
/*
* Switch to the comment end dash state.
*/
state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
/*
* Anything else Append two U+002D HYPHEN-MINUS
* (-) characters, a U+0021 EXCLAMATION MARK (!)
* character, and the input character to the
* comment token's data. Switch to the comment
* state.
*/
appendStrBuf(c);
/*
* Switch to the comment state.
*/
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
continue stateloop;
}
}
// XXX reorder point
case COMMENT_START_DASH:
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Comment start dash state
*
* Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Switch to the comment end
* state
*/
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
continue stateloop;
case '>':
errPrematureEndOfComment();
/* Emit the comment token. */
emitComment(1, pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
break stateloop;
case '\n':
appendStrBufLineFeed();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
/*
* Append a U+002D HYPHEN-MINUS character (-) and
* the current input character to the comment
* token's data.
*/
appendStrBuf(c);
/*
* Switch to the comment state.
*/
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
continue stateloop;
}
// XXX reorder point
case CDATA_START:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
if (index < 6) { // CDATA_LSQB.length
if (c == Tokenizer.CDATA_LSQB[index]) {
appendStrBuf(c);
} else {
errBogusComment();
reconsume = true;
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
continue stateloop;
}
index++;
continue;
} else {
clearStrBufAfterUse();
cstart = pos; // start coalescing
reconsume = true;
state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
break; // FALL THROUGH continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case CDATA_SECTION:
cdatasectionloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
switch (c) {
case ']':
flushChars(buf, pos);
state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
break cdatasectionloop; // FALL THROUGH
case '\u0000':
emitReplacementCharacter(buf, pos);
continue;
case '\r':
emitCarriageReturn(buf, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
default:
continue;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case CDATA_RSQB:
cdatarsqb: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case ']':
state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
break cdatarsqb;
default:
tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
1);
cstart = pos;
reconsume = true;
state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case CDATA_RSQB_RSQB:
cdatarsqbrsqb: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case ']':
// Saw a third ]. Emit one ] (logically the
// first one) and stay in this state to
// remember that the last two characters seen
// have been ]].
tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
continue;
case '>':
cstart = pos + 1;
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
default:
tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
cstart = pos;
reconsume = true;
state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
continue stateloop;
}
}
// XXX reorder point
case ATTRIBUTE_VALUE_SINGLE_QUOTED:
attributevaluesinglequotedloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '\'':
/*
* U+0027 APOSTROPHE (') Switch to the after
* attribute value (quoted) state.
*/
addAttributeWithValue();
state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
continue stateloop;
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in attribute value state, with the
* additional allowed character being U+0027
* APOSTROPHE (').
*/
assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
appendCharRefBuf(c);
setAdditionalAndRememberAmpersandLocation('\'');
returnState = state;
state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
break attributevaluesinglequotedloop;
// continue stateloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
/*
* Anything else Append the current input
* character to the current attribute's value.
*/
appendStrBuf(c);
/*
* Stay in the attribute value (single-quoted)
* state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case CONSUME_CHARACTER_REFERENCE:
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
if (c == '\u0000') {
break stateloop;
}
/*
* Unlike the definition in the spec, this state does not
* return a value and never requires the caller to
* backtrack. This state takes care of emitting characters
* or appending to the current attribute value. It also
* takes care of that in the case when consuming the
* character reference fails.
*/
/*
* This section defines how to consume a character
* reference. This definition is used when parsing character
* references in text and in attributes.
*
* The behavior depends on the identity of the next
* character (the one immediately after the U+0026 AMPERSAND
* character):
*/
switch (c) {
case ' ':
case '\t':
case '\n':
case '\r': // we'll reconsume!
case '\u000C':
case '<':
case '&':
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
case '#':
/*
* U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
* SIGN.
*/
appendCharRefBuf('#');
state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
continue stateloop;
default:
if (c == additional) {
emitOrAppendCharRefBuf(returnState);
reconsume = true;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
}
if (c >= 'a' && c <= 'z') {
firstCharKey = c - 'a' + 26;
} else if (c >= 'A' && c <= 'Z') {
firstCharKey = c - 'A';
} else {
// No match
/*
* If no match can be made, then this is a parse
* error.
*/
errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
}
// Didn't fail yet
appendCharRefBuf(c);
state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
// FALL THROUGH continue stateloop;
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case CHARACTER_REFERENCE_HILO_LOOKUP:
{
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
if (c == '\u0000') {
break stateloop;
}
/*
* The data structure is as follows:
*
* HILO_ACCEL is a two-dimensional int array whose major
* index corresponds to the second character of the
* character reference (code point as index) and the
* minor index corresponds to the first character of the
* character reference (packed so that A-Z runs from 0
* to 25 and a-z runs from 26 to 51). This layout makes
* it easier to use the sparseness of the data structure
* to omit parts of it: The second dimension of the
* table is null when no character reference starts with
* the character corresponding to that row.
*
* The int value HILO_ACCEL (by these indices) is zero
* if there exists no character reference starting with
* that two-letter prefix. Otherwise, the value is an
* int that packs two shorts so that the higher short is
* the index of the highest character reference name
* with that prefix in NAMES and the lower short
* corresponds to the index of the lowest character
* reference name with that prefix. (It happens that the
* first two character reference names share their
* prefix so the packed int cannot be 0 by packing the
* two shorts.)
*
* NAMES is an array of byte arrays where each byte
* array encodes the name of a character reference as
* ASCII. The names omit the first two letters of the
* name. (Since storing the first two letters would be
* redundant with the data contained in HILO_ACCEL.) The
* entries are lexically sorted.
*
* For a given index in NAMES, the same index in VALUES
* contains the corresponding expansion as an array of
* two UTF-16 code units (either the character and
* U+0000 or a surrogate pair).
*/
int hilo = 0;
if (c <= 'z') {
@Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
if (row != null) {
hilo = row[firstCharKey];
}
}
if (hilo == 0) {
/*
* If no match can be made, then this is a parse
* error.
*/
errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
}
// Didn't fail yet
appendCharRefBuf(c);
lo = hilo & 0xFFFF;
hi = hilo >> 16;
entCol = -1;
candidate = -1;
charRefBufMark = 0;
state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
// FALL THROUGH continue stateloop;
}
case CHARACTER_REFERENCE_TAIL:
outer: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
if (c == '\u0000') {
break stateloop;
}
entCol++;
/*
* Consume the maximum number of characters possible,
* with the consumed characters matching one of the
* identifiers in the first column of the named
* character references table (in a case-sensitive
* manner).
*/
loloop: for (;;) {
if (hi < lo) {
break outer;
}
if (entCol == NamedCharacters.NAMES[lo].length()) {
candidate = lo;
charRefBufMark = charRefBufLen;
lo++;
} else if (entCol > NamedCharacters.NAMES[lo].length()) {
break outer;
} else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
lo++;
} else {
break loloop;
}
}
hiloop: for (;;) {
if (hi < lo) {
break outer;
}
if (entCol == NamedCharacters.NAMES[hi].length()) {
break hiloop;
}
if (entCol > NamedCharacters.NAMES[hi].length()) {
break outer;
} else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
hi--;
} else {
break hiloop;
}
}
if (c == ';') {
// If we see a semicolon, there cannot be a
// longer match. Break the loop. However, before
// breaking, take the longest match so far as the
// candidate, if we are just about to complete a
// match.
if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
candidate = lo;
charRefBufMark = charRefBufLen;
}
break outer;
}
if (hi < lo) {
break outer;
}
appendCharRefBuf(c);
continue;
}
if (candidate == -1) {
// reconsume deals with CR, LF or nul
/*
* If no match can be made, then this is a parse error.
*/
errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
} else {
// c can't be CR, LF or nul if we got here
@Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
if (candidateName.length() == 0
|| candidateName.charAt(candidateName.length() - 1) != ';') {
/*
* If the last character matched is not a U+003B
* SEMICOLON (;), there is a parse error.
*/
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
/*
* If the entity is being consumed as part of an
* attribute, and the last character matched is
* not a U+003B SEMICOLON (;),
*/
char ch;
if (charRefBufMark == charRefBufLen) {
ch = c;
} else {
ch = charRefBuf[charRefBufMark];
}
if (ch == '=' || (ch >= '0' && ch <= '9')
|| (ch >= 'A' && ch <= 'Z')
|| (ch >= 'a' && ch <= 'z')) {
/*
* and the next character is either a U+003D
* EQUALS SIGN character (=) or in the range
* U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
* U+0041 LATIN CAPITAL LETTER A to U+005A
* LATIN CAPITAL LETTER Z, or U+0061 LATIN
* SMALL LETTER A to U+007A LATIN SMALL
* LETTER Z, then, for historical reasons,
* all the characters that were matched
* after the U+0026 AMPERSAND (&) must be
* unconsumed, and nothing is returned.
*/
errNoNamedCharacterMatch();
appendCharRefBufToStrBuf();
reconsume = true;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
}
}
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
errUnescapedAmpersandInterpretedAsCharacterReference();
} else {
errNotSemicolonTerminated();
}
}
/*
* Otherwise, return a character token for the character
* corresponding to the entity name (as given by the
* second column of the named character references
* table).
*/
// CPPONLY: completedNamedCharacterReference();
@Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
if (
// [NOCPP[
val.length == 1
// ]NOCPP]
// CPPONLY: val[1] == 0
) {
emitOrAppendOne(val, returnState);
} else {
emitOrAppendTwo(val, returnState);
}
// this is so complicated!
if (charRefBufMark < charRefBufLen) {
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
appendStrBuf(charRefBuf, charRefBufMark,
charRefBufLen - charRefBufMark);
} else {
tokenHandler.characters(charRefBuf, charRefBufMark,
charRefBufLen - charRefBufMark);
}
}
// charRefBufLen will be zeroed below!
// Check if we broke out early with c being the last
// character that matched as opposed to being the
// first one that didn't match. In the case of an
// early break, the next run on text should start
// *after* the current character and the current
// character shouldn't be reconsumed.
boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen);
charRefBufLen = 0;
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = earlyBreak ? pos + 1 : pos;
}
reconsume = !earlyBreak;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
/*
* If the markup contains I'm &notit; I tell you, the
* entity is parsed as "not", as in, I'm ¬it; I tell
* you. But if the markup was I'm &notin; I tell you,
* the entity would be parsed as "notin;", resulting in
* I'm ∉ I tell you.
*/
}
// XXX reorder point
case CONSUME_NCR:
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
value = 0;
seenDigits = false;
/*
* The behavior further depends on the character after the
* U+0023 NUMBER SIGN:
*/
switch (c) {
case 'x':
case 'X':
/*
* U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
* LETTER X Consume the X.
*
* Follow the steps below, but using the range of
* characters U+0030 DIGIT ZERO through to U+0039
* DIGIT NINE, U+0061 LATIN SMALL LETTER A through
* to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
* CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
* LETTER F (in other words, 0-9, A-F, a-f).
*
* When it comes to interpreting the number,
* interpret it as a hexadecimal number.
*/
appendCharRefBuf(c);
state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
continue stateloop;
default:
/*
* Anything else Follow the steps below, but using
* the range of characters U+0030 DIGIT ZERO through
* to U+0039 DIGIT NINE (i.e. just 0-9).
*
* When it comes to interpreting the number,
* interpret it as a decimal number.
*/
reconsume = true;
state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
// FALL THROUGH continue stateloop;
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case DECIMAL_NRC_LOOP:
decimalloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume as many characters as match the range of
* characters given above.
*/
assert value >= 0: "value must not become negative.";
if (c >= '0' && c <= '9') {
seenDigits = true;
// Avoid overflow
if (value <= 0x10FFFF) {
value *= 10;
value += c - '0';
}
continue;
} else if (c == ';') {
if (seenDigits) {
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos + 1;
}
state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
// FALL THROUGH continue stateloop;
break decimalloop;
} else {
errNoDigitsInNCR();
appendCharRefBuf(';');
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos + 1;
}
state = transition(state, returnState, reconsume, pos);
continue stateloop;
}
} else {
/*
* If no characters match the range, then don't
* consume any characters (and unconsume the U+0023
* NUMBER SIGN character and, if appropriate, the X
* character). This is a parse error; nothing is
* returned.
*
* Otherwise, if the next character is a U+003B
* SEMICOLON, consume that too. If it isn't, there
* is a parse error.
*/
if (!seenDigits) {
errNoDigitsInNCR();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
} else {
errCharRefLacksSemicolon();
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
// FALL THROUGH continue stateloop;
break decimalloop;
}
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case HANDLE_NCR_VALUE:
// WARNING previous state sets reconsume
// We are not going to emit the contents of charRefBuf.
charRefBufLen = 0;
// XXX inline this case if the method size can take it
handleNcrValue(returnState);
state = transition(state, returnState, reconsume, pos);
continue stateloop;
// XXX reorder point
case HEX_NCR_LOOP:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume as many characters as match the range of
* characters given above.
*/
assert value >= 0: "value must not become negative.";
if (c >= '0' && c <= '9') {
seenDigits = true;
// Avoid overflow
if (value <= 0x10FFFF) {
value *= 16;
value += c - '0';
}
continue;
} else if (c >= 'A' && c <= 'F') {
seenDigits = true;
// Avoid overflow
if (value <= 0x10FFFF) {
value *= 16;
value += c - 'A' + 10;
}
continue;
} else if (c >= 'a' && c <= 'f') {
seenDigits = true;
// Avoid overflow
if (value <= 0x10FFFF) {
value *= 16;
value += c - 'a' + 10;
}
continue;
} else if (c == ';') {
if (seenDigits) {
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos + 1;
}
state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
continue stateloop;
} else {
errNoDigitsInNCR();
appendCharRefBuf(';');
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos + 1;
}
state = transition(state, returnState, reconsume, pos);
continue stateloop;
}
} else {
/*
* If no characters match the range, then don't
* consume any characters (and unconsume the U+0023
* NUMBER SIGN character and, if appropriate, the X
* character). This is a parse error; nothing is
* returned.
*
* Otherwise, if the next character is a U+003B
* SEMICOLON, consume that too. If it isn't, there
* is a parse error.
*/
if (!seenDigits) {
errNoDigitsInNCR();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
} else {
errCharRefLacksSemicolon();
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
continue stateloop;
}
}
}
// XXX reorder point
case PLAINTEXT:
plaintextloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
switch (c) {
case '\u0000':
emitPlaintextReplacementCharacter(buf, pos);
continue;
case '\r':
emitCarriageReturn(buf, pos);
break stateloop;
case '\n':
silentLineFeed();
default:
/*
* Anything else Emit the current input
* character as a character token. Stay in the
* PLAINTEXT state.
*/
continue;
}
}
// XXX reorder point
case CLOSE_TAG_OPEN:
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Otherwise, if the content model flag is set to the PCDATA
* state, or if the next few characters do match that tag
* name, consume the next input character:
*/
switch (c) {
case '>':
/* U+003E GREATER-THAN SIGN (>) Parse error. */
errLtSlashGt();
/*
* Switch to the data state.
*/
cstart = pos + 1;
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '\r':
silentCarriageReturn();
/* Anything else Parse error. */
errGarbageAfterLtSlash();
/*
* Switch to the bogus comment state.
*/
clearStrBufBeforeUse();
appendStrBuf('\n');
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
/* Anything else Parse error. */
errGarbageAfterLtSlash();
/*
* Switch to the bogus comment state.
*/
clearStrBufBeforeUse();
appendStrBuf(c);
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
if (c >= 'A' && c <= 'Z') {
c += 0x20;
}
if (c >= 'a' && c <= 'z') {
/*
* U+0061 LATIN SMALL LETTER A through to U+007A
* LATIN SMALL LETTER Z Create a new end tag
* token,
*/
endTag = true;
/*
* set its tag name to the input character,
*/
clearStrBufBeforeUse();
appendStrBuf(c);
/*
* then switch to the tag name state. (Don't
* emit the token yet; further details will be
* filled in before it is emitted.)
*/
state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
continue stateloop;
} else {
/* Anything else Parse error. */
errGarbageAfterLtSlash();
/*
* Switch to the bogus comment state.
*/
clearStrBufBeforeUse();
appendStrBuf(c);
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
continue stateloop;
}
}
// XXX reorder point
case RCDATA:
rcdataloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
switch (c) {
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in RCDATA state.
*/
flushChars(buf, pos);
assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
appendCharRefBuf(c);
setAdditionalAndRememberAmpersandLocation('\u0000');
returnState = state;
state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
continue stateloop;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Switch to the
* RCDATA less-than sign state.
*/
flushChars(buf, pos);
returnState = state;
state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
continue stateloop;
case '\u0000':
emitReplacementCharacter(buf, pos);
continue;
case '\r':
emitCarriageReturn(buf, pos);
break stateloop;
case '\n':
silentLineFeed();
default:
/*
* Emit the current input character as a
* character token. Stay in the RCDATA state.
*/
continue;
}
}
// XXX reorder point
case RAWTEXT:
rawtextloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
switch (c) {
case '<':
/*
* U+003C LESS-THAN SIGN (<) Switch to the
* RAWTEXT less-than sign state.
*/
flushChars(buf, pos);
returnState = state;
state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
break rawtextloop;
// FALL THRU continue stateloop;
case '\u0000':
emitReplacementCharacter(buf, pos);
continue;
case '\r':
emitCarriageReturn(buf, pos);
break stateloop;
case '\n':
silentLineFeed();
default:
/*
* Emit the current input character as a
* character token. Stay in the RAWTEXT state.
*/
continue;
}
}
// XXX fallthru don't reorder
case RAWTEXT_RCDATA_LESS_THAN_SIGN:
rawtextrcdatalessthansignloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case '/':
/*
* U+002F SOLIDUS (/) Set the temporary buffer
* to the empty string. Switch to the script
* data end tag open state.
*/
index = 0;
clearStrBufBeforeUse();
state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
break rawtextrcdatalessthansignloop;
// FALL THRU continue stateloop;
default:
/*
* Otherwise, emit a U+003C LESS-THAN SIGN
* character token
*/
tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
/*
* and reconsume the current input character in
* the return state (returnState).
*/
cstart = pos;
reconsume = true;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
}
}
// XXX fall thru. don't reorder.
case NON_DATA_END_TAG_NAME:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* ASSERT! when entering this state, set index to 0 and
* call clearStrBufBeforeUse() assert (contentModelElement !=
* null); Let's implement the above without lookahead.
* strBuf is the 'temporary buffer'.
*/
if (index < endTagExpectationAsArray.length) {
char e = endTagExpectationAsArray[index];
char folded = c;
if (c >= 'A' && c <= 'Z') {
folded += 0x20;
}
if (folded != e) {
// [NOCPP[
errHtml4LtSlashInRcdata(folded);
// ]NOCPP]
tokenHandler.characters(Tokenizer.LT_SOLIDUS,
0, 2);
emitStrBuf();
cstart = pos;
reconsume = true;
state = transition(state, returnState, reconsume, pos);
continue stateloop;
}
appendStrBuf(c);
index++;
continue;
} else {
endTag = true;
// XXX replace contentModelElement with different
// type
tagName = endTagExpectation;
switch (c) {
case '\r':
silentCarriageReturn();
clearStrBufAfterUse(); // strBuf not used
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE
* FEED (LF) U+000C FORM FEED (FF) U+0020
* SPACE If the current end tag token is an
* appropriate end tag token, then switch to
* the before attribute name state.
*/
clearStrBufAfterUse(); // strBuf not used
state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
continue stateloop;
case '/':
/*
* U+002F SOLIDUS (/) If the current end tag
* token is an appropriate end tag token,
* then switch to the self-closing start tag
* state.
*/
clearStrBufAfterUse(); // strBuf not used
state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) If the
* current end tag token is an appropriate
* end tag token, then emit the current tag
* token and switch to the data state.
*/
clearStrBufAfterUse(); // strBuf not used
state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
if (shouldSuspend) {
break stateloop;
}
continue stateloop;
default:
/*
* Emit a U+003C LESS-THAN SIGN character
* token, a U+002F SOLIDUS character token,
* a character token for each of the
* characters in the temporary buffer (in
* the order they were added to the buffer),
* and reconsume the current input character
* in the RAWTEXT state.
*/
// [NOCPP[
errWarnLtSlashInRcdata();
// ]NOCPP]
tokenHandler.characters(
Tokenizer.LT_SOLIDUS, 0, 2);
emitStrBuf();
if (c == '\u0000') {
emitReplacementCharacter(buf, pos);
} else {
cstart = pos; // don't drop the
// character
}
state = transition(state, returnState, reconsume, pos);
continue stateloop;
}
}
}
// XXX reorder point
// BEGIN HOTSPOT WORKAROUND
case BOGUS_COMMENT:
boguscommentloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume every character up to and including the first
* U+003E GREATER-THAN SIGN character (>) or the end of
* the file (EOF), whichever comes first. Emit a comment
* token whose data is the concatenation of all the
* characters starting from and including the character
* that caused the state machine to switch into the
* bogus comment state, up to and including the
* character immediately before the last consumed
* character (i.e. up to the character just before the
* U+003E or EOF character). (If the comment was started
* by the end of the file (EOF), the token is empty.)
*
* Switch to the data state.
*
* If the end of the file was reached, reconsume the EOF
* character.
*/
switch (c) {
case '>':
emitComment(0, pos);
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '-':
appendStrBuf(c);
state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
break boguscommentloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
appendStrBuf(c);
continue;
}
}
// FALLTHRU DON'T REORDER
case BOGUS_COMMENT_HYPHEN:
boguscommenthyphenloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case '>':
// [NOCPP[
maybeAppendSpaceToBogusComment();
// ]NOCPP]
emitComment(0, pos);
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '-':
appendSecondHyphenToBogusComment();
continue boguscommenthyphenloop;
case '\r':
appendStrBufCarriageReturn();
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
break stateloop;
case '\n':
appendStrBufLineFeed();
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
continue stateloop;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
appendStrBuf(c);
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
continue stateloop;
}
}
// XXX reorder point
case SCRIPT_DATA:
scriptdataloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
switch (c) {
case '<':
/*
* U+003C LESS-THAN SIGN (<) Switch to the
* script data less-than sign state.
*/
flushChars(buf, pos);
returnState = state;
state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
break scriptdataloop; // FALL THRU continue
// stateloop;
case '\u0000':
emitReplacementCharacter(buf, pos);
continue;
case '\r':
emitCarriageReturn(buf, pos);
break stateloop;
case '\n':
silentLineFeed();
default:
/*
* Anything else Emit the current input
* character as a character token. Stay in the
* script data state.
*/
continue;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_LESS_THAN_SIGN:
scriptdatalessthansignloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case '/':
/*
* U+002F SOLIDUS (/) Set the temporary buffer
* to the empty string. Switch to the script
* data end tag open state.
*/
index = 0;
clearStrBufBeforeUse();
state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
continue stateloop;
case '!':
tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
cstart = pos;
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
break scriptdatalessthansignloop; // FALL THRU
// continue
// stateloop;
default:
/*
* Otherwise, emit a U+003C LESS-THAN SIGN
* character token
*/
tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
/*
* and reconsume the current input character in
* the script data state.
*/
cstart = pos;
reconsume = true;
state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_ESCAPE_START:
// Script data escape start state: entered after "<!" in script data.
// Checks for the first '-'; both switch arms leave the loop.
scriptdataescapestartloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Emit a U+002D
* HYPHEN-MINUS character token. Switch to the
* script data escape start dash state.
*/
// NOTE(review): nothing is emitted explicitly here; the '-'
// presumably stays in the pending character run and is flushed
// later -- confirm.
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
// Drops into the next case label (SCRIPT_DATA_ESCAPE_START_DASH).
break scriptdataescapestartloop; // FALL THRU
// continue
// stateloop;
default:
/*
* Anything else Reconsume the current input
* character in the script data state.
*/
reconsume = true;
state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_ESCAPE_START_DASH:
// Script data escape start dash state: entered after "<!-" in script
// data. A second '-' enters the escaped dash-dash state; anything else
// is reconsumed as plain script data.
scriptdataescapestartdashloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Emit a U+002D
* HYPHEN-MINUS character token. Switch to the
* script data escaped dash dash state.
*/
// NOTE(review): no explicit emit here either; the '-'
// presumably remains part of the pending character run --
// confirm.
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
// Drops into the next case label (SCRIPT_DATA_ESCAPED_DASH_DASH).
break scriptdataescapestartdashloop;
// continue stateloop;
default:
/*
* Anything else Reconsume the current input
* character in the script data state.
*/
reconsume = true;
state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_ESCAPED_DASH_DASH:
// Script data escaped dash dash state: at least two consecutive '-' have
// been seen inside escaped script data. Further '-' keep the loop
// running; '>' returns to plain script data; '<' goes to the escaped
// less-than state; everything else moves to SCRIPT_DATA_ESCAPED.
scriptdataescapeddashdashloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Emit a U+002D
* HYPHEN-MINUS character token. Stay in the
* script data escaped dash dash state.
*/
continue;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Switch to the
* script data escaped less-than sign state.
*/
flushChars(buf, pos);
// Target state is not the next case label, so re-dispatch
// through the outer switch.
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit a U+003E
* GREATER-THAN SIGN character token. Switch to
* the script data state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
continue stateloop;
case '\u0000':
emitReplacementCharacter(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
// Drops into the next case label (SCRIPT_DATA_ESCAPED).
break scriptdataescapeddashdashloop;
case '\r':
emitCarriageReturn(buf, pos);
// The new state is recorded before leaving the state loop, so
// processing resumes in SCRIPT_DATA_ESCAPED.
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
default:
/*
* Anything else Emit the current input
* character as a character token. Switch to the
* script data escaped state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
break scriptdataescapeddashdashloop;
// continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_ESCAPED:
// Script data escaped state. Ordinary characters keep the loop running;
// only '-', '<', CR or reaching endPos leave it.
scriptdataescapedloop: for (;;) {
if (reconsume) {
// Re-process the character already held in c.
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Emit a U+002D
* HYPHEN-MINUS character token. Switch to the
* script data escaped dash state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
// Drops into the next case label (SCRIPT_DATA_ESCAPED_DASH).
break scriptdataescapedloop; // FALL THRU
// continue
// stateloop;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Switch to the
* script data escaped less-than sign state.
*/
flushChars(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
continue stateloop;
case '\u0000':
emitReplacementCharacter(buf, pos);
continue;
case '\r':
emitCarriageReturn(buf, pos);
// Exits the state loop with state still SCRIPT_DATA_ESCAPED.
break stateloop;
case '\n':
silentLineFeed();
// fall thru
default:
/*
* Anything else Emit the current input
* character as a character token. Stay in the
* script data escaped state.
*/
continue;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_ESCAPED_DASH:
// Script data escaped dash state: one '-' has just been seen in escaped
// script data. Every switch arm leaves the loop, so the body runs at
// most once per entry.
scriptdataescapeddashloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Emit a U+002D
* HYPHEN-MINUS character token. Switch to the
* script data escaped dash dash state.
*/
// That case sits earlier in the switch, so it needs a
// re-dispatch rather than a fall-through.
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
continue stateloop;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Switch to the
* script data escaped less-than sign state.
*/
flushChars(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
// Drops into the next case label
// (SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN).
break scriptdataescapeddashloop;
// continue stateloop;
case '\u0000':
emitReplacementCharacter(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
continue stateloop;
case '\r':
emitCarriageReturn(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
default:
/*
* Anything else Emit the current input
* character as a character token. Switch to the
* script data escaped state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
// Script data escaped less-than sign state: looks at the character after
// a '<' seen in escaped script data. Every switch arm leaves the loop.
scriptdataescapedlessthanloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '/':
/*
* U+002F SOLIDUS (/) Set the temporary buffer
* to the empty string. Switch to the script
* data escaped end tag open state.
*/
// NOTE: as in the non-escaped case, the code goes to
// NON_DATA_END_TAG_NAME; returnState records
// SCRIPT_DATA_ESCAPED as the state to come back to.
index = 0;
clearStrBufBeforeUse();
returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
continue stateloop;
case 'S':
case 's':
/*
* U+0041 LATIN CAPITAL LETTER A through to
* U+005A LATIN CAPITAL LETTER Z Emit a U+003C
* LESS-THAN SIGN character token and the
* current input character as a character token.
*/
// NOTE: the quoted text covers all letters, but only 'S'/'s' is
// matched here; any other letter takes the default arm.
tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
cstart = pos;
// The 's' counts as already matched: the next state compares
// input against SCRIPT_ARR starting at this index.
index = 1;
/*
* Set the temporary buffer to the empty string.
* Append the lowercase version of the current
* input character (add 0x0020 to the
* character's code point) to the temporary
* buffer. Switch to the script data double
* escape start state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
// Drops into the next case label
// (SCRIPT_DATA_DOUBLE_ESCAPE_START).
break scriptdataescapedlessthanloop;
// continue stateloop;
default:
/*
* Anything else Emit a U+003C LESS-THAN SIGN
* character token and reconsume the current
* input character in the script data escaped
* state.
*/
tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
cstart = pos;
reconsume = true;
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_DOUBLE_ESCAPE_START:
// Script data double escape start state. Two phases per entry:
// 1. while index < 6, compare each ASCII-lowercased input character with
// SCRIPT_ARR[index]; a mismatch reconsumes in SCRIPT_DATA_ESCAPED;
// 2. once index reaches 6, the next character decides via the switch
// below whether double-escaped mode is entered.
scriptdatadoubleescapestartloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
// The previous state sets index to 1 before entering this one.
assert index > 0;
if (index < 6) { // SCRIPT_ARR.length
// Fold A-Z to lower case in a copy; c itself is left untouched.
char folded = c;
if (c >= 'A' && c <= 'Z') {
folded += 0x20;
}
if (folded != Tokenizer.SCRIPT_ARR[index]) {
reconsume = true;
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
continue stateloop;
}
index++;
continue;
}
switch (c) {
case '\r':
emitCarriageReturn(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
case '/':
case '>':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
* (>) Emit the current input character as a
* character token. If the temporary buffer is
* the string "script", then switch to the
* script data double escaped state.
*/
// NOTE: no temporary buffer is compared here; reaching this
// switch already means all SCRIPT_ARR positions matched.
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
// Drops into the next case label (SCRIPT_DATA_DOUBLE_ESCAPED).
break scriptdatadoubleescapestartloop;
// continue stateloop;
default:
/*
* Anything else Reconsume the current input
* character in the script data escaped state.
*/
reconsume = true;
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_DOUBLE_ESCAPED:
// Script data double escaped state. Ordinary characters keep the loop
// running; only '-', '<', CR or reaching endPos leave it.
scriptdatadoubleescapedloop: for (;;) {
if (reconsume) {
// Re-process the character already held in c.
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Emit a U+002D
* HYPHEN-MINUS character token. Switch to the
* script data double escaped dash state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
// Drops into the next case label
// (SCRIPT_DATA_DOUBLE_ESCAPED_DASH).
break scriptdatadoubleescapedloop; // FALL THRU
// continue
// stateloop;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Emit a U+003C
* LESS-THAN SIGN character token. Switch to the
* script data double escaped less-than sign
* state.
*/
// Unlike the single-escaped states, there is no flushChars call
// before this transition.
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
continue stateloop;
case '\u0000':
emitReplacementCharacter(buf, pos);
continue;
case '\r':
emitCarriageReturn(buf, pos);
// Exits the state loop with state still
// SCRIPT_DATA_DOUBLE_ESCAPED.
break stateloop;
case '\n':
silentLineFeed();
// fall thru
default:
/*
* Anything else Emit the current input
* character as a character token. Stay in the
* script data double escaped state.
*/
continue;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
// Script data double escaped dash state: one '-' has just been seen in
// double-escaped script data. Every switch arm leaves the loop.
scriptdatadoubleescapeddashloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Emit a U+002D
* HYPHEN-MINUS character token. Switch to the
* script data double escaped dash dash state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
// Drops into the next case label
// (SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH).
break scriptdatadoubleescapeddashloop;
// continue stateloop;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Emit a U+003C
* LESS-THAN SIGN character token. Switch to the
* script data double escaped less-than sign
* state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
continue stateloop;
case '\u0000':
emitReplacementCharacter(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
continue stateloop;
case '\r':
emitCarriageReturn(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
default:
/*
* Anything else Emit the current input
* character as a character token. Switch to the
* script data double escaped state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
// Script data double escaped dash dash state: at least two consecutive
// '-' have been seen in double-escaped script data. Further '-' keep
// the loop running; every other character leaves it.
scriptdatadoubleescapeddashdashloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '-':
/*
* U+002D HYPHEN-MINUS (-) Emit a U+002D
* HYPHEN-MINUS character token. Stay in the
* script data double escaped dash dash state.
*/
continue;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Emit a U+003C
* LESS-THAN SIGN character token. Switch to the
* script data double escaped less-than sign
* state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
// Drops into the next case label
// (SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN).
break scriptdatadoubleescapeddashdashloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit a U+003E
* GREATER-THAN SIGN character token. Switch to
* the script data state.
*/
// "-->" ends both escape levels at once: back to plain
// SCRIPT_DATA.
state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
continue stateloop;
case '\u0000':
emitReplacementCharacter(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
continue stateloop;
case '\r':
emitCarriageReturn(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
default:
/*
* Anything else Emit the current input
* character as a character token. Switch to the
* script data double escaped state.
*/
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
// Script data double escaped less-than sign state: looks at the
// character after a '<' in double-escaped script data. Only "</" is
// interesting; both switch arms leave the loop.
scriptdatadoubleescapedlessthanloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '/':
/*
* U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
* character token. Set the temporary buffer to
* the empty string. Switch to the script data
* double escape end state.
*/
// index = 0: the next state matches input against SCRIPT_ARR
// from its first element.
index = 0;
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
// Drops into the next case label
// (SCRIPT_DATA_DOUBLE_ESCAPE_END).
break scriptdatadoubleescapedlessthanloop;
default:
/*
* Anything else Reconsume the current input
* character in the script data double escaped
* state.
*/
reconsume = true;
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
continue stateloop;
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case SCRIPT_DATA_DOUBLE_ESCAPE_END:
// Script data double escape end state: entered after "</" in
// double-escaped script data. Mirrors the double escape start state:
// first match the six SCRIPT_ARR characters case-insensitively, then
// require a delimiter to drop back to the single-escaped state.
scriptdatadoubleescapeendloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
if (index < 6) { // SCRIPT_ARR.length
// Fold A-Z to lower case in a copy; c itself is left untouched.
char folded = c;
if (c >= 'A' && c <= 'Z') {
folded += 0x20;
}
if (folded != Tokenizer.SCRIPT_ARR[index]) {
// Mismatch: stay double-escaped and re-process this character.
reconsume = true;
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
continue stateloop;
}
index++;
continue;
}
switch (c) {
case '\r':
emitCarriageReturn(buf, pos);
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
case '/':
case '>':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
* (>) Emit the current input character as a
* character token. If the temporary buffer is
* the string "script", then switch to the
* script data escaped state.
*/
// NOTE: no temporary buffer is compared here; reaching this
// switch already means all SCRIPT_ARR positions matched.
state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
continue stateloop;
default:
/*
* Reconsume the current input character in the
* script data double escaped state.
*/
reconsume = true;
state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
continue stateloop;
}
}
// XXX reorder point
case MARKUP_DECLARATION_OCTYPE:
// Matches input case-insensitively against the six characters of OCTYPE,
// one character per iteration, using index as the position. Matched
// characters are also appended to strBuf in their original case.
markupdeclarationdoctypeloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
if (index < 6) { // OCTYPE.length
// Fold A-Z to lower case in a copy; c itself is left untouched.
char folded = c;
if (c >= 'A' && c <= 'Z') {
folded += 0x20;
}
if (folded == Tokenizer.OCTYPE[index]) {
appendStrBuf(c);
} else {
// Mismatch: report a bogus comment and re-process this
// character in BOGUS_COMMENT.
errBogusComment();
reconsume = true;
state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
continue stateloop;
}
index++;
continue;
} else {
// All six characters matched. The character just read belongs to
// the DOCTYPE state, so it is reconsumed there.
reconsume = true;
state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
// Drops into the next case label (DOCTYPE).
break markupdeclarationdoctypeloop;
// continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case DOCTYPE:
// DOCTYPE state: handles the single character that follows the DOCTYPE
// keyword. Every switch arm leaves the loop, so the body (including
// initDoctypeFields()) runs at most once per entry.
doctypeloop: for (;;) {
if (reconsume) {
// Re-process the character already held in c.
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
initDoctypeFields();
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
// The new state is recorded before leaving the state loop.
state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* Switch to the before DOCTYPE name state.
*/
state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
// Drops into the next case label (BEFORE_DOCTYPE_NAME).
break doctypeloop;
// continue stateloop;
default:
/*
* Anything else Parse error.
*/
errMissingSpaceBeforeDoctypeName();
/*
* Reconsume the current character in the before
* DOCTYPE name state.
*/
reconsume = true;
state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
break doctypeloop;
// continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case BEFORE_DOCTYPE_NAME:
// Before DOCTYPE name state: skips whitespace, then either emits a
// nameless (force-quirks) doctype on '>' or starts collecting the name
// in strBuf with the first name character.
beforedoctypenameloop: for (;;) {
if (reconsume) {
// Re-process the character already held in c.
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
// Exits the state loop with state still BEFORE_DOCTYPE_NAME.
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
* in the before DOCTYPE name state.
*/
continue;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Parse error.
*/
errNamelessDoctype();
/*
* Create a new DOCTYPE token. Set its
* force-quirks flag to on.
*/
forceQuirks = true;
/*
* Emit the token.
*/
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '\u0000':
// NUL becomes U+FFFD and is then handled like any other
// name character.
c = '\uFFFD';
// fall thru
default:
if (c >= 'A' && c <= 'Z') {
/*
* U+0041 LATIN CAPITAL LETTER A through to
* U+005A LATIN CAPITAL LETTER Z Create a
* new DOCTYPE token. Set the token's name
* to the lowercase version of the input
* character (add 0x0020 to the character's
* code point).
*/
c += 0x20;
}
/* Anything else Create a new DOCTYPE token. */
/*
* Set the token's name to the current
* input character.
*/
clearStrBufBeforeUse();
appendStrBuf(c);
/*
* Switch to the DOCTYPE name state.
*/
state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
// Drops into the next case label (DOCTYPE_NAME).
break beforedoctypenameloop;
// continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case DOCTYPE_NAME:
// DOCTYPE name state: appends lower-cased name characters to strBuf
// until whitespace or '>' ends the name. strBufToDoctypeName() is
// called on every path that ends the name.
// NOTE(review): presumably it stores the collected name on the pending
// doctype -- confirm against strBufToDoctypeName.
doctypenameloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
strBufToDoctypeName();
// The new state is recorded before leaving the state loop.
state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* Switch to the after DOCTYPE name state.
*/
strBufToDoctypeName();
state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
// Drops into the next case label (AFTER_DOCTYPE_NAME).
break doctypenameloop;
// continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* DOCTYPE token.
*/
strBufToDoctypeName();
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '\u0000':
// NUL becomes U+FFFD before being appended.
c = '\uFFFD';
// fall thru
default:
/*
* U+0041 LATIN CAPITAL LETTER A through to
* U+005A LATIN CAPITAL LETTER Z Append the
* lowercase version of the input character (add
* 0x0020 to the character's code point) to the
* current DOCTYPE token's name.
*/
if (c >= 'A' && c <= 'Z') {
c += 0x0020;
}
/*
* Anything else Append the current input
* character to the current DOCTYPE token's
* name.
*/
appendStrBuf(c);
/*
* Stay in the DOCTYPE name state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case AFTER_DOCTYPE_NAME:
// After DOCTYPE name state: skips whitespace, emits the doctype on '>',
// and uses the first letter to choose between the PUBLIC and SYSTEM
// keyword matchers. Anything else makes the doctype bogus.
afterdoctypenameloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
// Exits the state loop with state still AFTER_DOCTYPE_NAME.
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
* in the after DOCTYPE name state.
*/
continue;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* DOCTYPE token.
*/
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case 'p':
case 'P':
// The 'P' is consumed here; DOCTYPE_UBLIC matches the remaining
// keyword letters starting from index 0.
index = 0;
state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
// Drops into the next case label (DOCTYPE_UBLIC).
break afterdoctypenameloop;
// continue stateloop;
case 's':
case 'S':
// The 'S' is consumed here; DOCTYPE_YSTEM is not the next case
// label, so it is reached by re-dispatching through the switch.
index = 0;
state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
continue stateloop;
default:
/*
* Otherwise, this is the parse error.
*/
bogusDoctype();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
// done by bogusDoctype();
/*
* Switch to the bogus DOCTYPE state.
*/
state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case DOCTYPE_UBLIC:
// Matches the five characters of UBLIC case-insensitively, one per
// iteration, using index as the position. The leading 'P' was already
// consumed by AFTER_DOCTYPE_NAME.
doctypeublicloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* If the six characters starting from the current input
* character are an ASCII case-insensitive match for the
* word "PUBLIC", then consume those characters and
* switch to the before DOCTYPE public identifier state.
*/
if (index < 5) { // UBLIC.length
// Fold A-Z to lower case in a copy; c itself is left untouched.
char folded = c;
if (c >= 'A' && c <= 'Z') {
folded += 0x20;
}
if (folded != Tokenizer.UBLIC[index]) {
// Mismatch: the doctype is bogus; re-process this character
// in BOGUS_DOCTYPE.
bogusDoctype();
// forceQuirks = true;
reconsume = true;
state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
continue stateloop;
}
index++;
continue;
} else {
// Keyword fully matched. The character just read is the first
// one after the keyword, so it is reconsumed in the next state.
reconsume = true;
state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
// Drops into the next case label (AFTER_DOCTYPE_PUBLIC_KEYWORD).
break doctypeublicloop;
// continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case AFTER_DOCTYPE_PUBLIC_KEYWORD:
// After DOCTYPE public keyword state: handles the single character that
// follows the PUBLIC keyword. Every switch arm leaves the loop, so the
// body runs at most once per entry.
afterdoctypepublickeywordloop: for (;;) {
if (reconsume) {
// Re-process the character already held in c.
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
// The new state is recorded before leaving the state loop.
state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* Switch to the before DOCTYPE public
* identifier state.
*/
state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
// Drops into the next case label
// (BEFORE_DOCTYPE_PUBLIC_IDENTIFIER).
break afterdoctypepublickeywordloop;
// FALL THROUGH continue stateloop
case '"':
/*
* U+0022 QUOTATION MARK (") Parse Error.
*/
errNoSpaceBetweenDoctypePublicKeywordAndQuote();
/*
* Set the DOCTYPE token's public identifier to
* the empty string (not missing),
*/
// strBuf is cleared so the identifier can be collected in it.
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE public identifier
* (double-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
continue stateloop;
case '\'':
/*
* U+0027 APOSTROPHE (') Parse Error.
*/
errNoSpaceBetweenDoctypePublicKeywordAndQuote();
/*
* Set the DOCTYPE token's public identifier to
* the empty string (not missing),
*/
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE public identifier
* (single-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
continue stateloop;
case '>':
/* U+003E GREATER-THAN SIGN (>) Parse error. */
errExpectedPublicId();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
default:
bogusDoctype();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
// done by bogusDoctype();
/*
* Switch to the bogus DOCTYPE state.
*/
state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
// Before DOCTYPE public identifier state: skips whitespace and waits for
// the opening quote of the public identifier. Unlike the preceding
// state, a quote here reports no error.
beforedoctypepublicidentifierloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
// Exits the state loop with state unchanged.
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
* in the before DOCTYPE public identifier
* state.
*/
continue;
case '"':
/*
* U+0022 QUOTATION MARK (") Set the DOCTYPE
* token's public identifier to the empty string
* (not missing),
*/
// strBuf is cleared so the identifier can be collected in it.
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE public identifier
* (double-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
// Drops into the next case label
// (DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED).
break beforedoctypepublicidentifierloop;
// continue stateloop;
case '\'':
/*
* U+0027 APOSTROPHE (') Set the DOCTYPE token's
* public identifier to the empty string (not
* missing),
*/
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE public identifier
* (single-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
continue stateloop;
case '>':
/* U+003E GREATER-THAN SIGN (>) Parse error. */
errExpectedPublicId();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
default:
bogusDoctype();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
// done by bogusDoctype();
/*
* Switch to the bogus DOCTYPE state.
*/
state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
// DOCTYPE public identifier (double-quoted) state: accumulates
// characters in strBuf until the closing '"', or a premature '>',
// turns strBuf into publicIdentifier.
doctypepublicidentifierdoublequotedloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '"':
/*
* U+0022 QUOTATION MARK (") Switch to the after
* DOCTYPE public identifier state.
*/
publicIdentifier = strBufToString();
state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
// Drops into the next case label
// (AFTER_DOCTYPE_PUBLIC_IDENTIFIER).
break doctypepublicidentifierdoublequotedloop;
// continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Parse error.
*/
errGtInPublicId();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
// Whatever was collected so far becomes the public identifier.
publicIdentifier = strBufToString();
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
// Exits the state loop with state unchanged.
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
// NUL becomes U+FFFD before being appended.
c = '\uFFFD';
// fall thru
default:
/*
* Anything else Append the current input
* character to the current DOCTYPE token's
* public identifier.
*/
appendStrBuf(c);
/*
* Stay in the DOCTYPE public identifier
* (double-quoted) state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
// After DOCTYPE public identifier state: handles the single character
// after the closing quote of the public identifier. Every switch arm
// leaves the loop, so the body runs at most once per entry.
afterdoctypepublicidentifierloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
// The new state is recorded before leaving the state loop.
state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* Switch to the between DOCTYPE public and
* system identifiers state.
*/
state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
// Drops into the next case label
// (BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS).
break afterdoctypepublicidentifierloop;
// continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* DOCTYPE token.
*/
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '"':
/*
* U+0022 QUOTATION MARK (") Parse error.
*/
errNoSpaceBetweenPublicAndSystemIds();
/*
* Set the DOCTYPE token's system identifier to
* the empty string (not missing),
*/
// strBuf is cleared so the system identifier can be collected
// in it.
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE system identifier
* (double-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
continue stateloop;
case '\'':
/*
* U+0027 APOSTROPHE (') Parse error.
*/
errNoSpaceBetweenPublicAndSystemIds();
/*
* Set the DOCTYPE token's system identifier to
* the empty string (not missing),
*/
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE system identifier
* (single-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
continue stateloop;
default:
bogusDoctype();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
// done by bogusDoctype();
/*
* Switch to the bogus DOCTYPE state.
*/
state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
// Between DOCTYPE public and system identifiers state: skips whitespace
// after the public identifier, then either emits the doctype on '>' or
// starts a quoted system identifier. Quotes here report no error.
betweendoctypepublicandsystemidentifiersloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
// Exits the state loop with state unchanged.
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
* in the between DOCTYPE public and system
* identifiers state.
*/
continue;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* DOCTYPE token.
*/
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '"':
/*
* U+0022 QUOTATION MARK (") Set the DOCTYPE
* token's system identifier to the empty string
* (not missing),
*/
// strBuf is cleared so the system identifier can be collected
// in it.
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE system identifier
* (double-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
// Drops into the next case label
// (DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED).
break betweendoctypepublicandsystemidentifiersloop;
// continue stateloop;
case '\'':
/*
* U+0027 APOSTROPHE (') Set the DOCTYPE token's
* system identifier to the empty string (not
* missing),
*/
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE system identifier
* (single-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
continue stateloop;
default:
bogusDoctype();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
// done by bogusDoctype();
/*
* Switch to the bogus DOCTYPE state.
*/
state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
// DOCTYPE system identifier (double-quoted) state: accumulates
// characters in strBuf until the closing '"', or a premature '>',
// turns strBuf into systemIdentifier.
doctypesystemidentifierdoublequotedloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '"':
/*
* U+0022 QUOTATION MARK (") Switch to the after
* DOCTYPE system identifier state.
*/
systemIdentifier = strBufToString();
state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
// The target is the next case label, but this arm re-dispatches
// through the switch instead of breaking out of the loop.
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Parse error.
*/
errGtInSystemId();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
// Whatever was collected so far becomes the system identifier.
systemIdentifier = strBufToString();
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
// Exits the state loop with state unchanged.
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
// NUL becomes U+FFFD before being appended.
c = '\uFFFD';
// fall thru
default:
/*
* Anything else Append the current input
* character to the current DOCTYPE token's
* system identifier.
*/
appendStrBuf(c);
/*
* Stay in the DOCTYPE system identifier
* (double-quoted) state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
// After DOCTYPE system identifier state: skips whitespace and waits for
// the closing '>'. Any other character sends the tokenizer to
// BOGUS_DOCTYPE via bogusDoctypeWithoutQuirks().
afterdoctypesystemidentifierloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
// Exits the state loop with state unchanged.
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
* in the after DOCTYPE system identifier state.
*/
continue;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* DOCTYPE token.
*/
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
default:
/*
* Switch to the bogus DOCTYPE state. (This does
* not set the DOCTYPE token's force-quirks flag
* to on.)
*/
bogusDoctypeWithoutQuirks();
// reconsume is not set here, so BOGUS_DOCTYPE starts with the
// following character.
state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
// Drops into the next case label (BOGUS_DOCTYPE).
break afterdoctypesystemidentifierloop;
// continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case BOGUS_DOCTYPE:
for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit that
* DOCTYPE token.
*/
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '\r':
silentCarriageReturn();
break stateloop;
case '\n':
silentLineFeed();
// fall thru
default:
/*
* Anything else Stay in the bogus DOCTYPE
* state.
*/
continue;
}
}
// XXX reorder point
case DOCTYPE_YSTEM:
doctypeystemloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Otherwise, if the six characters starting from the
* current input character are an ASCII case-insensitive
* match for the word "SYSTEM", then consume those
* characters and switch to the before DOCTYPE system
* identifier state.
*/
if (index < 5) { // YSTEM.length
char folded = c;
if (c >= 'A' && c <= 'Z') {
folded += 0x20;
}
if (folded != Tokenizer.YSTEM[index]) {
bogusDoctype();
reconsume = true;
state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
continue stateloop;
}
index++;
continue stateloop;
} else {
reconsume = true;
state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
break doctypeystemloop;
// continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case AFTER_DOCTYPE_SYSTEM_KEYWORD:
afterdoctypesystemkeywordloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE
* Switch to the before DOCTYPE public
* identifier state.
*/
state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
break afterdoctypesystemkeywordloop;
// FALL THROUGH continue stateloop
case '"':
/*
* U+0022 QUOTATION MARK (") Parse Error.
*/
errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
/*
* Set the DOCTYPE token's system identifier to
* the empty string (not missing),
*/
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE public identifier
* (double-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
continue stateloop;
case '\'':
/*
* U+0027 APOSTROPHE (') Parse Error.
*/
errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
/*
* Set the DOCTYPE token's public identifier to
* the empty string (not missing),
*/
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE public identifier
* (single-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
continue stateloop;
case '>':
/* U+003E GREATER-THAN SIGN (>) Parse error. */
errExpectedPublicId();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
default:
bogusDoctype();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
// done by bogusDoctype();
/*
* Switch to the bogus DOCTYPE state.
*/
state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
beforedoctypesystemidentifierloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\r':
silentCarriageReturn();
break stateloop;
case '\n':
silentLineFeed();
// fall thru
case ' ':
case '\t':
case '\u000C':
/*
* U+0009 CHARACTER TABULATION U+000A LINE FEED
* (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
* in the before DOCTYPE system identifier
* state.
*/
continue;
case '"':
/*
* U+0022 QUOTATION MARK (") Set the DOCTYPE
* token's system identifier to the empty string
* (not missing),
*/
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE system identifier
* (double-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
continue stateloop;
case '\'':
/*
* U+0027 APOSTROPHE (') Set the DOCTYPE token's
* system identifier to the empty string (not
* missing),
*/
clearStrBufBeforeUse();
/*
* then switch to the DOCTYPE system identifier
* (single-quoted) state.
*/
state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
break beforedoctypesystemidentifierloop;
// continue stateloop;
case '>':
/* U+003E GREATER-THAN SIGN (>) Parse error. */
errExpectedSystemId();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
default:
bogusDoctype();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
// done by bogusDoctype();
/*
* Switch to the bogus DOCTYPE state.
*/
state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\'':
/*
* U+0027 APOSTROPHE (') Switch to the after
* DOCTYPE system identifier state.
*/
systemIdentifier = strBufToString();
state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
continue stateloop;
case '>':
errGtInSystemId();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
systemIdentifier = strBufToString();
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
/*
* Anything else Append the current input
* character to the current DOCTYPE token's
* system identifier.
*/
appendStrBuf(c);
/*
* Stay in the DOCTYPE system identifier
* (double-quoted) state.
*/
continue;
}
}
// XXX reorder point
case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\'':
/*
* U+0027 APOSTROPHE (') Switch to the after
* DOCTYPE public identifier state.
*/
publicIdentifier = strBufToString();
state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
continue stateloop;
case '>':
errGtInPublicId();
/*
* Set the DOCTYPE token's force-quirks flag to
* on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
publicIdentifier = strBufToString();
emitDoctypeToken(pos);
/*
* Switch to the data state.
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
/*
* Anything else Append the current input
* character to the current DOCTYPE token's
* public identifier.
*/
appendStrBuf(c);
/*
* Stay in the DOCTYPE public identifier
* (single-quoted) state.
*/
continue;
}
}
// XXX reorder point
case PROCESSING_INSTRUCTION:
processinginstructionloop: for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case '?':
state = transition(
state,
Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
reconsume, pos);
break processinginstructionloop;
// continue stateloop;
default:
continue;
}
}
case PROCESSING_INSTRUCTION_QUESTION_MARK:
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case '>':
state = transition(state, Tokenizer.DATA,
reconsume, pos);
continue stateloop;
default:
state = transition(state,
Tokenizer.PROCESSING_INSTRUCTION,
reconsume, pos);
continue stateloop;
}
// END HOTSPOT WORKAROUND
}
}
flushChars(buf, pos);
/*
* if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
*/
// Save locals
stateSave = state;
returnStateSave = returnState;
return pos;
}
// HOTSPOT WORKAROUND INSERTION POINT
// [NOCPP[
/**
 * Hook called by the state loop on every state change. This implementation
 * ignores everything except the target state and simply hands it back.
 *
 * @param from the state being left (unused here)
 * @param to the state being entered
 * @param reconsume the caller's current reconsume flag (unused here)
 * @param pos the caller's current buffer position (unused here)
 * @return {@code to}, which the caller assigns to its state variable
 * @throws SAXException not thrown by this implementation
 */
protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
return to;
}
// ]NOCPP]
/**
 * Puts the DOCTYPE-related fields into their "fresh token" state: empty
 * name, no public or system identifier, force-quirks off.
 */
private void initDoctypeFields() {
    // strBuf still holds the characters "DOCTYPE", which were being
    // collected in case the construct turned out to be a bogus comment.
    // They are not part of the DOCTYPE token, so drop them.
    clearStrBufAfterUse();
    forceQuirks = false;
    doctypeName = "";
    if (publicIdentifier != null) {
        Portability.releaseString(publicIdentifier);
        publicIdentifier = null;
    }
    if (systemIdentifier != null) {
        Portability.releaseString(systemIdentifier);
        systemIdentifier = null;
    }
}
/**
 * Carriage-return variant of the double-hyphen-adjusting append: records
 * the line break via {@link #silentCarriageReturn()} and then appends a
 * line feed (not the CR itself) through
 * {@code adjustDoubleHyphenAndAppendToStrBufAndErr}.
 *
 * @throws SAXException if the append/error helper throws
 */
@Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
throws SAXException {
silentCarriageReturn();
adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
}
/**
 * Line-feed variant of the double-hyphen-adjusting append: bumps the line
 * counter via {@link #silentLineFeed()} and then appends the line feed
 * through {@code adjustDoubleHyphenAndAppendToStrBufAndErr}.
 *
 * @throws SAXException if the append/error helper throws
 */
@Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
throws SAXException {
silentLineFeed();
adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
}
/**
 * Counts a line feed (see {@link #silentLineFeed()}) and appends
 * {@code '\n'} to {@code strBuf}.
 */
@Inline private void appendStrBufLineFeed() {
silentLineFeed();
appendStrBuf('\n');
}
/**
 * Counts a carriage return (see {@link #silentCarriageReturn()}) and
 * appends a line feed, not the CR itself, to {@code strBuf}.
 */
@Inline private void appendStrBufCarriageReturn() {
silentCarriageReturn();
// The CR is stored as LF in the buffer.
appendStrBuf('\n');
}
/**
 * Records a carriage return without emitting anything: increments
 * {@code line} and sets {@code lastCR}.
 */
@Inline protected void silentCarriageReturn() {
++line;
// NOTE(review): lastCR is presumably consulted when the next character is
// read so that the LF of a CRLF pair is not counted twice — confirm in the
// state loop prologue.
lastCR = true;
}
/**
 * Records a line feed without emitting anything: increments {@code line}.
 */
@Inline protected void silentLineFeed() {
++line;
}
/**
 * Handles a carriage return in character data: records the line break,
 * flushes the characters pending before {@code pos}, and reports a single
 * line feed to the token handler in place of the CR.
 *
 * @param buf the input buffer currently being tokenized
 * @param pos index of the carriage return in {@code buf}
 * @throws SAXException if flushing or the token handler throws
 */
private void emitCarriageReturn(@NoLength char[] buf, int pos)
throws SAXException {
silentCarriageReturn();
flushChars(buf, pos);
tokenHandler.characters(Tokenizer.LF, 0, 1);
// NOTE(review): Integer.MAX_VALUE presumably marks "no pending character
// run" until the caller establishes a new start — confirm against
// flushChars.
cstart = Integer.MAX_VALUE;
}
/**
 * Flushes the characters pending before {@code pos} and then asks the
 * token handler to emit a zero-originating replacement character.
 *
 * @param buf the input buffer currently being tokenized
 * @param pos index of the character being replaced
 * @throws SAXException if flushing or the token handler throws
 */
private void emitReplacementCharacter(@NoLength char[] buf, int pos)
throws SAXException {
flushChars(buf, pos);
tokenHandler.zeroOriginatingReplacementCharacter();
// Restart the pending run after the replaced character.
cstart = pos + 1;
}
/**
 * Flushes the characters pending before {@code pos} and then emits
 * {@code REPLACEMENT_CHARACTER} as ordinary character data.
 *
 * @param buf the input buffer currently being tokenized
 * @param pos index of the character being replaced
 * @throws SAXException if flushing or the token handler throws
 */
private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
throws SAXException {
flushChars(buf, pos);
tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
// Restart the pending run after the replaced character.
cstart = pos + 1;
}
/**
 * Stores {@code add} in the {@code additional} field and, in the Java
 * build only, snapshots the current location into
 * {@code ampersandLocation}.
 *
 * @param add the value to store in {@code additional}; NOTE(review):
 *        presumably the "additional allowed character" of the spec's
 *        character-reference algorithm — confirm at the call sites
 */
private void setAdditionalAndRememberAmpersandLocation(char add) {
additional = add;
// [NOCPP[
ampersandLocation = new LocatorImpl(this);
// ]NOCPP]
}
/**
 * Reports a bogus DOCTYPE and turns the force-quirks flag on. The caller
 * is responsible for switching to the bogus DOCTYPE state.
 *
 * @throws SAXException if the error hook throws
 */
private void bogusDoctype() throws SAXException {
errBogusDoctype();
forceQuirks = true;
}
/**
 * Reports a bogus DOCTYPE like {@link #bogusDoctype()}, but sets the
 * force-quirks flag to off instead of on.
 *
 * @throws SAXException if the error hook throws
 */
private void bogusDoctypeWithoutQuirks() throws SAXException {
errBogusDoctype();
forceQuirks = false;
}
/**
 * Turns the numeric character reference accumulated in the {@code value}
 * field into output. Depending on {@code returnState} the resulting
 * character(s) are either appended to {@code strBuf} or sent to the token
 * handler (see {@code emitOrAppendOne} / {@code emitOrAppendTwo}).
 *
 * Values in 0x80-0x9F are remapped through the Windows-1252 table, zero
 * and surrogates become U+FFFD, values above 0x10FFFF become U+FFFD, and
 * values above 0xFFFF are written as a surrogate pair.
 *
 * @param returnState the state the tokenizer returns to after the
 *        reference; selects between appending and emitting
 * @throws SAXException if an error hook or the token handler throws
 */
private void handleNcrValue(int returnState) throws SAXException {
/*
* If one or more characters match the range, then take them all and
* interpret the string of characters as a number (either hexadecimal or
* decimal as appropriate).
*/
if (value <= 0xFFFF) {
if (value >= 0x80 && value <= 0x9f) {
/*
* If that number is one of the numbers in the first column of
* the following table, then this is a parse error.
*/
errNcrInC1Range();
/*
* Find the row with that number in the first column, and return
* a character token for the Unicode character given in the
* second column of that row.
*/
@NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
emitOrAppendOne(val, returnState);
// [NOCPP[
} else if (value == 0xC
&& contentSpacePolicy != XmlViolationPolicy.ALLOW) {
// Java build only: a form feed is not XML 1.0 white space, so apply
// the configured policy instead of passing it through.
if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
emitOrAppendOne(Tokenizer.SPACE, returnState);
} else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
}
// ]NOCPP]
} else if (value == 0x0) {
errNcrZero();
emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
} else if ((value & 0xF800) == 0xD800) {
// 0xD800-0xDFFF: a lone surrogate is replaced by U+FFFD.
errNcrSurrogate();
emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
} else {
/*
* Otherwise, return a character token for the Unicode character
* whose code point is that number.
*/
char ch = (char) value;
// [NOCPP[
if (value == 0x0D) {
errNcrCr();
} else if ((value <= 0x0008) || (value == 0x000B)
|| (value >= 0x000E && value <= 0x001F)) {
ch = errNcrControlChar(ch);
} else if (value >= 0xFDD0 && value <= 0xFDEF) {
errNcrUnassigned();
} else if ((value & 0xFFFE) == 0xFFFE) {
ch = errNcrNonCharacter(ch);
} else if (value >= 0x007F && value <= 0x009F) {
// 0x80-0x9F was already handled by the first branch above, so only
// 0x7F can reach this point.
errNcrControlChar();
} else {
maybeWarnPrivateUse(ch);
}
// ]NOCPP]
bmpChar[0] = ch;
emitOrAppendOne(bmpChar, returnState);
}
} else if (value <= 0x10FFFF) {
// [NOCPP[
maybeWarnPrivateUseAstral();
if ((value & 0xFFFE) == 0xFFFE) {
errAstralNonCharacter(value);
}
// ]NOCPP]
// Write the supplementary code point as a UTF-16 surrogate pair.
// NOTE(review): assumes LEAD_OFFSET is 0xD800 - (0x10000 >> 10) — confirm
// at its declaration.
astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
emitOrAppendTwo(astralChar, returnState);
} else {
errNcrOutOfRange();
emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
}
}
/**
 * Signals end of input. Starting from the state saved by the last
 * tokenization pass ({@code stateSave} / {@code returnStateSave}), runs
 * the end-of-file handling for that state: reports the matching error and
 * emits whatever is pending (characters, a comment or a DOCTYPE). It then
 * calls {@code tokenHandler.eof()}.
 *
 * The character-reference states first resolve the pending reference, then
 * set the state to the return state and loop again, so that the return
 * state's own end-of-file handling runs as well.
 *
 * @throws SAXException if an error hook or the token handler throws
 */
public void eof() throws SAXException {
int state = stateSave;
int returnState = returnStateSave;
eofloop: for (;;) {
switch (state) {
case SCRIPT_DATA_LESS_THAN_SIGN:
case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
/*
* Otherwise, emit a U+003C LESS-THAN SIGN character token
*/
tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
/*
* and reconsume the current input character in the data
* state.
*/
break eofloop;
case TAG_OPEN:
/*
* The behavior of this state depends on the content model
* flag.
*/
/*
* Anything else Parse error.
*/
errEofAfterLt();
/*
* Emit a U+003C LESS-THAN SIGN character token
*/
tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
/*
* and reconsume the current input character in the data
* state.
*/
break eofloop;
case RAWTEXT_RCDATA_LESS_THAN_SIGN:
/*
* Emit a U+003C LESS-THAN SIGN character token
*/
tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
/*
* and reconsume the current input character in the RCDATA
* state.
*/
break eofloop;
case NON_DATA_END_TAG_NAME:
/*
* Emit a U+003C LESS-THAN SIGN character token, a U+002F
* SOLIDUS character token,
*/
tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
/*
* a character token for each of the characters in the
* temporary buffer (in the order they were added to the
* buffer),
*/
emitStrBuf();
/*
* and reconsume the current input character in the RCDATA
* state.
*/
break eofloop;
case CLOSE_TAG_OPEN:
/* EOF Parse error. */
errEofAfterLt();
/*
* Emit a U+003C LESS-THAN SIGN character token and a U+002F
* SOLIDUS character token.
*/
tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case TAG_NAME:
/*
* EOF Parse error.
*/
errEofInTagName();
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case BEFORE_ATTRIBUTE_NAME:
case AFTER_ATTRIBUTE_VALUE_QUOTED:
case SELF_CLOSING_START_TAG:
/* EOF Parse error. */
errEofWithoutGt();
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case ATTRIBUTE_NAME:
/*
* EOF Parse error.
*/
errEofInAttributeName();
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case AFTER_ATTRIBUTE_NAME:
case BEFORE_ATTRIBUTE_VALUE:
/* EOF Parse error. */
errEofWithoutGt();
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
case ATTRIBUTE_VALUE_SINGLE_QUOTED:
case ATTRIBUTE_VALUE_UNQUOTED:
/* EOF Parse error. */
errEofInAttributeValue();
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case BOGUS_COMMENT:
emitComment(0, 0);
break eofloop;
case BOGUS_COMMENT_HYPHEN:
// [NOCPP[
maybeAppendSpaceToBogusComment();
// ]NOCPP]
emitComment(0, 0);
break eofloop;
case MARKUP_DECLARATION_OPEN:
errBogusComment();
emitComment(0, 0);
break eofloop;
case MARKUP_DECLARATION_HYPHEN:
errBogusComment();
emitComment(0, 0);
break eofloop;
case MARKUP_DECLARATION_OCTYPE:
// index < 6 means the "OCTYPE" keyword was not matched completely, so
// the input so far is treated as a bogus comment rather than a DOCTYPE.
if (index < 6) {
errBogusComment();
emitComment(0, 0);
} else {
/* EOF Parse error. */
errEofInDoctype();
/*
* Create a new DOCTYPE token. Set its force-quirks flag
* to on.
*/
doctypeName = "";
if (systemIdentifier != null) {
Portability.releaseString(systemIdentifier);
systemIdentifier = null;
}
if (publicIdentifier != null) {
Portability.releaseString(publicIdentifier);
publicIdentifier = null;
}
forceQuirks = true;
/*
* Emit the token.
*/
emitDoctypeToken(0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
}
break eofloop;
case COMMENT_START:
case COMMENT:
/*
* EOF Parse error.
*/
errEofInComment();
/* Emit the comment token. */
emitComment(0, 0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case COMMENT_END:
errEofInComment();
/* Emit the comment token. */
// NOTE(review): the first argument presumably tells emitComment how many
// provisionally buffered trailing characters ("--" here) to leave out —
// confirm against emitComment.
emitComment(2, 0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case COMMENT_END_DASH:
case COMMENT_START_DASH:
errEofInComment();
/* Emit the comment token. */
emitComment(1, 0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case COMMENT_END_BANG:
errEofInComment();
/* Emit the comment token. */
emitComment(3, 0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case DOCTYPE:
case BEFORE_DOCTYPE_NAME:
errEofInDoctype();
/*
* Create a new DOCTYPE token. Set its force-quirks flag to
* on.
*/
forceQuirks = true;
/*
* Emit the token.
*/
emitDoctypeToken(0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case DOCTYPE_NAME:
errEofInDoctype();
strBufToDoctypeName();
/*
* Set the DOCTYPE token's force-quirks flag to on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
emitDoctypeToken(0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case DOCTYPE_UBLIC:
case DOCTYPE_YSTEM:
case AFTER_DOCTYPE_NAME:
case AFTER_DOCTYPE_PUBLIC_KEYWORD:
case AFTER_DOCTYPE_SYSTEM_KEYWORD:
case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
errEofInDoctype();
/*
* Set the DOCTYPE token's force-quirks flag to on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
emitDoctypeToken(0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
/* EOF Parse error. */
errEofInPublicId();
/*
* Set the DOCTYPE token's force-quirks flag to on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
// The partially read identifier collected in strBuf is kept.
publicIdentifier = strBufToString();
emitDoctypeToken(0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
errEofInDoctype();
/*
* Set the DOCTYPE token's force-quirks flag to on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
emitDoctypeToken(0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
/* EOF Parse error. */
errEofInSystemId();
/*
* Set the DOCTYPE token's force-quirks flag to on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
// The partially read identifier collected in strBuf is kept.
systemIdentifier = strBufToString();
emitDoctypeToken(0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
errEofInDoctype();
/*
* Set the DOCTYPE token's force-quirks flag to on.
*/
forceQuirks = true;
/*
* Emit that DOCTYPE token.
*/
emitDoctypeToken(0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case BOGUS_DOCTYPE:
/*
* Emit that DOCTYPE token.
*/
emitDoctypeToken(0);
/*
* Reconsume the EOF character in the data state.
*/
break eofloop;
case CONSUME_CHARACTER_REFERENCE:
/*
* Unlike the definition in the spec, this state does not
* return a value and never requires the caller to
* backtrack. This state takes care of emitting characters
* or appending to the current attribute value. It also
* takes care of that in the case when consuming the entity
* fails.
*/
/*
* This section defines how to consume an entity. This
* definition is used when parsing entities in text and in
* attributes.
*
* The behavior depends on the identity of the next
* character (the one immediately after the U+0026 AMPERSAND
* character):
*/
emitOrAppendCharRefBuf(returnState);
// Loop again so that the return state's own EOF handling runs.
state = returnState;
continue;
case CHARACTER_REFERENCE_HILO_LOOKUP:
errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
state = returnState;
continue;
case CHARACTER_REFERENCE_TAIL:
outer: for (;;) {
// There is no next input character at EOF; U+0000 stands in for it in
// the comparisons below.
char c = '\u0000';
entCol++;
/*
* Consume the maximum number of characters possible,
* with the consumed characters matching one of the
* identifiers in the first column of the named
* character references table (in a case-sensitive
* manner).
*/
hiloop: for (;;) {
if (hi == -1) {
break hiloop;
}
if (entCol == NamedCharacters.NAMES[hi].length()) {
break hiloop;
}
if (entCol > NamedCharacters.NAMES[hi].length()) {
break outer;
} else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
hi--;
} else {
break hiloop;
}
}
loloop: for (;;) {
if (hi < lo) {
break outer;
}
if (entCol == NamedCharacters.NAMES[lo].length()) {
// The name at lo has been matched in full: remember it as the best
// candidate so far, together with how much of charRefBuf it covers.
candidate = lo;
charRefBufMark = charRefBufLen;
lo++;
} else if (entCol > NamedCharacters.NAMES[lo].length()) {
break outer;
} else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
lo++;
} else {
break loloop;
}
}
if (hi < lo) {
break outer;
}
continue;
}
if (candidate == -1) {
/*
* If no match can be made, then this is a parse error.
*/
errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
state = returnState;
continue eofloop;
} else {
@Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
if (candidateName.length() == 0
|| candidateName.charAt(candidateName.length() - 1) != ';') {
/*
* If the last character matched is not a U+003B
* SEMICOLON (;), there is a parse error.
*/
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
/*
* If the entity is being consumed as part of an
* attribute, and the last character matched is
* not a U+003B SEMICOLON (;),
*/
char ch;
if (charRefBufMark == charRefBufLen) {
ch = '\u0000';
} else {
ch = charRefBuf[charRefBufMark];
}
if ((ch >= '0' && ch <= '9')
|| (ch >= 'A' && ch <= 'Z')
|| (ch >= 'a' && ch <= 'z')) {
/*
* and the next character is in the range
* U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
* U+0041 LATIN CAPITAL LETTER A to U+005A
* LATIN CAPITAL LETTER Z, or U+0061 LATIN
* SMALL LETTER A to U+007A LATIN SMALL
* LETTER Z, then, for historical reasons,
* all the characters that were matched
* after the U+0026 AMPERSAND (&) must be
* unconsumed, and nothing is returned.
*/
errNoNamedCharacterMatch();
appendCharRefBufToStrBuf();
state = returnState;
continue eofloop;
}
}
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
errUnescapedAmpersandInterpretedAsCharacterReference();
} else {
errNotSemicolonTerminated();
}
}
/*
* Otherwise, return a character token for the character
* corresponding to the entity name (as given by the
* second column of the named character references
* table).
*/
@Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
if (
// [NOCPP[
val.length == 1
// ]NOCPP]
// CPPONLY: val[1] == 0
) {
emitOrAppendOne(val, returnState);
} else {
emitOrAppendTwo(val, returnState);
}
// this is so complicated!
// Characters buffered after the end of the matched name are not part of
// the reference; pass them on as literal text.
if (charRefBufMark < charRefBufLen) {
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
appendStrBuf(charRefBuf, charRefBufMark,
charRefBufLen - charRefBufMark);
} else {
tokenHandler.characters(charRefBuf, charRefBufMark,
charRefBufLen - charRefBufMark);
}
}
charRefBufLen = 0;
state = returnState;
continue eofloop;
/*
* If the markup contains I'm &notit; I tell you, the
* entity is parsed as "not", as in, I'm ¬it; I tell
* you. But if the markup was I'm &notin; I tell you,
* the entity would be parsed as "notin;", resulting in
* I'm ∉ I tell you.
*/
}
case CONSUME_NCR:
case DECIMAL_NRC_LOOP:
case HEX_NCR_LOOP:
/*
* If no characters match the range, then don't consume any
* characters (and unconsume the U+0023 NUMBER SIGN
* character and, if appropriate, the X character). This is
* a parse error; nothing is returned.
*
* Otherwise, if the next character is a U+003B SEMICOLON,
* consume that too. If it isn't, there is a parse error.
*/
if (!seenDigits) {
errNoDigitsInNCR();
emitOrAppendCharRefBuf(returnState);
state = returnState;
continue;
} else {
// EOF arrived before a terminating semicolon.
errCharRefLacksSemicolon();
}
// WARNING previous state sets reconsume
handleNcrValue(returnState);
state = returnState;
continue;
case CDATA_RSQB:
// One pending "]" that turned out not to start "]]>".
tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
break eofloop;
case CDATA_RSQB_RSQB:
// Two pending "]" characters.
tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
break eofloop;
case DATA:
default:
break eofloop;
}
}
// case DATA:
/*
* EOF Emit an end-of-file token.
*/
tokenHandler.eof();
return;
}
/**
 * Hands the current DOCTYPE token (name, public identifier, system
 * identifier and force-quirks flag) to the token handler and then clears
 * the DOCTYPE fields.
 *
 * @param pos buffer index of the character that ended the DOCTYPE; the
 *        next pending character run starts right after it
 * @throws SAXException if the token handler throws
 */
private void emitDoctypeToken(int pos) throws SAXException {
    cstart = pos + 1;
    tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier, forceQuirks);
    // Every path out of the DOCTYPE states goes through this method, so
    // releasing the identifier strings here is both safe and sufficient.
    Portability.releaseString(systemIdentifier);
    systemIdentifier = null;
    Portability.releaseString(publicIdentifier);
    publicIdentifier = null;
    doctypeName = null;
}
/**
 * Returns the character at {@code pos} in {@code buf}. This implementation
 * performs no checking; it is a plain array read.
 *
 * @param buf the input buffer
 * @param pos index of the character to read
 * @return {@code buf[pos]}
 * @throws SAXException not thrown by this implementation
 */
@Inline protected char checkChar(@NoLength char[] buf, int pos)
throws SAXException {
return buf[pos];
}
/**
 * Forwards an in-document encoding declaration to the registered
 * {@code encodingDeclarationHandler}.
 *
 * @param internalCharset the declared character encoding
 * @return whatever the handler returns, or {@code false} when no handler
 *         is set
 * @throws SAXException if the handler throws
 */
public boolean internalEncodingDeclaration(String internalCharset)
        throws SAXException {
    if (encodingDeclarationHandler == null) {
        // Nobody is listening for encoding declarations.
        return false;
    }
    return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
}
/**
 * Outputs the first two chars of {@code val}. When {@code returnState} has
 * any bit of {@code DATA_AND_RCDATA_MASK} set, they are appended to
 * {@code strBuf}; otherwise they are sent to the token handler as
 * character data.
 *
 * @param val array holding at least two chars
 * @param returnState the state the character reference returns to
 * @throws SAXException if the token handler throws
 */
private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
        throws SAXException {
    if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
        tokenHandler.characters(val, 0, 2);
        return;
    }
    appendStrBuf(val[0]);
    appendStrBuf(val[1]);
}
/**
 * Outputs the first char of {@code val}. When {@code returnState} has any
 * bit of {@code DATA_AND_RCDATA_MASK} set, it is appended to
 * {@code strBuf}; otherwise it is sent to the token handler as character
 * data.
 *
 * @param val array holding at least one char
 * @param returnState the state the character reference returns to
 * @throws SAXException if the token handler throws
 */
private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
        throws SAXException {
    if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
        tokenHandler.characters(val, 0, 1);
        return;
    }
    appendStrBuf(val[0]);
}
/**
 * Ends tokenization: drops the string buffer and any partially built
 * token state, notifies the token handler via {@code endTokenization()},
 * and finally lets go of the attribute holder.
 *
 * @throws SAXException if the token handler throws
 */
public void end() throws SAXException {
    strBuf = null;
    doctypeName = null;
    if (publicIdentifier != null) {
        Portability.releaseString(publicIdentifier);
        publicIdentifier = null;
    }
    if (systemIdentifier != null) {
        Portability.releaseString(systemIdentifier);
        systemIdentifier = null;
    }
    if (attributeName != null) {
        attributeName.release();
        attributeName = null;
    }
    if (tagName != null) {
        tagName.release();
        tagName = null;
    }
    tokenHandler.endTokenization();
    // The attribute holder is dealt with only after the handler has been
    // told that tokenization ended.
    if (attributes != null) {
        // [NOCPP[
        attributes = null;
        // ]NOCPP]
        // CPPONLY: attributes.clear(mappingLangToXmlLang);
    }
}
/**
 * Sets the {@code shouldSuspend} flag. NOTE(review): the flag is
 * presumably polled by the tokenization loop to stop early — confirm where
 * it is read.
 */
public void requestSuspension() {
shouldSuspend = true;
}
// [NOCPP[
/**
 * Sets the {@code confident} flag to {@code true} (Java build only).
 */
public void becomeConfident() {
confident = true;
}
/**
 * Tells whether the next character starts a new line. This implementation
 * does not track that and always answers {@code false}. NOTE(review):
 * presumably overridden by a location-tracking subclass — confirm.
 *
 * @return {@code false} in this implementation
 */
public boolean isNextCharOnNewLine() {
return false;
}
/**
 * Returns the {@code lastCR} flag, which {@link #silentCarriageReturn()}
 * sets when a carriage return is processed.
 *
 * @return the current value of {@code lastCR}
 */
public boolean isPrevCR() {
return lastCR;
}
/**
 * Returns the line number for location reporting. This implementation does
 * not expose one and always returns {@code -1}, even though the
 * {@code line} field is maintained internally. NOTE(review): presumably
 * overridden by a location-tracking subclass — confirm.
 *
 * @return {@code -1} in this implementation
 */
public int getLine() {
return -1;
}
/**
 * Returns the column number for location reporting. This implementation
 * does not track columns and always returns {@code -1}. NOTE(review):
 * presumably overridden by a location-tracking subclass — confirm.
 *
 * @return {@code -1} in this implementation
 */
public int getCol() {
return -1;
}
// ]NOCPP]
/**
 * Tells whether the saved tokenizer state is the data state.
 *
 * @return {@code true} if {@code stateSave} equals {@code Tokenizer.DATA}
 */
public boolean isInDataState() {
    return stateSave == Tokenizer.DATA;
}
/**
 * Puts the tokenizer back into the data state and clears everything that
 * belongs to a token or character reference in progress. The line counter
 * is deliberately left alone.
 */
public void resetToDataState() {
    // Buffers.
    clearStrBufAfterUse();
    charRefBufLen = 0;
    charRefBufMark = 0;

    // State machine.
    stateSave = Tokenizer.DATA;
    // line = 1; XXX line numbers
    lastCR = false;
    shouldSuspend = false;
    endTag = false;
    forceQuirks = false;
    index = 0;
    additional = '\u0000';

    // Numeric character reference bookkeeping.
    value = 0;
    seenDigits = false;

    // Named character reference lookup window.
    firstCharKey = -1;
    entCol = -1;
    candidate = -1;
    lo = 0;
    hi = 0; // will always be overwritten before use anyway

    initDoctypeFields();

    if (attributeName != null) {
        attributeName.release();
        attributeName = null;
    }
    if (tagName != null) {
        tagName.release();
        tagName = null;
    }
    // The attribute holder is discarded only in the mode where a new one
    // is created each time.
    if (newAttributesEachTime && attributes != null) {
        Portability.delete(attributes);
        attributes = null;
    }
}
/**
 * Copies the saved tokenizer state of {@code other} into this instance.
 * Buffer contents are copied, and the DOCTYPE strings, tag name, attribute
 * name and attribute holder are cloned rather than shared. The
 * {@code shouldSuspend} flag is not copied; it is reset to {@code false}.
 *
 * NOTE(review): {@code strBuf.length} is read unconditionally, so this
 * instance's {@code strBuf} must already be allocated when the method is
 * called from Java — confirm that callers guarantee this.
 *
 * @param other the tokenizer whose state is copied; not modified here
 * @throws SAXException declared for the cloning helpers; nothing in this
 *         body throws it directly
 */
public void loadState(Tokenizer other) throws SAXException {
strBufLen = other.strBufLen;
// Grow the local buffer only when the other tokenizer holds more
// characters than it can take.
if (strBufLen > strBuf.length) {
strBuf = new char[strBufLen];
}
System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
charRefBufLen = other.charRefBufLen;
System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);
stateSave = other.stateSave;
returnStateSave = other.returnStateSave;
endTagExpectation = other.endTagExpectation;
endTagExpectationAsArray = other.endTagExpectationAsArray;
// line = 1; XXX line numbers
lastCR = other.lastCR;
index = other.index;
forceQuirks = other.forceQuirks;
additional = other.additional;
entCol = other.entCol;
firstCharKey = other.firstCharKey;
lo = other.lo;
hi = other.hi;
candidate = other.candidate;
charRefBufMark = other.charRefBufMark;
value = other.value;
seenDigits = other.seenDigits;
endTag = other.endTag;
shouldSuspend = false;
if (other.doctypeName == null) {
doctypeName = null;
} else {
doctypeName = Portability.newLocalFromLocal(other.doctypeName,
interner);
}
// From here on: release the value this instance currently holds, then
// take a copy of the other tokenizer's value (or null).
Portability.releaseString(systemIdentifier);
if (other.systemIdentifier == null) {
systemIdentifier = null;
} else {
systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
}
Portability.releaseString(publicIdentifier);
if (other.publicIdentifier == null) {
publicIdentifier = null;
} else {
publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
}
if (tagName != null) {
tagName.release();
}
if (other.tagName == null) {
tagName = null;
} else {
tagName = other.tagName.cloneElementName(interner);
}
if (attributeName != null) {
attributeName.release();
}
if (other.attributeName == null) {
attributeName = null;
} else {
attributeName = other.attributeName.cloneAttributeName(interner);
}
Portability.delete(attributes);
if (other.attributes == null) {
attributes = null;
} else {
attributes = other.attributes.cloneAttributes(interner);
}
}
/**
 * Prepares the tokenizer for a new document without starting it: resets
 * the confidence flag, the string buffer and the line counter, refreshes
 * the Java-only settings, and finally calls {@link #resetToDataState()}.
 *
 * @throws SAXException if querying the token handler throws
 */
public void initializeWithoutStarting() throws SAXException {
    strBuf = null;
    confident = false;
    line = 1;
    // CPPONLY: attributeLine = 1;
    // [NOCPP[
    metaBoundaryPassed = false;
    html4 = false;
    wantsComments = tokenHandler.wantsComments();
    if (!newAttributesEachTime) {
        // One attribute holder is allocated up front when a new one is not
        // created each time.
        attributes = new HtmlAttributes(mappingLangToXmlLang);
    }
    // ]NOCPP]
    resetToDataState();
}
protected void errGarbageAfterLtSlash() throws SAXException {
}
protected void errLtSlashGt() throws SAXException {
}
protected void errWarnLtSlashInRcdata() throws SAXException {
}
protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
}
protected void errCharRefLacksSemicolon() throws SAXException {
}
protected void errNoDigitsInNCR() throws SAXException {
}
protected void errGtInSystemId() throws SAXException {
}
protected void errGtInPublicId() throws SAXException {
}
protected void errNamelessDoctype() throws SAXException {
}
protected void errConsecutiveHyphens() throws SAXException {
}
protected void errPrematureEndOfComment() throws SAXException {
}
protected void errBogusComment() throws SAXException {
}
protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
}
protected void errSlashNotFollowedByGt() throws SAXException {
}
protected void errHtml4XmlVoidSyntax() throws SAXException {
}
protected void errNoSpaceBetweenAttributes() throws SAXException {
}
protected void errHtml4NonNameInUnquotedAttribute(char c)
throws SAXException {
}
protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
throws SAXException {
}
protected void errAttributeValueMissing() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a bad character, or NUL,
* seen before an attribute name - confirm at the call site, which is
* outside this excerpt.
*
* @param c
*            ignored by this empty body; presumably the offending character
*            - TODO confirm
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errBadCharBeforeAttributeNameOrNull(char c)
throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting an equals sign that appears
* before an attribute name - confirm at the call site, which is outside
* this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEqualsSignBeforeAttributeName() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a bad character directly
* after {@code <} - confirm at the call site, which is outside this
* excerpt.
*
* @param c
*            ignored by this empty body; presumably the offending character
*            - TODO confirm
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errBadCharAfterLt(char c) throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting the sequence {@code <>} -
* confirm at the call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errLtGt() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting processing-instruction
* syntax in the input - confirm at the call site, which is outside this
* excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errProcessingInstruction() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting an unescaped ampersand that
* ended up being interpreted as a character reference - confirm at the call
* site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errUnescapedAmpersandInterpretedAsCharacterReference()
throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting something (presumably a
* character reference) that is not terminated by a semicolon - confirm at
* the call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNotSemicolonTerminated() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting that no named character
* reference matched - confirm at the call site, which is outside this
* excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNoNamedCharacterMatch() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a quote character that
* appears before an attribute name - confirm at the call site, which is
* outside this excerpt.
*
* @param c
*            ignored by this empty body; presumably the quote character
*            seen - TODO confirm
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errQuoteBeforeAttributeName(char c) throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a quote, {@code <}, or NUL
* inside an attribute name - confirm at the call site, which is outside
* this excerpt.
*
* @param c
*            ignored by this empty body; presumably the offending character
*            - TODO confirm
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errQuoteOrLtInAttributeNameOrNull(char c)
throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting that a public identifier was
* expected but not found - confirm at the call site, which is outside this
* excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errExpectedPublicId() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a bogus doctype - confirm at
* the call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errBogusDoctype() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for optionally warning about an astral
* private-use character - confirm at the call site, which is outside this
* excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void maybeWarnPrivateUseAstral() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for optionally warning about a private-use
* character - confirm at the call site, which is outside this excerpt.
*
* @param ch
*            ignored by this empty body; presumably the character to be
*            checked - TODO confirm
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void maybeWarnPrivateUse(char ch) throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting attributes on an end tag
* when there are any - confirm at the call site, which is outside this
* excerpt.
*
* @param attrs
*            ignored by this empty body; presumably the attributes
*            collected for the end tag - TODO confirm
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a self-closing slash on an
* end tag when one is present - confirm at the call site, which is outside
* this excerpt.
*
* @param selfClosing
*            ignored by this empty body; presumably whether the end tag
*            carried a trailing slash - TODO confirm
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void maybeErrSlashInEndTag(boolean selfClosing)
throws SAXException {
}
/**
* Hook that, in this class, reports nothing and hands its argument straight
* back; a subclass may override it. Going by the name, it is meant for a
* numeric character reference that expands to a non-character - confirm at
* the call site, which is outside this excerpt.
*
* @param ch
*            the character to pass through
* @return {@code ch}, unchanged
* @throws SAXException
*             never from this body; the clause lets an override throw
*/
protected char errNcrNonCharacter(char ch) throws SAXException {
return ch;
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting an astral non-character -
* confirm at the call site, which is outside this excerpt.
*
* @param ch
*            ignored by this empty body; presumably the code point in
*            question - TODO confirm
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errAstralNonCharacter(int ch) throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a numeric character
* reference that expands to a surrogate - confirm at the call site, which
* is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNcrSurrogate() throws SAXException {
}
/**
* Hook that, in this class, reports nothing and hands its argument straight
* back; a subclass may override it. Going by the name, it is meant for a
* numeric character reference that expands to a control character - confirm
* at the call site, which is outside this excerpt.
*
* @param ch
*            the character to pass through
* @return {@code ch}, unchanged
* @throws SAXException
*             never from this body; the clause lets an override throw
*/
protected char errNcrControlChar(char ch) throws SAXException {
return ch;
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a numeric character
* reference that expands to carriage return - confirm at the call site,
* which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNcrCr() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a numeric character
* reference whose value falls in the C1 controls range - confirm at the
* call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNcrInC1Range() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting end of input inside a
* doctype public identifier - confirm at the call site, which is outside
* this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEofInPublicId() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting end of input inside a
* comment - confirm at the call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEofInComment() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting end of input inside a
* doctype - confirm at the call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEofInDoctype() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting end of input inside an
* attribute value - confirm at the call site, which is outside this
* excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEofInAttributeValue() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting end of input inside an
* attribute name - confirm at the call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEofInAttributeName() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting end of input reached without
* a closing {@code >} - confirm at the call site, which is outside this
* excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEofWithoutGt() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting end of input inside a tag
* name - confirm at the call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEofInTagName() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting end of input inside an end
* tag - confirm at the call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEofInEndTag() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting end of input directly after
* {@code <} - confirm at the call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEofAfterLt() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a numeric character
* reference whose value is out of range - confirm at the call site, which
* is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNcrOutOfRange() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a numeric character
* reference that expands to an unassigned code point - confirm at the call
* site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNcrUnassigned() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a duplicate attribute -
* confirm at the call site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errDuplicateAttribute() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting end of input inside a
* doctype system identifier - confirm at the call site, which is outside
* this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errEofInSystemId() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting that a system identifier was
* expected but not found - confirm at the call site, which is outside this
* excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errExpectedSystemId() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting missing whitespace before
* the doctype name - confirm at the call site, which is outside this
* excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting the sequence {@code --!}
* (presumably inside a comment) - confirm at the call site, which is
* outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errHyphenHyphenBang() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* This is the no-argument overload; a separate overload taking and
* returning a {@code char} also exists in this class. Going by the name, it
* is meant for reporting a numeric character reference that expands to a
* control character - confirm at the call site, which is outside this
* excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNcrControlChar() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting a numeric character
* reference whose value is zero - confirm at the call site, which is
* outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNcrZero() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting missing whitespace between
* the doctype system keyword and the following quote - confirm at the call
* site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting missing whitespace between
* the public and system identifiers of a doctype - confirm at the call
* site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for reporting missing whitespace between
* the doctype public keyword and the following quote - confirm at the call
* site, which is outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for emitting a note (rather than an error)
* about an attribute that has no value - confirm at the call site, which is
* outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void noteAttributeWithoutValue() throws SAXException {
}
/**
* Empty hook: does nothing in this class; a subclass may override it.
* Going by the name, it is meant for emitting a note (rather than an error)
* about an unquoted attribute value - confirm at the call site, which is
* outside this excerpt.
*
* @throws SAXException
*             never from this empty body; the clause lets an override throw
*/
protected void noteUnquotedAttributeValue() throws SAXException {
}
/**
* Stores the given handler in this tokenizer's
* <code>encodingDeclarationHandler</code> field, replacing whatever was
* there before. The argument is assigned as-is; no null check is made
* here.
*
* @param encodingDeclarationHandler
* the encodingDeclarationHandler to set
*/
public void setEncodingDeclarationHandler(
EncodingDeclarationHandler encodingDeclarationHandler) {
this.encodingDeclarationHandler = encodingDeclarationHandler;
}
/**
* Tears down the attribute holder: hands <code>attributes</code> to
* <code>Portability.delete</code> and then clears the field.
*/
void destructor() {
// The translator will write refcount tracing stuff here
Portability.delete(attributes);
// Clear the field so this object no longer refers to the deleted holder.
attributes = null;
}
// [NOCPP[
/**
* Sets an offset to be added to the position reported to
* <code>TransitionHandler</code>.
* <p>
* In this class the body is empty, so the argument is ignored and nothing
* is stored. NOTE(review): presumably a subclass overrides this to make
* use of the offset - confirm.
*
* @param offset the offset
*/
public void setTransitionBaseOffset(int offset) {
}
// ]NOCPP]
}