From e5ac89f389d9592504691066455f3e3426742cd2 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Wed, 19 Oct 2016 10:38:20 +0300 Subject: [PATCH] Bug 1309195 - Mark strBuf as empty after the contents have been used or ignored. r=wchen. MozReview-Commit-ID: 5KU62o2IMPs --- parser/html/javasrc/Tokenizer.java | 413 ++++++++++++++++------------- parser/html/nsHtml5Tokenizer.cpp | 149 +++++++---- parser/html/nsHtml5Tokenizer.h | 48 ++-- 3 files changed, 354 insertions(+), 256 deletions(-) diff --git a/parser/html/javasrc/Tokenizer.java b/parser/html/javasrc/Tokenizer.java index 9754196b068a..d9eaafeb3e14 100644 --- a/parser/html/javasrc/Tokenizer.java +++ b/parser/html/javasrc/Tokenizer.java @@ -1,40 +1,45 @@ /* * Copyright (c) 2005-2007 Henri Sivonen * Copyright (c) 2007-2015 Mozilla Foundation - * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla + * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla * Foundation, and Opera Software ASA. * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in + * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ /* - * The comments following this one that use the same comment syntax as this - * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 + * The comments following this one that use the same comment syntax as this + * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 * amended as of June 18 2008 and May 31 2010. * That document came with this statement: - * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and - * Opera Software ASA. You are granted a license to use, reproduce and + * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and + * Opera Software ASA. You are granted a license to use, reproduce and * create derivative works of this document." */ package nu.validator.htmlparser.impl; +import org.xml.sax.ErrorHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; + import nu.validator.htmlparser.annotation.Auto; import nu.validator.htmlparser.annotation.CharacterName; import nu.validator.htmlparser.annotation.Const; @@ -46,23 +51,18 @@ import nu.validator.htmlparser.common.Interner; import nu.validator.htmlparser.common.TokenHandler; import nu.validator.htmlparser.common.XmlViolationPolicy; -import org.xml.sax.ErrorHandler; -import org.xml.sax.Locator; -import org.xml.sax.SAXException; -import org.xml.sax.SAXParseException; - /** * An implementation of * https://html.spec.whatwg.org/multipage/syntax.html#tokenization - * + * * This class implements the Locator interface. This is not an * incidental implementation detail: Users of this class are encouraged to make * use of the Locator nature. - * + * * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer * can be configured to treat these conditions as fatal or to coerce the infoset * to something that XML 1.0 allows. - * + * * @version $Id$ * @author hsivonen */ @@ -529,12 +529,12 @@ public class Tokenizer implements Locator { /** * The constructor. - * + * * @param tokenHandler * the handler for receiving tokens */ public Tokenizer(TokenHandler tokenHandler - // CPPONLY: , boolean viewingXmlSource + // CPPONLY: , boolean viewingXmlSource ) { this.tokenHandler = tokenHandler; this.encodingDeclarationHandler = null; @@ -577,7 +577,7 @@ public class Tokenizer implements Locator { /** * Returns the mappingLangToXmlLang. - * + * * @return the mappingLangToXmlLang */ public boolean isMappingLangToXmlLang() { @@ -586,7 +586,7 @@ public class Tokenizer implements Locator { /** * Sets the mappingLangToXmlLang. - * + * * @param mappingLangToXmlLang * the mappingLangToXmlLang to set */ @@ -597,7 +597,7 @@ public class Tokenizer implements Locator { /** * Sets the error handler. - * + * * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) */ public void setErrorHandler(ErrorHandler eh) { @@ -610,7 +610,7 @@ public class Tokenizer implements Locator { /** * Sets the commentPolicy. - * + * * @param commentPolicy * the commentPolicy to set */ @@ -620,7 +620,7 @@ public class Tokenizer implements Locator { /** * Sets the contentNonXmlCharPolicy. - * + * * @param contentNonXmlCharPolicy * the contentNonXmlCharPolicy to set */ @@ -634,7 +634,7 @@ public class Tokenizer implements Locator { /** * Sets the contentSpacePolicy. - * + * * @param contentSpacePolicy * the contentSpacePolicy to set */ @@ -644,7 +644,7 @@ public class Tokenizer implements Locator { /** * Sets the xmlnsPolicy. - * + * * @param xmlnsPolicy * the xmlnsPolicy to set */ @@ -661,7 +661,7 @@ public class Tokenizer implements Locator { /** * Sets the html4ModeCompatibleWithXhtml1Schemata. - * + * * @param html4ModeCompatibleWithXhtml1Schemata * the html4ModeCompatibleWithXhtml1Schemata to set */ @@ -674,10 +674,10 @@ public class Tokenizer implements Locator { // For the token handler to call /** - * Sets the tokenizer state and the associated element name. This should + * Sets the tokenizer state and the associated element name. This should * only ever used to put the tokenizer into one of the states that have * a special end tag expectation. - * + * * @param specialTokenizerState * the tokenizer state to set * @param endTagExpectation @@ -696,10 +696,10 @@ public class Tokenizer implements Locator { } /** - * Sets the tokenizer state and the associated element name. This should + * Sets the tokenizer state and the associated element name. This should * only ever used to put the tokenizer into one of the states that have * a special end tag expectation. - * + * * @param specialTokenizerState * the tokenizer state to set * @param endTagExpectation @@ -822,33 +822,35 @@ public class Tokenizer implements Locator { charRefBuf[charRefBufLen++] = c; } - @Inline private void clearCharRefBufAndAppend(char c) { - charRefBuf[0] = c; - charRefBufLen = 1; - } - private void emitOrAppendCharRefBuf(int returnState) throws SAXException { if ((returnState & DATA_AND_RCDATA_MASK) != 0) { appendCharRefBufToStrBuf(); } else { if (charRefBufLen > 0) { tokenHandler.characters(charRefBuf, 0, charRefBufLen); + charRefBufLen = 0; } } } - @Inline private void clearStrBufAndAppend(char c) { - strBuf[0] = c; - strBufLen = 1; + @Inline private void clearStrBufAfterUse() { + strBufLen = 0; } - @Inline private void clearStrBuf() { + @Inline private void clearStrBufBeforeUse() { + assert strBufLen == 0: "strBufLen not reset after previous use!"; + strBufLen = 0; // no-op in the absence of bugs + } + + @Inline private void clearStrBufAfterOneHyphen() { + assert strBufLen == 1: "strBufLen length not one!"; + assert strBuf[0] == '-': "strBuf does not start with a hyphen!"; strBufLen = 0; } /** * Appends to the buffer. - * + * * @param c * the UTF-16 code unit to append */ @@ -864,38 +866,42 @@ public class Tokenizer implements Locator { /** * The buffer as a String. Currently only used for error reporting. - * + * *

* C++ memory note: The return value must be released. - * + * * @return the buffer as a string */ protected String strBufToString() { - return Portability.newStringFromBuffer(strBuf, 0, strBufLen + String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen // CPPONLY: , tokenHandler ); + clearStrBufAfterUse(); + return str; } /** * Returns the buffer as a local name. The return value is released in * emitDoctypeToken(). - * + * * @return the buffer as local name */ private void strBufToDoctypeName() { doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen, interner); + clearStrBufAfterUse(); } /** * Emits the buffer as character tokens. - * + * * @throws SAXException * if the token handler threw */ private void emitStrBuf() throws SAXException { if (strBufLen > 0) { tokenHandler.characters(strBuf, 0, strBufLen); + clearStrBufAfterUse(); } } @@ -942,6 +948,8 @@ public class Tokenizer implements Locator { switch (commentPolicy) { case ALTER_INFOSET: strBufLen--; + // WARNING!!! This expands the worst case of the buffer length + // given the length of input! appendStrBuf(' '); appendStrBuf('-'); // FALLTHROUGH @@ -975,14 +983,15 @@ public class Tokenizer implements Locator { */ @Inline private void appendCharRefBufToStrBuf() { appendStrBuf(charRefBuf, 0, charRefBufLen); + charRefBufLen = 0; } /** * Emits the current comment token. - * + * * @param pos * TODO - * + * * @throws SAXException */ private void emitComment(int provisionalHyphens, int pos) @@ -995,17 +1004,18 @@ public class Tokenizer implements Locator { // [NOCPP[ } // ]NOCPP] + clearStrBufAfterUse(); cstart = pos + 1; } /** * Flushes coalesced character tokens. - * + * * @param buf * TODO * @param pos * TODO - * + * * @throws SAXException */ protected void flushChars(@NoLength char[] buf, int pos) @@ -1019,7 +1029,7 @@ public class Tokenizer implements Locator { /** * Reports an condition that would make the infoset incompatible with XML * 1.0 as fatal. - * + * * @param message * the message * @throws SAXException @@ -1035,7 +1045,7 @@ public class Tokenizer implements Locator { /** * Reports a Parse Error. - * + * * @param message * the message * @throws SAXException @@ -1066,7 +1076,7 @@ public class Tokenizer implements Locator { /** * Reports a warning - * + * * @param message * the message * @throws SAXException @@ -1082,6 +1092,7 @@ public class Tokenizer implements Locator { private void strBufToElementNameString() { tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen, interner); + clearStrBufAfterUse(); } private int emitCurrentTagToken(boolean selfClosing, int pos) @@ -1133,6 +1144,7 @@ public class Tokenizer implements Locator { , namePolicy != XmlViolationPolicy.ALLOW // ]NOCPP] , interner); + clearStrBufAfterUse(); if (attributes == null) { attributes = new HtmlAttributes(mappingLangToXmlLang); @@ -1199,6 +1211,8 @@ public class Tokenizer implements Locator { // ]NOCPP] attributeName = null; // attributeName has been adopted by the // |attributes| object + } else { + clearStrBufAfterUse(); } } @@ -1229,6 +1243,9 @@ public class Tokenizer implements Locator { ); attributeName = null; // attributeName has been adopted by the // |attributes| object + } else { + // We have a duplicate attribute. Explicitly discard its value. + clearStrBufAfterUse(); } } @@ -1254,7 +1271,7 @@ public class Tokenizer implements Locator { } // ]NOCPP] - + public void start() throws SAXException { initializeWithoutStarting(); tokenHandler.startTokenization(this); @@ -1347,6 +1364,14 @@ public class Tokenizer implements Locator { // unifying the tokenizer and tree builder buffers in the future. int worstCase = strBufLen + inputLength + charRefBufLen + 2; tokenHandler.ensureBufferSpace(worstCase); + if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) { + // When altering infoset, if the comment contents are consecutive + // hyphens, each hyphen generates a space, too. These buffer + // contents never get emitted as characters() to the tokenHandler, + // which is why this calculation happens after the call to + // ensureBufferSpace on tokenHandler. + worstCase *= 2; + } if (strBuf == null) { // Add an arbitrary small value to avoid immediate reallocation // once there are a few characters in the buffer. @@ -1370,66 +1395,66 @@ public class Tokenizer implements Locator { int endPos) throws SAXException { /* * Idioms used in this code: - * - * + * + * * Consuming the next input character - * + * * To consume the next input character, the code does this: if (++pos == * endPos) { break stateloop; } c = checkChar(buf, pos); - * - * + * + * * Staying in a state - * + * * When there's a state that the tokenizer may stay in over multiple * input characters, the state has a wrapper |for(;;)| loop and staying * in the state continues the loop. - * - * + * + * * Switching to another state - * + * * To switch to another state, the code sets the state variable to the * magic number of the new state. Then it either continues stateloop or * breaks out of the state's own wrapper loop if the target state is * right after the current state in source order. (This is a partial * workaround for Java's lack of goto.) - * - * + * + * * Reconsume support - * + * * The spec sometimes says that an input character is reconsumed in * another state. If a state can ever be entered so that an input * character can be reconsumed in it, the state's code starts with an * |if (reconsume)| that sets reconsume to false and skips over the * normal code for consuming a new character. - * + * * To reconsume the current character in another state, the code sets * |reconsume| to true and then switches to the other state. - * - * + * + * * Emitting character tokens - * + * * This method emits character tokens lazily. Whenever a new range of * character tokens starts, the field cstart must be set to the start * index of the range. The flushChars() method must be called at the end * of a range to flush it. - * - * + * + * * U+0000 handling - * + * * The various states have to handle the replacement of U+0000 with * U+FFFD. However, if U+0000 would be reconsumed in another state, the * replacement doesn't need to happen, because it's handled by the * reconsuming state. - * - * + * + * * LF handling - * + * * Every state needs to increment the line number upon LF unless the LF * gets reconsumed by another state which increments the line number. - * - * + * + * * CR handling - * + * * Every state needs to handle CR unless the CR gets reconsumed and is * handled by the reconsuming state. The CR needs to be handled as if it * were and LF, the lastCR field must be set to true and then this @@ -1455,7 +1480,8 @@ public class Tokenizer implements Locator { * reference in data state. */ flushChars(buf, pos); - clearCharRefBufAndAppend(c); + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); setAdditionalAndRememberAmpersandLocation('\u0000'); returnState = state; state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); @@ -1482,7 +1508,7 @@ public class Tokenizer implements Locator { /* * Anything else Emit the input character as a * character token. - * + * * Stay in the data state. */ continue; @@ -1515,7 +1541,8 @@ public class Tokenizer implements Locator { * input character (add 0x0020 to the character's * code point), */ - clearStrBufAndAppend((char) (c + 0x20)); + clearStrBufBeforeUse(); + appendStrBuf((char) (c + 0x20)); /* then switch to the tag name state. */ state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); /* @@ -1534,7 +1561,8 @@ public class Tokenizer implements Locator { /* * set its tag name to the input character, */ - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); /* then switch to the tag name state. */ state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); /* @@ -1574,7 +1602,8 @@ public class Tokenizer implements Locator { /* * Switch to the bogus comment state. */ - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); continue stateloop; case '>': @@ -1777,7 +1806,8 @@ public class Tokenizer implements Locator { * Set that attribute's name to the current * input character, */ - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); /* * and its value to the empty string. */ @@ -1923,7 +1953,7 @@ public class Tokenizer implements Locator { * attribute value (double-quoted) state. */ // CPPONLY: attributeLine = line; - clearStrBuf(); + clearStrBufBeforeUse(); state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); break beforeattributevalueloop; // continue stateloop; @@ -1934,7 +1964,7 @@ public class Tokenizer implements Locator { * input character. */ // CPPONLY: attributeLine = line; - clearStrBuf(); + clearStrBufBeforeUse(); reconsume = true; state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); noteUnquotedAttributeValue(); @@ -1945,7 +1975,7 @@ public class Tokenizer implements Locator { * value (single-quoted) state. */ // CPPONLY: attributeLine = line; - clearStrBuf(); + clearStrBufBeforeUse(); state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); continue stateloop; case '>': @@ -1989,7 +2019,8 @@ public class Tokenizer implements Locator { * character to the current attribute's value. */ // CPPONLY: attributeLine = line; - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); /* * Switch to the attribute value (unquoted) * state. @@ -2032,7 +2063,8 @@ public class Tokenizer implements Locator { * additional allowed character being U+0022 * QUOTATION MARK ("). */ - clearCharRefBufAndAppend(c); + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); setAdditionalAndRememberAmpersandLocation('\"'); returnState = state; state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); @@ -2201,7 +2233,8 @@ public class Tokenizer implements Locator { * additional allowed character being U+003E * GREATER-THAN SIGN (>) */ - clearCharRefBufAndAppend(c); + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); setAdditionalAndRememberAmpersandLocation('>'); returnState = state; state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); @@ -2340,7 +2373,8 @@ public class Tokenizer implements Locator { * Set that attribute's name to the current * input character, */ - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); /* * and its value to the empty string. */ @@ -2364,12 +2398,12 @@ public class Tokenizer implements Locator { * HYPHEN-MINUS characters (-), consume those two * characters, create a comment token whose data is the * empty string, and switch to the comment start state. - * + * * Otherwise, if the next seven characters are an ASCII * case-insensitive match for the word "DOCTYPE", then * consume those characters and switch to the DOCTYPE * state. - * + * * Otherwise, if the insertion mode is * "in foreign content" and the current node is not an * element in the HTML namespace and the next seven @@ -2378,7 +2412,7 @@ public class Tokenizer implements Locator { * U+005B LEFT SQUARE BRACKET character before and * after), then consume those characters and switch to * the CDATA section state. - * + * * Otherwise, is is a parse error. Switch to the bogus * comment state. The next character that is consumed, * if any, is the first character that will be in the @@ -2386,19 +2420,22 @@ public class Tokenizer implements Locator { */ switch (c) { case '-': - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); break markupdeclarationopenloop; // continue stateloop; case 'd': case 'D': - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); index = 0; state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); continue stateloop; case '[': if (tokenHandler.cdataSectionAllowed()) { - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); index = 0; state = transition(state, Tokenizer.CDATA_START, reconsume, pos); continue stateloop; @@ -2406,7 +2443,7 @@ public class Tokenizer implements Locator { // else fall through default: errBogusComment(); - clearStrBuf(); + clearStrBufBeforeUse(); reconsume = true; state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); continue stateloop; @@ -2423,7 +2460,7 @@ public class Tokenizer implements Locator { case '\u0000': break stateloop; case '-': - clearStrBuf(); + clearStrBufAfterOneHyphen(); state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); break markupdeclarationhyphenloop; // continue stateloop; @@ -2443,8 +2480,8 @@ public class Tokenizer implements Locator { c = checkChar(buf, pos); /* * Comment start state - * - * + * + * * Consume the next input character: */ switch (c) { @@ -2653,7 +2690,7 @@ public class Tokenizer implements Locator { c = checkChar(buf, pos); /* * Comment end bang state - * + * * Consume the next input character: */ switch (c) { @@ -2713,7 +2750,7 @@ public class Tokenizer implements Locator { c = checkChar(buf, pos); /* * Comment start dash state - * + * * Consume the next input character: */ switch (c) { @@ -2777,6 +2814,7 @@ public class Tokenizer implements Locator { index++; continue; } else { + clearStrBufAfterUse(); cstart = pos; // start coalescing reconsume = true; state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); @@ -2841,11 +2879,11 @@ public class Tokenizer implements Locator { c = checkChar(buf, pos); switch (c) { case ']': - // Saw a third ]. Emit one ] (logically the - // first one) and stay in this state to + // Saw a third ]. Emit one ] (logically the + // first one) and stay in this state to // remember that the last two characters seen // have been ]]. - tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); continue; case '>': cstart = pos + 1; @@ -2890,7 +2928,8 @@ public class Tokenizer implements Locator { * + additional allowed character being U+0027 * APOSTROPHE ('). */ - clearCharRefBufAndAppend(c); + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); setAdditionalAndRememberAmpersandLocation('\''); returnState = state; state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); @@ -2939,7 +2978,7 @@ public class Tokenizer implements Locator { * This section defines how to consume a character * reference. This definition is used when parsing character * references in text and in attributes. - * + * * The behavior depends on the identity of the next * character (the one immediately after the U+0026 AMPERSAND * character): @@ -3010,7 +3049,7 @@ public class Tokenizer implements Locator { } /* * The data structure is as follows: - * + * * HILO_ACCEL is a two-dimensional int array whose major * index corresponds to the second character of the * character reference (code point as index) and the @@ -3021,7 +3060,7 @@ public class Tokenizer implements Locator { * to omit parts of it: The second dimension of the * table is null when no character reference starts with * the character corresponding to that row. - * + * * The int value HILO_ACCEL (by these indeces) is zero * if there exists no character reference starting with * that two-letter prefix. Otherwise, the value is an @@ -3033,14 +3072,14 @@ public class Tokenizer implements Locator { * first two character reference names share their * prefix so the packed int cannot be 0 by packing the * two shorts.) - * + * * NAMES is an array of byte arrays where each byte * array encodes the name of a character references as * ASCII. The names omit the first two letters of the * name. (Since storing the first two letters would be * redundant with the data contained in HILO_ACCEL.) The * entries are lexically sorted. - * + * * For a given index in NAMES, the same index in VALUES * contains the corresponding expansion as an array of * two UTF-16 code units (either the character and @@ -3128,18 +3167,18 @@ public class Tokenizer implements Locator { } if (c == ';') { - // If we see a semicolon, there cannot be a + // If we see a semicolon, there cannot be a // longer match. Break the loop. However, before - // breaking, take the longest match so far as the - // candidate, if we are just about to complete a + // breaking, take the longest match so far as the + // candidate, if we are just about to complete a // match. if (entCol + 1 == NamedCharacters.NAMES[lo].length()) { candidate = lo; charRefBufMark = charRefBufLen; - } + } break outer; } - + if (hi < lo) { break outer; } @@ -3238,13 +3277,16 @@ public class Tokenizer implements Locator { charRefBufLen - charRefBufMark); } } + // charRefBufLen will be zeroed below! + // Check if we broke out early with c being the last // character that matched as opposed to being the - // first one that didn't match. In the case of an + // first one that didn't match. In the case of an // early break, the next run on text should start - // *after* the current character and the current + // *after* the current character and the current // character shouldn't be reconsumed. boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen); + charRefBufLen = 0; if ((returnState & DATA_AND_RCDATA_MASK) == 0) { cstart = earlyBreak ? pos + 1 : pos; } @@ -3278,14 +3320,14 @@ public class Tokenizer implements Locator { /* * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL * LETTER X Consume the X. - * + * * Follow the steps below, but using the range of * characters U+0030 DIGIT ZERO through to U+0039 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL * LETTER F (in other words, 0-9, A-F, a-f). - * + * * When it comes to interpreting the number, * interpret it as a hexadecimal number. */ @@ -3297,7 +3339,7 @@ public class Tokenizer implements Locator { * Anything else Follow the steps below, but using * the range of characters U+0030 DIGIT ZERO through * to U+0039 DIGIT NINE (i.e. just 0-9). - * + * * When it comes to interpreting the number, * interpret it as a decimal number. */ @@ -3354,7 +3396,7 @@ public class Tokenizer implements Locator { * NUMBER SIGN character and, if appropriate, the X * character). This is a parse error; nothing is * returned. - * + * * Otherwise, if the next character is a U+003B * SEMICOLON, consume that too. If it isn't, there * is a parse error. @@ -3383,6 +3425,8 @@ public class Tokenizer implements Locator { // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER case HANDLE_NCR_VALUE: // WARNING previous state sets reconsume + // We are not going to emit the contents of charRefBuf. + charRefBufLen = 0; // XXX inline this case if the method size can take it handleNcrValue(returnState); state = transition(state, returnState, reconsume, pos); @@ -3447,7 +3491,7 @@ public class Tokenizer implements Locator { * NUMBER SIGN character and, if appropriate, the X * character). This is a parse error; nothing is * returned. - * + * * Otherwise, if the next character is a U+003B * SEMICOLON, consume that too. If it isn't, there * is a parse error. @@ -3529,7 +3573,8 @@ public class Tokenizer implements Locator { /* * Switch to the bogus comment state. */ - clearStrBufAndAppend('\n'); + clearStrBufBeforeUse(); + appendStrBuf('\n'); state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); break stateloop; case '\n': @@ -3539,7 +3584,8 @@ public class Tokenizer implements Locator { /* * Switch to the bogus comment state. */ - clearStrBufAndAppend('\n'); + clearStrBufBeforeUse(); + appendStrBuf(c); state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); continue stateloop; case '\u0000': @@ -3559,7 +3605,8 @@ public class Tokenizer implements Locator { /* * set its tag name to the input character, */ - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); /* * then switch to the tag name state. (Don't * emit the token yet; further details will be @@ -3573,7 +3620,8 @@ public class Tokenizer implements Locator { /* * Switch to the bogus comment state. */ - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); continue stateloop; } @@ -3596,7 +3644,8 @@ public class Tokenizer implements Locator { * reference in RCDATA state. */ flushChars(buf, pos); - clearCharRefBufAndAppend(c); + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); setAdditionalAndRememberAmpersandLocation('\u0000'); returnState = state; state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); @@ -3681,7 +3730,7 @@ public class Tokenizer implements Locator { * data end tag open state. */ index = 0; - clearStrBuf(); + clearStrBufBeforeUse(); state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); break rawtextrcdatalessthansignloop; // FALL THRU continue stateloop; @@ -3710,7 +3759,7 @@ public class Tokenizer implements Locator { c = checkChar(buf, pos); /* * ASSERT! when entering this state, set index to 0 and - * call clearStrBuf() assert (contentModelElement != + * call clearStrBufBeforeUse() assert (contentModelElement != * null); Let's implement the above without lookahead. * strBuf is the 'temporary buffer'. */ @@ -3743,6 +3792,7 @@ public class Tokenizer implements Locator { switch (c) { case '\r': silentCarriageReturn(); + clearStrBufAfterUse(); // strBuf not used state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); break stateloop; case '\n': @@ -3758,6 +3808,7 @@ public class Tokenizer implements Locator { * appropriate end tag token, then switch to * the before attribute name state. */ + clearStrBufAfterUse(); // strBuf not used state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); continue stateloop; case '/': @@ -3767,6 +3818,7 @@ public class Tokenizer implements Locator { * then switch to the self-closing start tag * state. */ + clearStrBufAfterUse(); // strBuf not used state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); continue stateloop; case '>': @@ -3776,6 +3828,7 @@ public class Tokenizer implements Locator { * end tag token, then emit the current tag * token and switch to the data state. */ + clearStrBufAfterUse(); // strBuf not used state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); if (shouldSuspend) { break stateloop; @@ -3832,9 +3885,9 @@ public class Tokenizer implements Locator { * character (i.e. up to the character just before the * U+003E or EOF character). (If the comment was started * by the end of the file (EOF), the token is empty.) - * + * * Switch to the data state. - * + * * If the end of the file was reached, reconsume the EOF * character. */ @@ -3950,7 +4003,7 @@ public class Tokenizer implements Locator { * data end tag open state. */ index = 0; - clearStrBuf(); + clearStrBufBeforeUse(); state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); continue stateloop; case '!': @@ -4208,7 +4261,7 @@ public class Tokenizer implements Locator { * data escaped end tag open state. */ index = 0; - clearStrBuf(); + clearStrBufBeforeUse(); returnState = Tokenizer.SCRIPT_DATA_ESCAPED; state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); continue stateloop; @@ -4690,7 +4743,8 @@ public class Tokenizer implements Locator { * Set the token's name name to the current * input character. */ - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); /* * Switch to the DOCTYPE name state. */ @@ -4911,7 +4965,7 @@ public class Tokenizer implements Locator { * Set the DOCTYPE token's public identifier to * the empty string (not missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE public identifier * (double-quoted) state. @@ -4927,7 +4981,7 @@ public class Tokenizer implements Locator { * Set the DOCTYPE token's public identifier to * the empty string (not missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE public identifier * (single-quoted) state. @@ -4998,7 +5052,7 @@ public class Tokenizer implements Locator { * token's public identifier to the empty string * (not missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE public identifier * (double-quoted) state. @@ -5012,7 +5066,7 @@ public class Tokenizer implements Locator { * public identifier to the empty string (not * missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE public identifier * (single-quoted) state. @@ -5163,7 +5217,7 @@ public class Tokenizer implements Locator { * Set the DOCTYPE token's system identifier to * the empty string (not missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE system identifier * (double-quoted) state. @@ -5179,7 +5233,7 @@ public class Tokenizer implements Locator { * Set the DOCTYPE token's system identifier to * the empty string (not missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE system identifier * (single-quoted) state. @@ -5244,7 +5298,7 @@ public class Tokenizer implements Locator { * token's system identifier to the empty string * (not missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE system identifier * (double-quoted) state. @@ -5258,7 +5312,7 @@ public class Tokenizer implements Locator { * system identifier to the empty string (not * missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE system identifier * (single-quoted) state. @@ -5507,7 +5561,7 @@ public class Tokenizer implements Locator { * Set the DOCTYPE token's system identifier to * the empty string (not missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE public identifier * (double-quoted) state. @@ -5523,7 +5577,7 @@ public class Tokenizer implements Locator { * Set the DOCTYPE token's public identifier to * the empty string (not missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE public identifier * (single-quoted) state. @@ -5594,7 +5648,7 @@ public class Tokenizer implements Locator { * token's system identifier to the empty string * (not missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE system identifier * (double-quoted) state. @@ -5607,7 +5661,7 @@ public class Tokenizer implements Locator { * system identifier to the empty string (not * missing), */ - clearStrBuf(); + clearStrBufBeforeUse(); /* * then switch to the DOCTYPE system identifier * (single-quoted) state. @@ -5811,18 +5865,21 @@ public class Tokenizer implements Locator { returnStateSave = returnState; return pos; } - + // HOTSPOT WORKAROUND INSERTION POINT - + // [NOCPP[ - + protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException { return to; } // ]NOCPP] - + private void initDoctypeFields() { + // Discard the characters "DOCTYPE" accumulated as a potential bogus + // comment into strBuf. + clearStrBufAfterUse(); doctypeName = ""; if (systemIdentifier != null) { Portability.releaseString(systemIdentifier); @@ -6110,7 +6167,6 @@ public class Tokenizer implements Locator { break eofloop; case MARKUP_DECLARATION_OPEN: errBogusComment(); - clearStrBuf(); emitComment(0, 0); break eofloop; case MARKUP_DECLARATION_HYPHEN: @@ -6321,7 +6377,7 @@ public class Tokenizer implements Locator { * This section defines how to consume an entity. This * definition is used when parsing entities in text and in * attributes. - * + * * The behavior depends on the identity of the next * character (the one immediately after the U+0026 AMPERSAND * character): @@ -6466,6 +6522,7 @@ public class Tokenizer implements Locator { charRefBufLen - charRefBufMark); } } + charRefBufLen = 0; state = returnState; continue eofloop; /* @@ -6484,7 +6541,7 @@ public class Tokenizer implements Locator { * characters (and unconsume the U+0023 NUMBER SIGN * character and, if appropriate, the X character). This is * a parse error; nothing is returned. - * + * * Otherwise, if the next character is a U+003B SEMICOLON, * consume that too. If it isn't, there is a parse error. */ @@ -6602,14 +6659,14 @@ public class Tokenizer implements Locator { } // [NOCPP[ - + public void becomeConfident() { confident = true; } /** * Returns the nextCharOnNewLine. - * + * * @return the nextCharOnNewLine */ public boolean isNextCharOnNewLine() { @@ -6622,7 +6679,7 @@ public class Tokenizer implements Locator { /** * Returns the line. - * + * * @return the line */ public int getLine() { @@ -6631,7 +6688,7 @@ public class Tokenizer implements Locator { /** * Returns the col. - * + * * @return the col */ public int getCol() { @@ -6639,13 +6696,13 @@ public class Tokenizer implements Locator { } // ]NOCPP] - + public boolean isInDataState() { return (stateSave == DATA); } public void resetToDataState() { - strBufLen = 0; + clearStrBufAfterUse(); charRefBufLen = 0; stateSave = Tokenizer.DATA; // line = 1; XXX line numbers @@ -6978,7 +7035,7 @@ public class Tokenizer implements Locator { /** * Sets the encodingDeclarationHandler. - * + * * @param encodingDeclarationHandler * the encodingDeclarationHandler to set */ @@ -6986,25 +7043,25 @@ public class Tokenizer implements Locator { EncodingDeclarationHandler encodingDeclarationHandler) { this.encodingDeclarationHandler = encodingDeclarationHandler; } - + void destructor() { // The translator will write refcount tracing stuff here Portability.delete(attributes); attributes = null; } - + // [NOCPP[ - + /** - * Sets an offset to be added to the position reported to + * Sets an offset to be added to the position reported to * TransitionHandler. - * + * * @param offset the offset */ public void setTransitionBaseOffset(int offset) { - + } - + // ]NOCPP] } diff --git a/parser/html/nsHtml5Tokenizer.cpp b/parser/html/nsHtml5Tokenizer.cpp index 884085ecef09..2838d74aa5c9 100644 --- a/parser/html/nsHtml5Tokenizer.cpp +++ b/parser/html/nsHtml5Tokenizer.cpp @@ -1,25 +1,25 @@ /* * Copyright (c) 2005-2007 Henri Sivonen * Copyright (c) 2007-2015 Mozilla Foundation - * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla + * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla * Foundation, and Opera Software ASA. * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in + * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ @@ -217,6 +217,7 @@ nsHtml5Tokenizer::emitOrAppendCharRefBuf(int32_t returnState) } else { if (charRefBufLen > 0) { tokenHandler->characters(charRefBuf, 0, charRefBufLen); + charRefBufLen = 0; } } } @@ -224,13 +225,16 @@ nsHtml5Tokenizer::emitOrAppendCharRefBuf(int32_t returnState) nsString* nsHtml5Tokenizer::strBufToString() { - return nsHtml5Portability::newStringFromBuffer(strBuf, 0, strBufLen, tokenHandler); + nsString* str = nsHtml5Portability::newStringFromBuffer(strBuf, 0, strBufLen, tokenHandler); + clearStrBufAfterUse(); + return str; } void nsHtml5Tokenizer::strBufToDoctypeName() { doctypeName = nsHtml5Portability::newLocalNameFromBuffer(strBuf, 0, strBufLen, interner); + clearStrBufAfterUse(); } void @@ -238,6 +242,7 @@ nsHtml5Tokenizer::emitStrBuf() { if (strBufLen > 0) { tokenHandler->characters(strBuf, 0, strBufLen); + clearStrBufAfterUse(); } } @@ -259,6 +264,7 @@ void nsHtml5Tokenizer::emitComment(int32_t provisionalHyphens, int32_t pos) { tokenHandler->comment(strBuf, 0, strBufLen - provisionalHyphens); + clearStrBufAfterUse(); cstart = pos + 1; } @@ -275,6 +281,7 @@ void nsHtml5Tokenizer::strBufToElementNameString() { tagName = nsHtml5ElementName::elementNameByBuffer(strBuf, 0, strBufLen, interner); + clearStrBufAfterUse(); } int32_t @@ -316,6 +323,7 @@ void nsHtml5Tokenizer::attributeNameComplete() { attributeName = nsHtml5AttributeName::nameByBuffer(strBuf, 0, strBufLen, interner); + clearStrBufAfterUse(); if (!attributes) { attributes = new nsHtml5HtmlAttributes(0); } @@ -333,6 +341,8 @@ nsHtml5Tokenizer::addAttributeWithoutValue() if (attributeName) { attributes->addAttribute(attributeName, nsHtml5Portability::newEmptyString(), attributeLine); attributeName = nullptr; + } else { + clearStrBufAfterUse(); } } @@ -346,6 +356,8 @@ nsHtml5Tokenizer::addAttributeWithValue() } attributes->addAttribute(attributeName, val, attributeLine); attributeName = nullptr; + } else { + clearStrBufAfterUse(); } } @@ -427,7 +439,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu switch(c) { case '&': { flushChars(buf, pos); - clearCharRefBufAndAppend(c); + MOZ_ASSERT(!charRefBufLen, "charRefBufLen not reset after previous use!"); + appendCharRefBuf(c); setAdditionalAndRememberAmpersandLocation('\0'); returnState = state; state = P::transition(mViewSource, NS_HTML5TOKENIZER_CONSUME_CHARACTER_REFERENCE, reconsume, pos); @@ -464,12 +477,14 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu c = checkChar(buf, pos); if (c >= 'A' && c <= 'Z') { endTag = false; - clearStrBufAndAppend((char16_t) (c + 0x20)); + clearStrBufBeforeUse(); + appendStrBuf((char16_t) (c + 0x20)); state = P::transition(mViewSource, NS_HTML5TOKENIZER_TAG_NAME, reconsume, pos); NS_HTML5_BREAK(tagopenloop); } else if (c >= 'a' && c <= 'z') { endTag = false; - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = P::transition(mViewSource, NS_HTML5TOKENIZER_TAG_NAME, reconsume, pos); NS_HTML5_BREAK(tagopenloop); } @@ -490,7 +505,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (P::reportErrors) { errProcessingInstruction(); } - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = P::transition(mViewSource, NS_HTML5TOKENIZER_BOGUS_COMMENT, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -617,7 +633,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu c += 0x20; } attributeLine = line; - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_NAME, reconsume, pos); NS_HTML5_BREAK(beforeattributenameloop); } @@ -710,13 +727,13 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu } case '\"': { attributeLine = line; - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); NS_HTML5_BREAK(beforeattributevalueloop); } case '&': { attributeLine = line; - clearStrBuf(); + clearStrBufBeforeUse(); reconsume = true; state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); @@ -724,7 +741,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu } case '\'': { attributeLine = line; - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -751,7 +768,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu } default: { attributeLine = line; - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); @@ -777,7 +795,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu NS_HTML5_BREAK(attributevaluedoublequotedloop); } case '&': { - clearCharRefBufAndAppend(c); + MOZ_ASSERT(!charRefBufLen, "charRefBufLen not reset after previous use!"); + appendCharRefBuf(c); setAdditionalAndRememberAmpersandLocation('\"'); returnState = state; state = P::transition(mViewSource, NS_HTML5TOKENIZER_CONSUME_CHARACTER_REFERENCE, reconsume, pos); @@ -897,7 +916,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu NS_HTML5_CONTINUE(stateloop); } case '&': { - clearCharRefBufAndAppend(c); + MOZ_ASSERT(!charRefBufLen, "charRefBufLen not reset after previous use!"); + appendCharRefBuf(c); setAdditionalAndRememberAmpersandLocation('>'); returnState = state; state = P::transition(mViewSource, NS_HTML5TOKENIZER_CONSUME_CHARACTER_REFERENCE, reconsume, pos); @@ -982,7 +1002,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (c >= 'A' && c <= 'Z') { c += 0x20; } - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_NAME, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -997,20 +1018,23 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu c = checkChar(buf, pos); switch(c) { case '-': { - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = P::transition(mViewSource, NS_HTML5TOKENIZER_MARKUP_DECLARATION_HYPHEN, reconsume, pos); NS_HTML5_BREAK(markupdeclarationopenloop); } case 'd': case 'D': { - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); index = 0; state = P::transition(mViewSource, NS_HTML5TOKENIZER_MARKUP_DECLARATION_OCTYPE, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } case '[': { if (tokenHandler->cdataSectionAllowed()) { - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); index = 0; state = P::transition(mViewSource, NS_HTML5TOKENIZER_CDATA_START, reconsume, pos); NS_HTML5_CONTINUE(stateloop); @@ -1020,7 +1044,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (P::reportErrors) { errBogusComment(); } - clearStrBuf(); + clearStrBufBeforeUse(); reconsume = true; state = P::transition(mViewSource, NS_HTML5TOKENIZER_BOGUS_COMMENT, reconsume, pos); NS_HTML5_CONTINUE(stateloop); @@ -1040,7 +1064,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu NS_HTML5_BREAK(stateloop); } case '-': { - clearStrBuf(); + clearStrBufAfterOneHyphen(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_COMMENT_START, reconsume, pos); NS_HTML5_BREAK(markupdeclarationhyphenloop); } @@ -1304,6 +1328,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu index++; continue; } else { + clearStrBufAfterUse(); cstart = pos; reconsume = true; state = P::transition(mViewSource, NS_HTML5TOKENIZER_CDATA_SECTION, reconsume, pos); @@ -1411,7 +1436,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu NS_HTML5_CONTINUE(stateloop); } case '&': { - clearCharRefBufAndAppend(c); + MOZ_ASSERT(!charRefBufLen, "charRefBufLen not reset after previous use!"); + appendCharRefBuf(c); setAdditionalAndRememberAmpersandLocation('\''); returnState = state; state = P::transition(mViewSource, NS_HTML5TOKENIZER_CONSUME_CHARACTER_REFERENCE, reconsume, pos); @@ -1643,6 +1669,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu } } bool earlyBreak = (c == ';' && charRefBufMark == charRefBufLen); + charRefBufLen = 0; if (!(returnState & NS_HTML5TOKENIZER_DATA_AND_RCDATA_MASK)) { cstart = earlyBreak ? pos + 1 : pos; } @@ -1736,6 +1763,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu decimalloop_end: ; } case NS_HTML5TOKENIZER_HANDLE_NCR_VALUE: { + charRefBufLen = 0; handleNcrValue(returnState); state = P::transition(mViewSource, returnState, reconsume, pos); NS_HTML5_CONTINUE(stateloop); @@ -1861,7 +1889,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (P::reportErrors) { errGarbageAfterLtSlash(); } - clearStrBufAndAppend('\n'); + clearStrBufBeforeUse(); + appendStrBuf('\n'); state = P::transition(mViewSource, NS_HTML5TOKENIZER_BOGUS_COMMENT, reconsume, pos); NS_HTML5_BREAK(stateloop); } @@ -1870,7 +1899,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (P::reportErrors) { errGarbageAfterLtSlash(); } - clearStrBufAndAppend('\n'); + clearStrBufBeforeUse(); + appendStrBuf(c); state = P::transition(mViewSource, NS_HTML5TOKENIZER_BOGUS_COMMENT, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -1883,14 +1913,16 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu } if (c >= 'a' && c <= 'z') { endTag = true; - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = P::transition(mViewSource, NS_HTML5TOKENIZER_TAG_NAME, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } else { if (P::reportErrors) { errGarbageAfterLtSlash(); } - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = P::transition(mViewSource, NS_HTML5TOKENIZER_BOGUS_COMMENT, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -1910,7 +1942,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu switch(c) { case '&': { flushChars(buf, pos); - clearCharRefBufAndAppend(c); + MOZ_ASSERT(!charRefBufLen, "charRefBufLen not reset after previous use!"); + appendCharRefBuf(c); setAdditionalAndRememberAmpersandLocation('\0'); returnState = state; state = P::transition(mViewSource, NS_HTML5TOKENIZER_CONSUME_CHARACTER_REFERENCE, reconsume, pos); @@ -1984,7 +2017,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu switch(c) { case '/': { index = 0; - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_NON_DATA_END_TAG_NAME, reconsume, pos); NS_HTML5_BREAK(rawtextrcdatalessthansignloop); } @@ -2028,6 +2061,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu switch(c) { case '\r': { silentCarriageReturn(); + clearStrBufAfterUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_BEFORE_ATTRIBUTE_NAME, reconsume, pos); NS_HTML5_BREAK(stateloop); } @@ -2037,14 +2071,17 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu case ' ': case '\t': case '\f': { + clearStrBufAfterUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_BEFORE_ATTRIBUTE_NAME, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } case '/': { + clearStrBufAfterUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_SELF_CLOSING_START_TAG, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } case '>': { + clearStrBufAfterUse(); state = P::transition(mViewSource, emitCurrentTagToken(false, pos), reconsume, pos); if (shouldSuspend) { NS_HTML5_BREAK(stateloop); @@ -2188,7 +2225,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu switch(c) { case '/': { index = 0; - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_NON_DATA_END_TAG_NAME, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -2373,7 +2410,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu switch(c) { case '/': { index = 0; - clearStrBuf(); + clearStrBufBeforeUse(); returnState = NS_HTML5TOKENIZER_SCRIPT_DATA_ESCAPED; state = P::transition(mViewSource, NS_HTML5TOKENIZER_NON_DATA_END_TAG_NAME, reconsume, pos); NS_HTML5_CONTINUE(stateloop); @@ -2729,7 +2766,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (c >= 'A' && c <= 'Z') { c += 0x20; } - clearStrBufAndAppend(c); + clearStrBufBeforeUse(); + appendStrBuf(c); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_NAME, reconsume, pos); NS_HTML5_BREAK(beforedoctypenameloop); } @@ -2881,7 +2919,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (P::reportErrors) { errNoSpaceBetweenDoctypePublicKeywordAndQuote(); } - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -2889,7 +2927,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (P::reportErrors) { errNoSpaceBetweenDoctypePublicKeywordAndQuote(); } - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -2931,12 +2969,12 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu continue; } case '\"': { - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); NS_HTML5_BREAK(beforedoctypepublicidentifierloop); } case '\'': { - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -3029,7 +3067,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (P::reportErrors) { errNoSpaceBetweenPublicAndSystemIds(); } - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -3037,7 +3075,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (P::reportErrors) { errNoSpaceBetweenPublicAndSystemIds(); } - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -3075,12 +3113,12 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu NS_HTML5_CONTINUE(stateloop); } case '\"': { - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); NS_HTML5_BREAK(betweendoctypepublicandsystemidentifiersloop); } case '\'': { - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -3252,7 +3290,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (P::reportErrors) { errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); } - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -3260,7 +3298,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu if (P::reportErrors) { errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); } - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } @@ -3302,12 +3340,12 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu continue; } case '\"': { - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); NS_HTML5_CONTINUE(stateloop); } case '\'': { - clearStrBuf(); + clearStrBufBeforeUse(); state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); NS_HTML5_BREAK(beforedoctypesystemidentifierloop); } @@ -3455,6 +3493,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu void nsHtml5Tokenizer::initDoctypeFields() { + clearStrBufAfterUse(); doctypeName = nsHtml5Atoms::emptystring; if (systemIdentifier) { nsHtml5Portability::releaseString(systemIdentifier); @@ -3607,7 +3646,6 @@ nsHtml5Tokenizer::eof() } case NS_HTML5TOKENIZER_MARKUP_DECLARATION_OPEN: { errBogusComment(); - clearStrBuf(); emitComment(0, 0); NS_HTML5_BREAK(eofloop); } @@ -3813,6 +3851,7 @@ nsHtml5Tokenizer::eof() tokenHandler->characters(charRefBuf, charRefBufMark, charRefBufLen - charRefBufMark); } } + charRefBufLen = 0; state = returnState; NS_HTML5_CONTINUE(eofloop); } @@ -3935,7 +3974,7 @@ nsHtml5Tokenizer::isInDataState() void nsHtml5Tokenizer::resetToDataState() { - strBufLen = 0; + clearStrBufAfterUse(); charRefBufLen = 0; stateSave = NS_HTML5TOKENIZER_DATA; lastCR = false; diff --git a/parser/html/nsHtml5Tokenizer.h b/parser/html/nsHtml5Tokenizer.h index 16f70889f5ea..da509b69b439 100644 --- a/parser/html/nsHtml5Tokenizer.h +++ b/parser/html/nsHtml5Tokenizer.h @@ -1,25 +1,25 @@ /* * Copyright (c) 2005-2007 Henri Sivonen * Copyright (c) 2007-2015 Mozilla Foundation - * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla + * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla * Foundation, and Opera Software ASA. * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in + * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ @@ -162,21 +162,22 @@ class nsHtml5Tokenizer charRefBuf[charRefBufLen++] = c; } - inline void clearCharRefBufAndAppend(char16_t c) - { - charRefBuf[0] = c; - charRefBufLen = 1; - } - void emitOrAppendCharRefBuf(int32_t returnState); - inline void clearStrBufAndAppend(char16_t c) + inline void clearStrBufAfterUse() { - strBuf[0] = c; - strBufLen = 1; + strBufLen = 0; } - inline void clearStrBuf() + inline void clearStrBufBeforeUse() { + MOZ_ASSERT(!strBufLen, "strBufLen not reset after previous use!"); + strBufLen = 0; + } + + inline void clearStrBufAfterOneHyphen() + { + MOZ_ASSERT(strBufLen == 1, "strBufLen length not one!"); + MOZ_ASSERT(strBuf[0] == '-', "strBuf does not start with a hyphen!"); strBufLen = 0; } @@ -211,6 +212,7 @@ class nsHtml5Tokenizer inline void appendCharRefBufToStrBuf() { appendStrBuf(charRefBuf, 0, charRefBufLen); + charRefBufLen = 0; } void emitComment(int32_t provisionalHyphens, int32_t pos);