From e5ac89f389d9592504691066455f3e3426742cd2 Mon Sep 17 00:00:00 2001
From: Henri Sivonen <hsivonen@hsivonen.fi>
Date: Wed, 19 Oct 2016 10:38:20 +0300
Subject: [PATCH] Bug 1309195 - Mark strBuf as empty after the contents have
 been used or ignored. r=wchen.

MozReview-Commit-ID: 5KU62o2IMPs
---
 parser/html/javasrc/Tokenizer.java | 413 ++++++++++++++++-------------
 parser/html/nsHtml5Tokenizer.cpp   | 149 +++++++----
 parser/html/nsHtml5Tokenizer.h     |  48 ++--
 3 files changed, 354 insertions(+), 256 deletions(-)

diff --git a/parser/html/javasrc/Tokenizer.java b/parser/html/javasrc/Tokenizer.java
index 9754196b068a..d9eaafeb3e14 100644
--- a/parser/html/javasrc/Tokenizer.java
+++ b/parser/html/javasrc/Tokenizer.java
@@ -1,40 +1,45 @@
 /*
  * Copyright (c) 2005-2007 Henri Sivonen
  * Copyright (c) 2007-2015 Mozilla Foundation
- * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 
+ * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
  * Foundation, and Opera Software ASA.
  *
- * Permission is hereby granted, free of charge, to any person obtaining a 
- * copy of this software and associated documentation files (the "Software"), 
- * to deal in the Software without restriction, including without limitation 
- * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
- * and/or sell copies of the Software, and to permit persons to whom the 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in 
+ * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
 
 /*
- * The comments following this one that use the same comment syntax as this 
- * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 
+ * The comments following this one that use the same comment syntax as this
+ * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
  * amended as of June 18 2008 and May 31 2010.
  * That document came with this statement:
- * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and 
- * Opera Software ASA. You are granted a license to use, reproduce and 
+ * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
+ * Opera Software ASA. You are granted a license to use, reproduce and
  * create derivative works of this document."
  */
 
 package nu.validator.htmlparser.impl;
 
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
 import nu.validator.htmlparser.annotation.Auto;
 import nu.validator.htmlparser.annotation.CharacterName;
 import nu.validator.htmlparser.annotation.Const;
@@ -46,23 +51,18 @@ import nu.validator.htmlparser.common.Interner;
 import nu.validator.htmlparser.common.TokenHandler;
 import nu.validator.htmlparser.common.XmlViolationPolicy;
 
-import org.xml.sax.ErrorHandler;
-import org.xml.sax.Locator;
-import org.xml.sax.SAXException;
-import org.xml.sax.SAXParseException;
-
 /**
  * An implementation of
  * https://html.spec.whatwg.org/multipage/syntax.html#tokenization
- * 
+ *
  * This class implements the <code>Locator</code> interface. This is not an
  * incidental implementation detail: Users of this class are encouraged to make
  * use of the <code>Locator</code> nature.
- * 
+ *
  * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
  * can be configured to treat these conditions as fatal or to coerce the infoset
  * to something that XML 1.0 allows.
- * 
+ *
  * @version $Id$
  * @author hsivonen
  */
@@ -529,12 +529,12 @@ public class Tokenizer implements Locator {
 
     /**
      * The constructor.
-     * 
+     *
      * @param tokenHandler
      *            the handler for receiving tokens
      */
     public Tokenizer(TokenHandler tokenHandler
-    // CPPONLY: , boolean viewingXmlSource        
+    // CPPONLY: , boolean viewingXmlSource
     ) {
         this.tokenHandler = tokenHandler;
         this.encodingDeclarationHandler = null;
@@ -577,7 +577,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Returns the mappingLangToXmlLang.
-     * 
+     *
      * @return the mappingLangToXmlLang
      */
     public boolean isMappingLangToXmlLang() {
@@ -586,7 +586,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Sets the mappingLangToXmlLang.
-     * 
+     *
      * @param mappingLangToXmlLang
      *            the mappingLangToXmlLang to set
      */
@@ -597,7 +597,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Sets the error handler.
-     * 
+     *
      * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
      */
     public void setErrorHandler(ErrorHandler eh) {
@@ -610,7 +610,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Sets the commentPolicy.
-     * 
+     *
      * @param commentPolicy
      *            the commentPolicy to set
      */
@@ -620,7 +620,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Sets the contentNonXmlCharPolicy.
-     * 
+     *
      * @param contentNonXmlCharPolicy
      *            the contentNonXmlCharPolicy to set
      */
@@ -634,7 +634,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Sets the contentSpacePolicy.
-     * 
+     *
      * @param contentSpacePolicy
      *            the contentSpacePolicy to set
      */
@@ -644,7 +644,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Sets the xmlnsPolicy.
-     * 
+     *
      * @param xmlnsPolicy
      *            the xmlnsPolicy to set
      */
@@ -661,7 +661,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Sets the html4ModeCompatibleWithXhtml1Schemata.
-     * 
+     *
      * @param html4ModeCompatibleWithXhtml1Schemata
      *            the html4ModeCompatibleWithXhtml1Schemata to set
      */
@@ -674,10 +674,10 @@ public class Tokenizer implements Locator {
 
     // For the token handler to call
     /**
-     * Sets the tokenizer state and the associated element name. This should 
+     * Sets the tokenizer state and the associated element name. This should
      * only ever used to put the tokenizer into one of the states that have
      * a special end tag expectation.
-     * 
+     *
      * @param specialTokenizerState
      *            the tokenizer state to set
      * @param endTagExpectation
@@ -696,10 +696,10 @@ public class Tokenizer implements Locator {
     }
 
     /**
-     * Sets the tokenizer state and the associated element name. This should 
+     * Sets the tokenizer state and the associated element name. This should
      * only ever used to put the tokenizer into one of the states that have
      * a special end tag expectation.
-     * 
+     *
      * @param specialTokenizerState
      *            the tokenizer state to set
      * @param endTagExpectation
@@ -822,33 +822,35 @@ public class Tokenizer implements Locator {
         charRefBuf[charRefBufLen++] = c;
     }
 
-    @Inline private void clearCharRefBufAndAppend(char c) {
-        charRefBuf[0] = c;
-        charRefBufLen = 1;
-    }
-
     private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
             appendCharRefBufToStrBuf();
         } else {
             if (charRefBufLen > 0) {
                 tokenHandler.characters(charRefBuf, 0, charRefBufLen);
+                charRefBufLen = 0;
             }
         }
     }
 
-    @Inline private void clearStrBufAndAppend(char c) {
-        strBuf[0] = c;
-        strBufLen = 1;
+    @Inline private void clearStrBufAfterUse() {
+        strBufLen = 0;
     }
 
-    @Inline private void clearStrBuf() {
+    @Inline private void clearStrBufBeforeUse() {
+        assert strBufLen == 0: "strBufLen not reset after previous use!";
+        strBufLen = 0; // no-op in the absence of bugs
+    }
+
+    @Inline private void clearStrBufAfterOneHyphen() {
+        assert strBufLen == 1: "strBufLen length not one!";
+        assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
         strBufLen = 0;
     }
 
     /**
      * Appends to the buffer.
-     * 
+     *
      * @param c
      *            the UTF-16 code unit to append
      */
@@ -864,38 +866,42 @@ public class Tokenizer implements Locator {
 
     /**
      * The buffer as a String. Currently only used for error reporting.
-     * 
+     *
      * <p>
      * C++ memory note: The return value must be released.
-     * 
+     *
      * @return the buffer as a string
      */
     protected String strBufToString() {
-        return Portability.newStringFromBuffer(strBuf, 0, strBufLen
+        String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
             // CPPONLY: , tokenHandler
         );
+        clearStrBufAfterUse();
+        return str;
     }
 
     /**
      * Returns the buffer as a local name. The return value is released in
      * emitDoctypeToken().
-     * 
+     *
      * @return the buffer as local name
      */
     private void strBufToDoctypeName() {
         doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
                 interner);
+        clearStrBufAfterUse();
     }
 
     /**
      * Emits the buffer as character tokens.
-     * 
+     *
      * @throws SAXException
      *             if the token handler threw
      */
     private void emitStrBuf() throws SAXException {
         if (strBufLen > 0) {
             tokenHandler.characters(strBuf, 0, strBufLen);
+            clearStrBufAfterUse();
         }
     }
 
@@ -942,6 +948,8 @@ public class Tokenizer implements Locator {
         switch (commentPolicy) {
             case ALTER_INFOSET:
                 strBufLen--;
+                // WARNING!!! One char is dropped but two are appended, so the
+                // buffer's worst-case length can exceed the length of input!
                 appendStrBuf(' ');
                 appendStrBuf('-');
                 // FALLTHROUGH
@@ -975,14 +983,15 @@ public class Tokenizer implements Locator {
      */
     @Inline private void appendCharRefBufToStrBuf() {
         appendStrBuf(charRefBuf, 0, charRefBufLen);
+        charRefBufLen = 0;
     }
 
     /**
      * Emits the current comment token.
-     * 
+     *
      * @param pos
      *            TODO
-     * 
+     *
      * @throws SAXException
      */
     private void emitComment(int provisionalHyphens, int pos)
@@ -995,17 +1004,18 @@ public class Tokenizer implements Locator {
             // [NOCPP[
         }
         // ]NOCPP]
+        clearStrBufAfterUse();
         cstart = pos + 1;
     }
 
     /**
      * Flushes coalesced character tokens.
-     * 
+     *
      * @param buf
      *            TODO
      * @param pos
      *            TODO
-     * 
+     *
      * @throws SAXException
      */
     protected void flushChars(@NoLength char[] buf, int pos)
@@ -1019,7 +1029,7 @@ public class Tokenizer implements Locator {
     /**
      * Reports an condition that would make the infoset incompatible with XML
      * 1.0 as fatal.
-     * 
+     *
      * @param message
      *            the message
      * @throws SAXException
@@ -1035,7 +1045,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Reports a Parse Error.
-     * 
+     *
      * @param message
      *            the message
      * @throws SAXException
@@ -1066,7 +1076,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Reports a warning
-     * 
+     *
      * @param message
      *            the message
      * @throws SAXException
@@ -1082,6 +1092,7 @@ public class Tokenizer implements Locator {
     private void strBufToElementNameString() {
         tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
                 interner);
+        clearStrBufAfterUse();
     }
 
     private int emitCurrentTagToken(boolean selfClosing, int pos)
@@ -1133,6 +1144,7 @@ public class Tokenizer implements Locator {
                 , namePolicy != XmlViolationPolicy.ALLOW
                 // ]NOCPP]
                 , interner);
+        clearStrBufAfterUse();
 
         if (attributes == null) {
             attributes = new HtmlAttributes(mappingLangToXmlLang);
@@ -1199,6 +1211,8 @@ public class Tokenizer implements Locator {
             // ]NOCPP]
             attributeName = null; // attributeName has been adopted by the
             // |attributes| object
+        } else {
+            clearStrBufAfterUse();
         }
     }
 
@@ -1229,6 +1243,9 @@ public class Tokenizer implements Locator {
             );
             attributeName = null; // attributeName has been adopted by the
             // |attributes| object
+        } else {
+            // We have a duplicate attribute. Explicitly discard its value.
+            clearStrBufAfterUse();
         }
     }
 
@@ -1254,7 +1271,7 @@ public class Tokenizer implements Locator {
     }
 
     // ]NOCPP]
-    
+
     public void start() throws SAXException {
         initializeWithoutStarting();
         tokenHandler.startTokenization(this);
@@ -1347,6 +1364,14 @@ public class Tokenizer implements Locator {
         // unifying the tokenizer and tree builder buffers in the future.
         int worstCase = strBufLen + inputLength + charRefBufLen + 2;
         tokenHandler.ensureBufferSpace(worstCase);
+        if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) {
+            // When altering infoset, if the comment contents are consecutive
+            // hyphens, each hyphen generates a space, too. These buffer
+            // contents never get emitted as characters() to the tokenHandler,
+            // which is why this calculation happens after the call to
+            // ensureBufferSpace on tokenHandler.
+            worstCase *= 2;
+        }
         if (strBuf == null) {
             // Add an arbitrary small value to avoid immediate reallocation
             // once there are a few characters in the buffer.
@@ -1370,66 +1395,66 @@ public class Tokenizer implements Locator {
             int endPos) throws SAXException {
         /*
          * Idioms used in this code:
-         * 
-         * 
+         *
+         *
          * Consuming the next input character
-         * 
+         *
          * To consume the next input character, the code does this: if (++pos ==
          * endPos) { break stateloop; } c = checkChar(buf, pos);
-         * 
-         * 
+         *
+         *
          * Staying in a state
-         * 
+         *
          * When there's a state that the tokenizer may stay in over multiple
          * input characters, the state has a wrapper |for(;;)| loop and staying
          * in the state continues the loop.
-         * 
-         * 
+         *
+         *
          * Switching to another state
-         * 
+         *
          * To switch to another state, the code sets the state variable to the
          * magic number of the new state. Then it either continues stateloop or
          * breaks out of the state's own wrapper loop if the target state is
          * right after the current state in source order. (This is a partial
          * workaround for Java's lack of goto.)
-         * 
-         * 
+         *
+         *
          * Reconsume support
-         * 
+         *
          * The spec sometimes says that an input character is reconsumed in
          * another state. If a state can ever be entered so that an input
          * character can be reconsumed in it, the state's code starts with an
          * |if (reconsume)| that sets reconsume to false and skips over the
          * normal code for consuming a new character.
-         * 
+         *
          * To reconsume the current character in another state, the code sets
          * |reconsume| to true and then switches to the other state.
-         * 
-         * 
+         *
+         *
          * Emitting character tokens
-         * 
+         *
          * This method emits character tokens lazily. Whenever a new range of
          * character tokens starts, the field cstart must be set to the start
          * index of the range. The flushChars() method must be called at the end
          * of a range to flush it.
-         * 
-         * 
+         *
+         *
          * U+0000 handling
-         * 
+         *
          * The various states have to handle the replacement of U+0000 with
          * U+FFFD. However, if U+0000 would be reconsumed in another state, the
          * replacement doesn't need to happen, because it's handled by the
          * reconsuming state.
-         * 
-         * 
+         *
+         *
          * LF handling
-         * 
+         *
          * Every state needs to increment the line number upon LF unless the LF
          * gets reconsumed by another state which increments the line number.
-         * 
-         * 
+         *
+         *
          * CR handling
-         * 
+         *
          * Every state needs to handle CR unless the CR gets reconsumed and is
          * handled by the reconsuming state. The CR needs to be handled as if it
          * were and LF, the lastCR field must be set to true and then this
@@ -1455,7 +1480,8 @@ public class Tokenizer implements Locator {
                                  * reference in data state.
                                  */
                                 flushChars(buf, pos);
-                                clearCharRefBufAndAppend(c);
+                                assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
+                                appendCharRefBuf(c);
                                 setAdditionalAndRememberAmpersandLocation('\u0000');
                                 returnState = state;
                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
@@ -1482,7 +1508,7 @@ public class Tokenizer implements Locator {
                                 /*
                                  * Anything else Emit the input character as a
                                  * character token.
-                                 * 
+                                 *
                                  * Stay in the data state.
                                  */
                                 continue;
@@ -1515,7 +1541,8 @@ public class Tokenizer implements Locator {
                              * input character (add 0x0020 to the character's
                              * code point),
                              */
-                            clearStrBufAndAppend((char) (c + 0x20));
+                            clearStrBufBeforeUse();
+                            appendStrBuf((char) (c + 0x20));
                             /* then switch to the tag name state. */
                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
                             /*
@@ -1534,7 +1561,8 @@ public class Tokenizer implements Locator {
                             /*
                              * set its tag name to the input character,
                              */
-                            clearStrBufAndAppend(c);
+                            clearStrBufBeforeUse();
+                            appendStrBuf(c);
                             /* then switch to the tag name state. */
                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
                             /*
@@ -1574,7 +1602,8 @@ public class Tokenizer implements Locator {
                                 /*
                                  * Switch to the bogus comment state.
                                  */
-                                clearStrBufAndAppend(c);
+                                clearStrBufBeforeUse();
+                                appendStrBuf(c);
                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
                                 continue stateloop;
                             case '>':
@@ -1777,7 +1806,8 @@ public class Tokenizer implements Locator {
                                  * Set that attribute's name to the current
                                  * input character,
                                  */
-                                clearStrBufAndAppend(c);
+                                clearStrBufBeforeUse();
+                                appendStrBuf(c);
                                 /*
                                  * and its value to the empty string.
                                  */
@@ -1923,7 +1953,7 @@ public class Tokenizer implements Locator {
                                  * attribute value (double-quoted) state.
                                  */
                                 // CPPONLY: attributeLine = line;
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
                                 break beforeattributevalueloop;
                             // continue stateloop;
@@ -1934,7 +1964,7 @@ public class Tokenizer implements Locator {
                                  * input character.
                                  */
                                 // CPPONLY: attributeLine = line;
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 reconsume = true;
                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
                                 noteUnquotedAttributeValue();
@@ -1945,7 +1975,7 @@ public class Tokenizer implements Locator {
                                  * value (single-quoted) state.
                                  */
                                 // CPPONLY: attributeLine = line;
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
                                 continue stateloop;
                             case '>':
@@ -1989,7 +2019,8 @@ public class Tokenizer implements Locator {
                                  * character to the current attribute's value.
                                  */
                                 // CPPONLY: attributeLine = line;
-                                clearStrBufAndAppend(c);
+                                clearStrBufBeforeUse();
+                                appendStrBuf(c);
                                 /*
                                  * Switch to the attribute value (unquoted)
                                  * state.
@@ -2032,7 +2063,8 @@ public class Tokenizer implements Locator {
                                  * additional allowed character being U+0022
                                  * QUOTATION MARK (").
                                  */
-                                clearCharRefBufAndAppend(c);
+                                assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
+                                appendCharRefBuf(c);
                                 setAdditionalAndRememberAmpersandLocation('\"');
                                 returnState = state;
                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
@@ -2201,7 +2233,8 @@ public class Tokenizer implements Locator {
                                  * additional allowed character being U+003E
                                  * GREATER-THAN SIGN (>)
                                  */
-                                clearCharRefBufAndAppend(c);
+                                assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
+                                appendCharRefBuf(c);
                                 setAdditionalAndRememberAmpersandLocation('>');
                                 returnState = state;
                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
@@ -2340,7 +2373,8 @@ public class Tokenizer implements Locator {
                                  * Set that attribute's name to the current
                                  * input character,
                                  */
-                                clearStrBufAndAppend(c);
+                                clearStrBufBeforeUse();
+                                appendStrBuf(c);
                                 /*
                                  * and its value to the empty string.
                                  */
@@ -2364,12 +2398,12 @@ public class Tokenizer implements Locator {
                          * HYPHEN-MINUS characters (-), consume those two
                          * characters, create a comment token whose data is the
                          * empty string, and switch to the comment start state.
-                         * 
+                         *
                          * Otherwise, if the next seven characters are an ASCII
                          * case-insensitive match for the word "DOCTYPE", then
                          * consume those characters and switch to the DOCTYPE
                          * state.
-                         * 
+                         *
                          * Otherwise, if the insertion mode is
                          * "in foreign content" and the current node is not an
                          * element in the HTML namespace and the next seven
@@ -2378,7 +2412,7 @@ public class Tokenizer implements Locator {
                          * U+005B LEFT SQUARE BRACKET character before and
                          * after), then consume those characters and switch to
                          * the CDATA section state.
-                         * 
+                         *
                          * Otherwise, is is a parse error. Switch to the bogus
                          * comment state. The next character that is consumed,
                          * if any, is the first character that will be in the
@@ -2386,19 +2420,22 @@ public class Tokenizer implements Locator {
                          */
                         switch (c) {
                             case '-':
-                                clearStrBufAndAppend(c);
+                                clearStrBufBeforeUse();
+                                appendStrBuf(c);
                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
                                 break markupdeclarationopenloop;
                             // continue stateloop;
                             case 'd':
                             case 'D':
-                                clearStrBufAndAppend(c);
+                                clearStrBufBeforeUse();
+                                appendStrBuf(c);
                                 index = 0;
                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
                                 continue stateloop;
                             case '[':
                                 if (tokenHandler.cdataSectionAllowed()) {
-                                    clearStrBufAndAppend(c);
+                                    clearStrBufBeforeUse();
+                                    appendStrBuf(c);
                                     index = 0;
                                     state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
                                     continue stateloop;
@@ -2406,7 +2443,7 @@ public class Tokenizer implements Locator {
                                 // else fall through
                             default:
                                 errBogusComment();
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 reconsume = true;
                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
                                 continue stateloop;
@@ -2423,7 +2460,7 @@ public class Tokenizer implements Locator {
                             case '\u0000':
                                 break stateloop;
                             case '-':
-                                clearStrBuf();
+                                clearStrBufAfterOneHyphen();
                                 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
                                 break markupdeclarationhyphenloop;
                             // continue stateloop;
@@ -2443,8 +2480,8 @@ public class Tokenizer implements Locator {
                         c = checkChar(buf, pos);
                         /*
                          * Comment start state
-                         * 
-                         * 
+                         *
+                         *
                          * Consume the next input character:
                          */
                         switch (c) {
@@ -2653,7 +2690,7 @@ public class Tokenizer implements Locator {
                         c = checkChar(buf, pos);
                         /*
                          * Comment end bang state
-                         * 
+                         *
                          * Consume the next input character:
                          */
                         switch (c) {
@@ -2713,7 +2750,7 @@ public class Tokenizer implements Locator {
                     c = checkChar(buf, pos);
                     /*
                      * Comment start dash state
-                     * 
+                     *
                      * Consume the next input character:
                      */
                     switch (c) {
@@ -2777,6 +2814,7 @@ public class Tokenizer implements Locator {
                             index++;
                             continue;
                         } else {
+                            clearStrBufAfterUse();
                             cstart = pos; // start coalescing
                             reconsume = true;
                             state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
@@ -2841,11 +2879,11 @@ public class Tokenizer implements Locator {
                         c = checkChar(buf, pos);
                         switch (c) {
                             case ']':
-                                // Saw a third ]. Emit one ] (logically the 
-                                // first one) and stay in this state to 
+                                // Saw a third ]. Emit one ] (logically the
+                                // first one) and stay in this state to
                                 // remember that the last two characters seen
                                 // have been ]].
-                                tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);                                
+                                tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
                                 continue;
                             case '>':
                                 cstart = pos + 1;
@@ -2890,7 +2928,8 @@ public class Tokenizer implements Locator {
                                  * + additional allowed character being U+0027
                                  * APOSTROPHE (').
                                  */
-                                clearCharRefBufAndAppend(c);
+                                assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
+                                appendCharRefBuf(c);
                                 setAdditionalAndRememberAmpersandLocation('\'');
                                 returnState = state;
                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
@@ -2939,7 +2978,7 @@ public class Tokenizer implements Locator {
                      * This section defines how to consume a character
                      * reference. This definition is used when parsing character
                      * references in text and in attributes.
-                     * 
+                     *
                      * The behavior depends on the identity of the next
                      * character (the one immediately after the U+0026 AMPERSAND
                      * character):
@@ -3010,7 +3049,7 @@ public class Tokenizer implements Locator {
                         }
                         /*
                          * The data structure is as follows:
-                         * 
+                         *
                          * HILO_ACCEL is a two-dimensional int array whose major
                          * index corresponds to the second character of the
                          * character reference (code point as index) and the
@@ -3021,7 +3060,7 @@ public class Tokenizer implements Locator {
                          * to omit parts of it: The second dimension of the
                          * table is null when no character reference starts with
                          * the character corresponding to that row.
-                         * 
+                         *
                          * The int value HILO_ACCEL (by these indeces) is zero
                          * if there exists no character reference starting with
                          * that two-letter prefix. Otherwise, the value is an
@@ -3033,14 +3072,14 @@ public class Tokenizer implements Locator {
                          * first two character reference names share their
                          * prefix so the packed int cannot be 0 by packing the
                          * two shorts.)
-                         * 
+                         *
                          * NAMES is an array of byte arrays where each byte
                          * array encodes the name of a character references as
                          * ASCII. The names omit the first two letters of the
                          * name. (Since storing the first two letters would be
                          * redundant with the data contained in HILO_ACCEL.) The
                          * entries are lexically sorted.
-                         * 
+                         *
                          * For a given index in NAMES, the same index in VALUES
                          * contains the corresponding expansion as an array of
                          * two UTF-16 code units (either the character and
@@ -3128,18 +3167,18 @@ public class Tokenizer implements Locator {
                         }
 
                         if (c == ';') {
-                            // If we see a semicolon, there cannot be a 
+                            // If we see a semicolon, there cannot be a
                             // longer match. Break the loop. However, before
-                            // breaking, take the longest match so far as the 
-                            // candidate, if we are just about to complete a 
+                            // breaking, take the longest match so far as the
+                            // candidate, if we are just about to complete a
                             // match.
                             if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
                                 candidate = lo;
                                 charRefBufMark = charRefBufLen;
-                            }                            
+                            }
                             break outer;
                         }
-                        
+
                         if (hi < lo) {
                             break outer;
                         }
@@ -3238,13 +3277,16 @@ public class Tokenizer implements Locator {
                                         charRefBufLen - charRefBufMark);
                             }
                         }
+                        // charRefBufLen will be zeroed below!
+
                         // Check if we broke out early with c being the last
                         // character that matched as opposed to being the
-                        // first one that didn't match. In the case of an 
+                        // first one that didn't match. In the case of an
                         // early break, the next run on text should start
-                        // *after* the current character and the current 
+                        // *after* the current character and the current
                         // character shouldn't be reconsumed.
                         boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen);
+                        charRefBufLen = 0;
                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
                             cstart = earlyBreak ? pos + 1 : pos;
                         }
@@ -3278,14 +3320,14 @@ public class Tokenizer implements Locator {
                             /*
                              * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
                              * LETTER X Consume the X.
-                             * 
+                             *
                              * Follow the steps below, but using the range of
                              * characters U+0030 DIGIT ZERO through to U+0039
                              * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
                              * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
                              * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
                              * LETTER F (in other words, 0-9, A-F, a-f).
-                             * 
+                             *
                              * When it comes to interpreting the number,
                              * interpret it as a hexadecimal number.
                              */
@@ -3297,7 +3339,7 @@ public class Tokenizer implements Locator {
                              * Anything else Follow the steps below, but using
                              * the range of characters U+0030 DIGIT ZERO through
                              * to U+0039 DIGIT NINE (i.e. just 0-9).
-                             * 
+                             *
                              * When it comes to interpreting the number,
                              * interpret it as a decimal number.
                              */
@@ -3354,7 +3396,7 @@ public class Tokenizer implements Locator {
                              * NUMBER SIGN character and, if appropriate, the X
                              * character). This is a parse error; nothing is
                              * returned.
-                             * 
+                             *
                              * Otherwise, if the next character is a U+003B
                              * SEMICOLON, consume that too. If it isn't, there
                              * is a parse error.
@@ -3383,6 +3425,8 @@ public class Tokenizer implements Locator {
                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
                 case HANDLE_NCR_VALUE:
                     // WARNING previous state sets reconsume
+                    // We are not going to emit the contents of charRefBuf.
+                    charRefBufLen = 0;
                     // XXX inline this case if the method size can take it
                     handleNcrValue(returnState);
                     state = transition(state, returnState, reconsume, pos);
@@ -3447,7 +3491,7 @@ public class Tokenizer implements Locator {
                              * NUMBER SIGN character and, if appropriate, the X
                              * character). This is a parse error; nothing is
                              * returned.
-                             * 
+                             *
                              * Otherwise, if the next character is a U+003B
                              * SEMICOLON, consume that too. If it isn't, there
                              * is a parse error.
@@ -3529,7 +3573,8 @@ public class Tokenizer implements Locator {
                             /*
                              * Switch to the bogus comment state.
                              */
-                            clearStrBufAndAppend('\n');
+                            clearStrBufBeforeUse();
+                            appendStrBuf('\n');
                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
                             break stateloop;
                         case '\n':
@@ -3539,7 +3584,8 @@ public class Tokenizer implements Locator {
                             /*
                              * Switch to the bogus comment state.
                              */
-                            clearStrBufAndAppend('\n');
+                            clearStrBufBeforeUse();
+                            appendStrBuf(c);
                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
                             continue stateloop;
                         case '\u0000':
@@ -3559,7 +3605,8 @@ public class Tokenizer implements Locator {
                                 /*
                                  * set its tag name to the input character,
                                  */
-                                clearStrBufAndAppend(c);
+                                clearStrBufBeforeUse();
+                                appendStrBuf(c);
                                 /*
                                  * then switch to the tag name state. (Don't
                                  * emit the token yet; further details will be
@@ -3573,7 +3620,8 @@ public class Tokenizer implements Locator {
                                 /*
                                  * Switch to the bogus comment state.
                                  */
-                                clearStrBufAndAppend(c);
+                                clearStrBufBeforeUse();
+                                appendStrBuf(c);
                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
                                 continue stateloop;
                             }
@@ -3596,7 +3644,8 @@ public class Tokenizer implements Locator {
                                  * reference in RCDATA state.
                                  */
                                 flushChars(buf, pos);
-                                clearCharRefBufAndAppend(c);
+                                assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
+                                appendCharRefBuf(c);
                                 setAdditionalAndRememberAmpersandLocation('\u0000');
                                 returnState = state;
                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
@@ -3681,7 +3730,7 @@ public class Tokenizer implements Locator {
                                  * data end tag open state.
                                  */
                                 index = 0;
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
                                 break rawtextrcdatalessthansignloop;
                             // FALL THRU continue stateloop;
@@ -3710,7 +3759,7 @@ public class Tokenizer implements Locator {
                         c = checkChar(buf, pos);
                         /*
                          * ASSERT! when entering this state, set index to 0 and
-                         * call clearStrBuf() assert (contentModelElement !=
+                         * call clearStrBufBeforeUse(); assert (contentModelElement !=
                          * null); Let's implement the above without lookahead.
                          * strBuf is the 'temporary buffer'.
                          */
@@ -3743,6 +3792,7 @@ public class Tokenizer implements Locator {
                             switch (c) {
                                 case '\r':
                                     silentCarriageReturn();
+                                    clearStrBufAfterUse(); // strBuf not used
                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
                                     break stateloop;
                                 case '\n':
@@ -3758,6 +3808,7 @@ public class Tokenizer implements Locator {
                                      * appropriate end tag token, then switch to
                                      * the before attribute name state.
                                      */
+                                    clearStrBufAfterUse(); // strBuf not used
                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
                                     continue stateloop;
                                 case '/':
@@ -3767,6 +3818,7 @@ public class Tokenizer implements Locator {
                                      * then switch to the self-closing start tag
                                      * state.
                                      */
+                                    clearStrBufAfterUse(); // strBuf not used
                                     state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
                                     continue stateloop;
                                 case '>':
@@ -3776,6 +3828,7 @@ public class Tokenizer implements Locator {
                                      * end tag token, then emit the current tag
                                      * token and switch to the data state.
                                      */
+                                    clearStrBufAfterUse(); // strBuf not used
                                     state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
                                     if (shouldSuspend) {
                                         break stateloop;
@@ -3832,9 +3885,9 @@ public class Tokenizer implements Locator {
                          * character (i.e. up to the character just before the
                          * U+003E or EOF character). (If the comment was started
                          * by the end of the file (EOF), the token is empty.)
-                         * 
+                         *
                          * Switch to the data state.
-                         * 
+                         *
                          * If the end of the file was reached, reconsume the EOF
                          * character.
                          */
@@ -3950,7 +4003,7 @@ public class Tokenizer implements Locator {
                                  * data end tag open state.
                                  */
                                 index = 0;
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
                                 continue stateloop;
                             case '!':
@@ -4208,7 +4261,7 @@ public class Tokenizer implements Locator {
                                  * data escaped end tag open state.
                                  */
                                 index = 0;
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
                                 continue stateloop;
@@ -4690,7 +4743,8 @@ public class Tokenizer implements Locator {
                                  * Set the token's name name to the current
                                  * input character.
                                  */
-                                clearStrBufAndAppend(c);
+                                clearStrBufBeforeUse();
+                                appendStrBuf(c);
                                 /*
                                  * Switch to the DOCTYPE name state.
                                  */
@@ -4911,7 +4965,7 @@ public class Tokenizer implements Locator {
                                  * Set the DOCTYPE token's public identifier to
                                  * the empty string (not missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE public identifier
                                  * (double-quoted) state.
@@ -4927,7 +4981,7 @@ public class Tokenizer implements Locator {
                                  * Set the DOCTYPE token's public identifier to
                                  * the empty string (not missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE public identifier
                                  * (single-quoted) state.
@@ -4998,7 +5052,7 @@ public class Tokenizer implements Locator {
                                  * token's public identifier to the empty string
                                  * (not missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE public identifier
                                  * (double-quoted) state.
@@ -5012,7 +5066,7 @@ public class Tokenizer implements Locator {
                                  * public identifier to the empty string (not
                                  * missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE public identifier
                                  * (single-quoted) state.
@@ -5163,7 +5217,7 @@ public class Tokenizer implements Locator {
                                  * Set the DOCTYPE token's system identifier to
                                  * the empty string (not missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE system identifier
                                  * (double-quoted) state.
@@ -5179,7 +5233,7 @@ public class Tokenizer implements Locator {
                                  * Set the DOCTYPE token's system identifier to
                                  * the empty string (not missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE system identifier
                                  * (single-quoted) state.
@@ -5244,7 +5298,7 @@ public class Tokenizer implements Locator {
                                  * token's system identifier to the empty string
                                  * (not missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE system identifier
                                  * (double-quoted) state.
@@ -5258,7 +5312,7 @@ public class Tokenizer implements Locator {
                                  * system identifier to the empty string (not
                                  * missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE system identifier
                                  * (single-quoted) state.
@@ -5507,7 +5561,7 @@ public class Tokenizer implements Locator {
                                  * Set the DOCTYPE token's system identifier to
                                  * the empty string (not missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE public identifier
                                  * (double-quoted) state.
@@ -5523,7 +5577,7 @@ public class Tokenizer implements Locator {
                                  * Set the DOCTYPE token's public identifier to
                                  * the empty string (not missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE public identifier
                                  * (single-quoted) state.
@@ -5594,7 +5648,7 @@ public class Tokenizer implements Locator {
                                  * token's system identifier to the empty string
                                  * (not missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE system identifier
                                  * (double-quoted) state.
@@ -5607,7 +5661,7 @@ public class Tokenizer implements Locator {
                                  * system identifier to the empty string (not
                                  * missing),
                                  */
-                                clearStrBuf();
+                                clearStrBufBeforeUse();
                                 /*
                                  * then switch to the DOCTYPE system identifier
                                  * (single-quoted) state.
@@ -5811,18 +5865,21 @@ public class Tokenizer implements Locator {
         returnStateSave = returnState;
         return pos;
     }
-    
+
     // HOTSPOT WORKAROUND INSERTION POINT
-    
+
     // [NOCPP[
-    
+
     protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
         return to;
     }
 
     // ]NOCPP]
-    
+
     private void initDoctypeFields() {
+        // Discard the characters "DOCTYPE" accumulated as a potential bogus
+        // comment into strBuf.
+        clearStrBufAfterUse();
         doctypeName = "";
         if (systemIdentifier != null) {
             Portability.releaseString(systemIdentifier);
@@ -6110,7 +6167,6 @@ public class Tokenizer implements Locator {
                     break eofloop;
                 case MARKUP_DECLARATION_OPEN:
                     errBogusComment();
-                    clearStrBuf();
                     emitComment(0, 0);
                     break eofloop;
                 case MARKUP_DECLARATION_HYPHEN:
@@ -6321,7 +6377,7 @@ public class Tokenizer implements Locator {
                      * This section defines how to consume an entity. This
                      * definition is used when parsing entities in text and in
                      * attributes.
-                     * 
+                     *
                      * The behavior depends on the identity of the next
                      * character (the one immediately after the U+0026 AMPERSAND
                      * character):
@@ -6466,6 +6522,7 @@ public class Tokenizer implements Locator {
                                         charRefBufLen - charRefBufMark);
                             }
                         }
+                        charRefBufLen = 0;
                         state = returnState;
                         continue eofloop;
                         /*
@@ -6484,7 +6541,7 @@ public class Tokenizer implements Locator {
                      * characters (and unconsume the U+0023 NUMBER SIGN
                      * character and, if appropriate, the X character). This is
                      * a parse error; nothing is returned.
-                     * 
+                     *
                      * Otherwise, if the next character is a U+003B SEMICOLON,
                      * consume that too. If it isn't, there is a parse error.
                      */
@@ -6602,14 +6659,14 @@ public class Tokenizer implements Locator {
     }
 
     // [NOCPP[
-    
+
     public void becomeConfident() {
         confident = true;
     }
 
     /**
      * Returns the nextCharOnNewLine.
-     * 
+     *
      * @return the nextCharOnNewLine
      */
     public boolean isNextCharOnNewLine() {
@@ -6622,7 +6679,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Returns the line.
-     * 
+     *
      * @return the line
      */
     public int getLine() {
@@ -6631,7 +6688,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Returns the col.
-     * 
+     *
      * @return the col
      */
     public int getCol() {
@@ -6639,13 +6696,13 @@ public class Tokenizer implements Locator {
     }
 
     // ]NOCPP]
-    
+
     public boolean isInDataState() {
         return (stateSave == DATA);
     }
 
     public void resetToDataState() {
-        strBufLen = 0;
+        clearStrBufAfterUse();
         charRefBufLen = 0;
         stateSave = Tokenizer.DATA;
         // line = 1; XXX line numbers
@@ -6978,7 +7035,7 @@ public class Tokenizer implements Locator {
 
     /**
      * Sets the encodingDeclarationHandler.
-     * 
+     *
      * @param encodingDeclarationHandler
      *            the encodingDeclarationHandler to set
      */
@@ -6986,25 +7043,25 @@ public class Tokenizer implements Locator {
             EncodingDeclarationHandler encodingDeclarationHandler) {
         this.encodingDeclarationHandler = encodingDeclarationHandler;
     }
-    
+
     void destructor() {
         // The translator will write refcount tracing stuff here
         Portability.delete(attributes);
         attributes = null;
     }
-    
+
     // [NOCPP[
-    
+
     /**
-     * Sets an offset to be added to the position reported to 
+     * Sets an offset to be added to the position reported to
      * <code>TransitionHandler</code>.
-     * 
+     *
      * @param offset the offset
      */
     public void setTransitionBaseOffset(int offset) {
-        
+
     }
-    
+
     // ]NOCPP]
 
 }
diff --git a/parser/html/nsHtml5Tokenizer.cpp b/parser/html/nsHtml5Tokenizer.cpp
index 884085ecef09..2838d74aa5c9 100644
--- a/parser/html/nsHtml5Tokenizer.cpp
+++ b/parser/html/nsHtml5Tokenizer.cpp
@@ -1,25 +1,25 @@
 /*
  * Copyright (c) 2005-2007 Henri Sivonen
  * Copyright (c) 2007-2015 Mozilla Foundation
- * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 
+ * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
  * Foundation, and Opera Software ASA.
  *
- * Permission is hereby granted, free of charge, to any person obtaining a 
- * copy of this software and associated documentation files (the "Software"), 
- * to deal in the Software without restriction, including without limitation 
- * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
- * and/or sell copies of the Software, and to permit persons to whom the 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in 
+ * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
 
@@ -217,6 +217,7 @@ nsHtml5Tokenizer::emitOrAppendCharRefBuf(int32_t returnState)
   } else {
     if (charRefBufLen > 0) {
       tokenHandler->characters(charRefBuf, 0, charRefBufLen);
+      charRefBufLen = 0;
     }
   }
 }
@@ -224,13 +225,16 @@ nsHtml5Tokenizer::emitOrAppendCharRefBuf(int32_t returnState)
 nsString* 
 nsHtml5Tokenizer::strBufToString()
 {
-  return nsHtml5Portability::newStringFromBuffer(strBuf, 0, strBufLen, tokenHandler);
+  nsString* str = nsHtml5Portability::newStringFromBuffer(strBuf, 0, strBufLen, tokenHandler);
+  clearStrBufAfterUse();
+  return str;
 }
 
 void 
 nsHtml5Tokenizer::strBufToDoctypeName()
 {
   doctypeName = nsHtml5Portability::newLocalNameFromBuffer(strBuf, 0, strBufLen, interner);
+  clearStrBufAfterUse();
 }
 
 void 
@@ -238,6 +242,7 @@ nsHtml5Tokenizer::emitStrBuf()
 {
   if (strBufLen > 0) {
     tokenHandler->characters(strBuf, 0, strBufLen);
+    clearStrBufAfterUse();
   }
 }
 
@@ -259,6 +264,7 @@ void
 nsHtml5Tokenizer::emitComment(int32_t provisionalHyphens, int32_t pos)
 {
   tokenHandler->comment(strBuf, 0, strBufLen - provisionalHyphens);
+  clearStrBufAfterUse();
   cstart = pos + 1;
 }
 
@@ -275,6 +281,7 @@ void
 nsHtml5Tokenizer::strBufToElementNameString()
 {
   tagName = nsHtml5ElementName::elementNameByBuffer(strBuf, 0, strBufLen, interner);
+  clearStrBufAfterUse();
 }
 
 int32_t 
@@ -316,6 +323,7 @@ void
 nsHtml5Tokenizer::attributeNameComplete()
 {
   attributeName = nsHtml5AttributeName::nameByBuffer(strBuf, 0, strBufLen, interner);
+  clearStrBufAfterUse();
   if (!attributes) {
     attributes = new nsHtml5HtmlAttributes(0);
   }
@@ -333,6 +341,8 @@ nsHtml5Tokenizer::addAttributeWithoutValue()
   if (attributeName) {
     attributes->addAttribute(attributeName, nsHtml5Portability::newEmptyString(), attributeLine);
     attributeName = nullptr;
+  } else {
+    clearStrBufAfterUse();
   }
 }
 
@@ -346,6 +356,8 @@ nsHtml5Tokenizer::addAttributeWithValue()
     }
     attributes->addAttribute(attributeName, val, attributeLine);
     attributeName = nullptr;
+  } else {
+    clearStrBufAfterUse();
   }
 }
 
@@ -427,7 +439,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
           switch(c) {
             case '&': {
               flushChars(buf, pos);
-              clearCharRefBufAndAppend(c);
+              MOZ_ASSERT(!charRefBufLen, "charRefBufLen not reset after previous use!");
+              appendCharRefBuf(c);
               setAdditionalAndRememberAmpersandLocation('\0');
               returnState = state;
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_CONSUME_CHARACTER_REFERENCE, reconsume, pos);
@@ -464,12 +477,14 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
           c = checkChar(buf, pos);
           if (c >= 'A' && c <= 'Z') {
             endTag = false;
-            clearStrBufAndAppend((char16_t) (c + 0x20));
+            clearStrBufBeforeUse();
+            appendStrBuf((char16_t) (c + 0x20));
             state = P::transition(mViewSource, NS_HTML5TOKENIZER_TAG_NAME, reconsume, pos);
             NS_HTML5_BREAK(tagopenloop);
           } else if (c >= 'a' && c <= 'z') {
             endTag = false;
-            clearStrBufAndAppend(c);
+            clearStrBufBeforeUse();
+            appendStrBuf(c);
             state = P::transition(mViewSource, NS_HTML5TOKENIZER_TAG_NAME, reconsume, pos);
             NS_HTML5_BREAK(tagopenloop);
           }
@@ -490,7 +505,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               if (P::reportErrors) {
                 errProcessingInstruction();
               }
-              clearStrBufAndAppend(c);
+              clearStrBufBeforeUse();
+              appendStrBuf(c);
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_BOGUS_COMMENT, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -617,7 +633,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
                 c += 0x20;
               }
               attributeLine = line;
-              clearStrBufAndAppend(c);
+              clearStrBufBeforeUse();
+              appendStrBuf(c);
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_NAME, reconsume, pos);
               NS_HTML5_BREAK(beforeattributenameloop);
             }
@@ -710,13 +727,13 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
             }
             case '\"': {
               attributeLine = line;
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
               NS_HTML5_BREAK(beforeattributevalueloop);
             }
             case '&': {
               attributeLine = line;
-              clearStrBuf();
+              clearStrBufBeforeUse();
               reconsume = true;
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
 
@@ -724,7 +741,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
             }
             case '\'': {
               attributeLine = line;
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -751,7 +768,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
             }
             default: {
               attributeLine = line;
-              clearStrBufAndAppend(c);
+              clearStrBufBeforeUse();
+              appendStrBuf(c);
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
 
               NS_HTML5_CONTINUE(stateloop);
@@ -777,7 +795,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               NS_HTML5_BREAK(attributevaluedoublequotedloop);
             }
             case '&': {
-              clearCharRefBufAndAppend(c);
+              MOZ_ASSERT(!charRefBufLen, "charRefBufLen not reset after previous use!");
+              appendCharRefBuf(c);
               setAdditionalAndRememberAmpersandLocation('\"');
               returnState = state;
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_CONSUME_CHARACTER_REFERENCE, reconsume, pos);
@@ -897,7 +916,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               NS_HTML5_CONTINUE(stateloop);
             }
             case '&': {
-              clearCharRefBufAndAppend(c);
+              MOZ_ASSERT(!charRefBufLen, "charRefBufLen not reset after previous use!");
+              appendCharRefBuf(c);
               setAdditionalAndRememberAmpersandLocation('>');
               returnState = state;
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_CONSUME_CHARACTER_REFERENCE, reconsume, pos);
@@ -982,7 +1002,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               if (c >= 'A' && c <= 'Z') {
                 c += 0x20;
               }
-              clearStrBufAndAppend(c);
+              clearStrBufBeforeUse();
+              appendStrBuf(c);
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_ATTRIBUTE_NAME, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -997,20 +1018,23 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
           c = checkChar(buf, pos);
           switch(c) {
             case '-': {
-              clearStrBufAndAppend(c);
+              clearStrBufBeforeUse();
+              appendStrBuf(c);
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_MARKUP_DECLARATION_HYPHEN, reconsume, pos);
               NS_HTML5_BREAK(markupdeclarationopenloop);
             }
             case 'd':
             case 'D': {
-              clearStrBufAndAppend(c);
+              clearStrBufBeforeUse();
+              appendStrBuf(c);
               index = 0;
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_MARKUP_DECLARATION_OCTYPE, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
             case '[': {
               if (tokenHandler->cdataSectionAllowed()) {
-                clearStrBufAndAppend(c);
+                clearStrBufBeforeUse();
+                appendStrBuf(c);
                 index = 0;
                 state = P::transition(mViewSource, NS_HTML5TOKENIZER_CDATA_START, reconsume, pos);
                 NS_HTML5_CONTINUE(stateloop);
@@ -1020,7 +1044,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               if (P::reportErrors) {
                 errBogusComment();
               }
-              clearStrBuf();
+              clearStrBufBeforeUse();
               reconsume = true;
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_BOGUS_COMMENT, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
@@ -1040,7 +1064,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               NS_HTML5_BREAK(stateloop);
             }
             case '-': {
-              clearStrBuf();
+              clearStrBufAfterOneHyphen();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_COMMENT_START, reconsume, pos);
               NS_HTML5_BREAK(markupdeclarationhyphenloop);
             }
@@ -1304,6 +1328,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
             index++;
             continue;
           } else {
+            clearStrBufAfterUse();
             cstart = pos;
             reconsume = true;
             state = P::transition(mViewSource, NS_HTML5TOKENIZER_CDATA_SECTION, reconsume, pos);
@@ -1411,7 +1436,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               NS_HTML5_CONTINUE(stateloop);
             }
             case '&': {
-              clearCharRefBufAndAppend(c);
+              MOZ_ASSERT(!charRefBufLen, "charRefBufLen not reset after previous use!");
+              appendCharRefBuf(c);
               setAdditionalAndRememberAmpersandLocation('\'');
               returnState = state;
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_CONSUME_CHARACTER_REFERENCE, reconsume, pos);
@@ -1643,6 +1669,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
             }
           }
           bool earlyBreak = (c == ';' && charRefBufMark == charRefBufLen);
+          charRefBufLen = 0;
           if (!(returnState & NS_HTML5TOKENIZER_DATA_AND_RCDATA_MASK)) {
             cstart = earlyBreak ? pos + 1 : pos;
           }
@@ -1736,6 +1763,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
         decimalloop_end: ;
       }
       case NS_HTML5TOKENIZER_HANDLE_NCR_VALUE: {
+        charRefBufLen = 0;
         handleNcrValue(returnState);
         state = P::transition(mViewSource, returnState, reconsume, pos);
         NS_HTML5_CONTINUE(stateloop);
@@ -1861,7 +1889,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
             if (P::reportErrors) {
               errGarbageAfterLtSlash();
             }
-            clearStrBufAndAppend('\n');
+            clearStrBufBeforeUse();
+            appendStrBuf('\n');
             state = P::transition(mViewSource, NS_HTML5TOKENIZER_BOGUS_COMMENT, reconsume, pos);
             NS_HTML5_BREAK(stateloop);
           }
@@ -1870,7 +1899,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
             if (P::reportErrors) {
               errGarbageAfterLtSlash();
             }
-            clearStrBufAndAppend('\n');
+            clearStrBufBeforeUse();
+            appendStrBuf(c);
             state = P::transition(mViewSource, NS_HTML5TOKENIZER_BOGUS_COMMENT, reconsume, pos);
             NS_HTML5_CONTINUE(stateloop);
           }
@@ -1883,14 +1913,16 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
             }
             if (c >= 'a' && c <= 'z') {
               endTag = true;
-              clearStrBufAndAppend(c);
+              clearStrBufBeforeUse();
+              appendStrBuf(c);
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_TAG_NAME, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             } else {
               if (P::reportErrors) {
                 errGarbageAfterLtSlash();
               }
-              clearStrBufAndAppend(c);
+              clearStrBufBeforeUse();
+              appendStrBuf(c);
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_BOGUS_COMMENT, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -1910,7 +1942,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
           switch(c) {
             case '&': {
               flushChars(buf, pos);
-              clearCharRefBufAndAppend(c);
+              MOZ_ASSERT(!charRefBufLen, "charRefBufLen not reset after previous use!");
+              appendCharRefBuf(c);
               setAdditionalAndRememberAmpersandLocation('\0');
               returnState = state;
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_CONSUME_CHARACTER_REFERENCE, reconsume, pos);
@@ -1984,7 +2017,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
           switch(c) {
             case '/': {
               index = 0;
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_NON_DATA_END_TAG_NAME, reconsume, pos);
               NS_HTML5_BREAK(rawtextrcdatalessthansignloop);
             }
@@ -2028,6 +2061,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
             switch(c) {
               case '\r': {
                 silentCarriageReturn();
+                clearStrBufAfterUse();
                 state = P::transition(mViewSource, NS_HTML5TOKENIZER_BEFORE_ATTRIBUTE_NAME, reconsume, pos);
                 NS_HTML5_BREAK(stateloop);
               }
@@ -2037,14 +2071,17 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               case ' ':
               case '\t':
               case '\f': {
+                clearStrBufAfterUse();
                 state = P::transition(mViewSource, NS_HTML5TOKENIZER_BEFORE_ATTRIBUTE_NAME, reconsume, pos);
                 NS_HTML5_CONTINUE(stateloop);
               }
               case '/': {
+                clearStrBufAfterUse();
                 state = P::transition(mViewSource, NS_HTML5TOKENIZER_SELF_CLOSING_START_TAG, reconsume, pos);
                 NS_HTML5_CONTINUE(stateloop);
               }
               case '>': {
+                clearStrBufAfterUse();
                 state = P::transition(mViewSource, emitCurrentTagToken(false, pos), reconsume, pos);
                 if (shouldSuspend) {
                   NS_HTML5_BREAK(stateloop);
@@ -2188,7 +2225,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
           switch(c) {
             case '/': {
               index = 0;
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_NON_DATA_END_TAG_NAME, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -2373,7 +2410,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
           switch(c) {
             case '/': {
               index = 0;
-              clearStrBuf();
+              clearStrBufBeforeUse();
               returnState = NS_HTML5TOKENIZER_SCRIPT_DATA_ESCAPED;
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_NON_DATA_END_TAG_NAME, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
@@ -2729,7 +2766,8 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               if (c >= 'A' && c <= 'Z') {
                 c += 0x20;
               }
-              clearStrBufAndAppend(c);
+              clearStrBufBeforeUse();
+              appendStrBuf(c);
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_NAME, reconsume, pos);
               NS_HTML5_BREAK(beforedoctypenameloop);
             }
@@ -2881,7 +2919,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               if (P::reportErrors) {
                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
               }
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -2889,7 +2927,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               if (P::reportErrors) {
                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
               }
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -2931,12 +2969,12 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               continue;
             }
             case '\"': {
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
               NS_HTML5_BREAK(beforedoctypepublicidentifierloop);
             }
             case '\'': {
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -3029,7 +3067,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               if (P::reportErrors) {
                 errNoSpaceBetweenPublicAndSystemIds();
               }
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -3037,7 +3075,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               if (P::reportErrors) {
                 errNoSpaceBetweenPublicAndSystemIds();
               }
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -3075,12 +3113,12 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               NS_HTML5_CONTINUE(stateloop);
             }
             case '\"': {
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
               NS_HTML5_BREAK(betweendoctypepublicandsystemidentifiersloop);
             }
             case '\'': {
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -3252,7 +3290,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               if (P::reportErrors) {
                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
               }
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -3260,7 +3298,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               if (P::reportErrors) {
                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
               }
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
@@ -3302,12 +3340,12 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
               continue;
             }
             case '\"': {
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
             }
             case '\'': {
-              clearStrBuf();
+              clearStrBufBeforeUse();
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
               NS_HTML5_BREAK(beforedoctypesystemidentifierloop);
             }
@@ -3455,6 +3493,7 @@ nsHtml5Tokenizer::stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* bu
 void 
 nsHtml5Tokenizer::initDoctypeFields()
 {
+  clearStrBufAfterUse();
   doctypeName = nsHtml5Atoms::emptystring;
   if (systemIdentifier) {
     nsHtml5Portability::releaseString(systemIdentifier);
@@ -3607,7 +3646,6 @@ nsHtml5Tokenizer::eof()
       }
       case NS_HTML5TOKENIZER_MARKUP_DECLARATION_OPEN: {
         errBogusComment();
-        clearStrBuf();
         emitComment(0, 0);
         NS_HTML5_BREAK(eofloop);
       }
@@ -3813,6 +3851,7 @@ nsHtml5Tokenizer::eof()
               tokenHandler->characters(charRefBuf, charRefBufMark, charRefBufLen - charRefBufMark);
             }
           }
+          charRefBufLen = 0;
           state = returnState;
           NS_HTML5_CONTINUE(eofloop);
         }
@@ -3935,7 +3974,7 @@ nsHtml5Tokenizer::isInDataState()
 void 
 nsHtml5Tokenizer::resetToDataState()
 {
-  strBufLen = 0;
+  clearStrBufAfterUse();
   charRefBufLen = 0;
   stateSave = NS_HTML5TOKENIZER_DATA;
   lastCR = false;
diff --git a/parser/html/nsHtml5Tokenizer.h b/parser/html/nsHtml5Tokenizer.h
index 16f70889f5ea..da509b69b439 100644
--- a/parser/html/nsHtml5Tokenizer.h
+++ b/parser/html/nsHtml5Tokenizer.h
@@ -1,25 +1,25 @@
 /*
  * Copyright (c) 2005-2007 Henri Sivonen
  * Copyright (c) 2007-2015 Mozilla Foundation
- * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 
+ * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
  * Foundation, and Opera Software ASA.
  *
- * Permission is hereby granted, free of charge, to any person obtaining a 
- * copy of this software and associated documentation files (the "Software"), 
- * to deal in the Software without restriction, including without limitation 
- * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
- * and/or sell copies of the Software, and to permit persons to whom the 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in 
+ * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
 
@@ -162,21 +162,22 @@ class nsHtml5Tokenizer
       charRefBuf[charRefBufLen++] = c;
     }
 
-    inline void clearCharRefBufAndAppend(char16_t c)
-    {
-      charRefBuf[0] = c;
-      charRefBufLen = 1;
-    }
-
     void emitOrAppendCharRefBuf(int32_t returnState);
-    inline void clearStrBufAndAppend(char16_t c)
+    inline void clearStrBufAfterUse()
     {
-      strBuf[0] = c;
-      strBufLen = 1;
+      strBufLen = 0;
     }
 
-    inline void clearStrBuf()
+    inline void clearStrBufBeforeUse()
     {
+      MOZ_ASSERT(!strBufLen, "strBufLen not reset after previous use!");
+      strBufLen = 0;
+    }
+
+    inline void clearStrBufAfterOneHyphen()
+    {
+      MOZ_ASSERT(strBufLen == 1, "strBufLen length not one!");
+      MOZ_ASSERT(strBuf[0] == '-', "strBuf does not start with a hyphen!");
       strBufLen = 0;
     }
 
@@ -211,6 +212,7 @@ class nsHtml5Tokenizer
     inline void appendCharRefBufToStrBuf()
     {
       appendStrBuf(charRefBuf, 0, charRefBufLen);
+      charRefBufLen = 0;
     }
 
     void emitComment(int32_t provisionalHyphens, int32_t pos);