Bug 551344 part 3 - Address sicking's review comments in nsHtml5StreamParser.cpp. r=jonas.

2025-02-22 18:32:00 +00:00 · 2010-04-09 17:11:56 +03:00 · 2010-04-09 17:11:56 +03:00 · 2c9733fb25
commit 2c9733fb25
parent b1b0c735df
1 changed files with 29 additions and 17 deletions
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@ -245,7 +245,7 @@ nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const
  NS_ENSURE_SUCCESS(rv, rv);
  rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
  if (rv == NS_ERROR_UCONV_NOCONV) {
-    mCharset.Assign("windows-1252"); // lower case is the raw form
+    mCharset.AssignLiteral("windows-1252"); // lower case is the raw form
    mCharsetSource = kCharsetFromWeakDocTypeDefault;
    rv = convManager->GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
    mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
@ -284,6 +284,7 @@ nsHtml5StreamParser::SetupDecodingFromBom(const char* aCharsetName, const char*
  NS_ENSURE_SUCCESS(rv, rv);
  rv = convManager->GetUnicodeDecoderRaw(aDecoderCharsetName, getter_AddRefs(mUnicodeDecoder));
  NS_ENSURE_SUCCESS(rv, rv);
+  mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
  mCharset.Assign(aCharsetName);
  mCharsetSource = kCharsetFromByteOrderMark;
  mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
@ -322,7 +323,7 @@ nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be nul
  }
  if (mCharsetSource == kCharsetUninitialized) {
    // Hopefully this case is never needed, but dealing with it anyway
-    mCharset.Assign("windows-1252");
+    mCharset.AssignLiteral("windows-1252");
    mCharsetSource = kCharsetFromWeakDocTypeDefault;
    mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
  }
@ -337,7 +338,7 @@ nsHtml5StreamParser::SniffStreamBytes(const PRUint8* aFromSegment,
  NS_ASSERTION(IsParserThread(), "Wrong thread!");
  nsresult rv = NS_OK;
  PRUint32 writeCount;
-  for (PRUint32 i = 0; i < aCount; i++) {
+  for (PRUint32 i = 0; i < aCount && mBomState != BOM_SNIFFING_OVER; i++) {
    switch (mBomState) {
      case BOM_SNIFFING_NOT_STARTED:
        NS_ASSERTION(i == 0, "Bad BOM sniffing state.");
@ -400,11 +401,11 @@ nsHtml5StreamParser::SniffStreamBytes(const PRUint8* aFromSegment,
        mBomState = BOM_SNIFFING_OVER;
        break;
      default:
-        goto bom_loop_end;
+        mBomState = BOM_SNIFFING_OVER;
+        break;
    }
  }
  // if we get here, there either was no BOM or the BOM sniffing isn't complete yet
-  bom_loop_end:
  
  if (!mMetaScanner) {
    mMetaScanner = new nsHtml5MetaScanner();
@ -431,6 +432,7 @@ nsHtml5StreamParser::SniffStreamBytes(const PRUint8* aFromSegment,
  mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
  if (mUnicodeDecoder) {
    // meta scan successful
+    mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
    mCharsetSource = kCharsetFromMetaPrescan;
    mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
    mMetaScanner = nsnull;
@ -473,23 +475,33 @@ nsHtml5StreamParser::WriteStreamBytes(const PRUint8* aFromSegment,
    NS_ASSERTION(mLastBuffer->getEnd() <= NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE, "The Unicode decoder wrote too much data.");

    if (NS_FAILED(convResult)) {
-      if (totalByteCount < aCount) { // mimicking nsScanner even though this seems wrong
+      // There's an illegal byte in the input. It's now the responsibility
+      // of this calling code to output a U+FFFD REPLACEMENT CHARACTER and
+      // reset the decoder.
+
+      NS_ASSERTION(totalByteCount < aCount,
+                   "The decoder signaled an error but consumed all input.");
+      if (totalByteCount < aCount) {
+        // advance over the bad byte
        ++totalByteCount;
        ++aFromSegment;
      }
+
+      // Emit the REPLACEMENT CHARACTER
      mLastBuffer->getBuffer()[end] = 0xFFFD;
      ++end;
      mLastBuffer->setEnd(end);
      if (end == NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE) {
-          mLastBuffer = (mLastBuffer->next = new nsHtml5UTF16Buffer(NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE));
+          mLastBuffer = mLastBuffer->next = new nsHtml5UTF16Buffer(NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
      }
+
      mUnicodeDecoder->Reset();
      if (totalByteCount == aCount) {
        *aWriteCount = totalByteCount;
        return NS_OK;
      }
    } else if (convResult == NS_PARTIAL_MORE_OUTPUT) {
-      mLastBuffer = (mLastBuffer->next = new nsHtml5UTF16Buffer(NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE));
+      mLastBuffer = mLastBuffer->next = new nsHtml5UTF16Buffer(NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
      NS_ASSERTION(totalByteCount < aCount, "The Unicode decoder has consumed too many bytes.");
    } else {
      NS_ASSERTION(totalByteCount == aCount, "The Unicode decoder consumed the wrong number of bytes.");
@ -702,7 +714,6 @@ nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
    return;
  }

-  // The encodings are different.
  if (mReparseForbidden) {
    return; // not reparsing even if we wanted to
  }
@ -736,7 +747,6 @@ nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
    return;
  }
  
-  // we still want to reparse
  mTreeBuilder->NeedsCharsetSwitchTo(preferred);
  mTreeBuilder->Flush();
  Interrupt();
@ -779,7 +789,7 @@ nsHtml5StreamParser::ParseAvailableData()
            return; // no more data for now but expecting more
          case STREAM_ENDED:
            if (mAtEOF) {
-                return;
+              return;
            }
            mAtEOF = PR_TRUE;
            mTokenizer->eof();
@ -793,10 +803,9 @@ nsHtml5StreamParser::ParseAvailableData()
            NS_NOTREACHED("It should be impossible to reach this.");
            return;
        }
-      } else {
-        mFirstBuffer = mFirstBuffer->next;
-        continue;
      }
+      mFirstBuffer = mFirstBuffer->next;
+      continue;
    }

    // now we have a non-empty buffer
@ -864,6 +873,7 @@ nsHtml5StreamParser::ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
    if (mSpeculations.IsEmpty()) {
      // Not quite sure how exactly this happens...
      // Maybe an artifact of defer scripts?
+      NS_WARNING("ContinueAfterScripts called without speculations.");
      return;
    }
    nsHtml5Speculation* speculation = mSpeculations.ElementAt(0);
@ -901,9 +911,11 @@ nsHtml5StreamParser::ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
  {
    mozilla::MutexAutoLock tokenizerAutoLock(mTokenizerMutex);
    #ifdef DEBUG
+    {
      nsCOMPtr<nsIThread> mainThread;
      NS_GetMainThread(getter_AddRefs(mainThread));
      mAtomTable.SetPermittedLookupThread(mainThread);
+    }
    #endif
    // In principle, the speculation mutex should be acquired here,
    // but there's no point, because the parser thread only acquires it
@ -931,7 +943,7 @@ nsHtml5StreamParser::ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
      mTreeBuilder->SetOpSink(mExecutor->GetStage());
      mExecutor->StartReadingFromStage();
      mSpeculating = PR_FALSE;
-      mFlushTimer->Cancel(); // just in case
+      mFlushTimer->Cancel();
      mFlushTimer->InitWithFuncCallback(nsHtml5StreamParser::TimerCallback, 
                                        static_cast<void*> (this), 
                                        sTimerContinueDelay, 
@ -956,7 +968,7 @@ nsHtml5StreamParser::ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
        mTreeBuilder->SetOpSink(mExecutor->GetStage());
        mExecutor->StartReadingFromStage();
        mSpeculating = PR_FALSE;
-        mFlushTimer->Cancel(); // just in case
+        mFlushTimer->Cancel();
        mFlushTimer->InitWithFuncCallback(nsHtml5StreamParser::TimerCallback, 
                                          static_cast<void*> (this), 
                                          sTimerContinueDelay, 
@ -1042,7 +1054,7 @@ nsHtml5StreamParser::PostTimerFlush()
 {
  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");

-  mFlushTimer->Cancel(); // just in case
+  mFlushTimer->Cancel();

  // The following line reads a mutex-protected variable without acquiring 
  // the mutex. This is OK, because failure to exit early here is harmless.