From ac49929194781ebd3e82876ef5720783e5dfd0c4 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Fri, 30 Jul 2010 13:03:54 +0300 Subject: [PATCH] Bug 582788 - Treat non-ASCII-superset encoding the same way in late meta handling as in meta prescan. r=bzbarsky, a=blocking2.0-betaN. --HG-- extra : rebase_source : 53407afad2a7304d77c1faa3e43301db4fa84ff2 --- parser/html/nsHtml5MetaScannerCppSupplement.h | 34 +++++++++---------- parser/html/nsHtml5StreamParser.cpp | 29 ++++++++++++++-- .../tests/reftest/bug582788-1-ref.html | 11 ++++++ .../htmlparser/tests/reftest/bug582788-1.html | 11 ++++++ parser/htmlparser/tests/reftest/reftest.list | 2 +- 5 files changed, 67 insertions(+), 20 deletions(-) create mode 100644 parser/htmlparser/tests/reftest/bug582788-1-ref.html create mode 100644 parser/htmlparser/tests/reftest/bug582788-1.html diff --git a/parser/html/nsHtml5MetaScannerCppSupplement.h b/parser/html/nsHtml5MetaScannerCppSupplement.h index ba31349df5b1..721a233b5b42 100644 --- a/parser/html/nsHtml5MetaScannerCppSupplement.h +++ b/parser/html/nsHtml5MetaScannerCppSupplement.h @@ -76,6 +76,9 @@ nsHtml5MetaScanner::sniff(nsHtml5ByteReadable* bytes, nsIUnicodeDecoder** decode PRBool nsHtml5MetaScanner::tryCharset(nsString* charset) { + // This code needs to stay in sync with + // nsHtml5StreamParser::internalEncodingDeclaration. Unfortunately, the + // trickery with member fields here leads to some copy-paste reuse. 
:-( nsresult res = NS_OK; nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &res); if (NS_FAILED(res)) { @@ -85,12 +88,9 @@ nsHtml5MetaScanner::tryCharset(nsString* charset) nsCAutoString encoding; CopyUTF16toUTF8(*charset, encoding); // XXX spec says only UTF-16 - if (encoding.LowerCaseEqualsASCII("utf-16") || - encoding.LowerCaseEqualsASCII("utf-16be") || - encoding.LowerCaseEqualsASCII("utf-16le") || - encoding.LowerCaseEqualsASCII("utf-32") || - encoding.LowerCaseEqualsASCII("utf-32be") || - encoding.LowerCaseEqualsASCII("utf-32le")) { + if (encoding.LowerCaseEqualsLiteral("utf-16") || + encoding.LowerCaseEqualsLiteral("utf-16be") || + encoding.LowerCaseEqualsLiteral("utf-16le")) { mCharset.Assign("UTF-8"); res = convManager->GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder)); if (NS_FAILED(res)) { @@ -109,17 +109,17 @@ nsHtml5MetaScanner::tryCharset(nsString* charset) if (NS_FAILED(res)) { return PR_FALSE; } - if (preferred.LowerCaseEqualsASCII("utf-16") || - preferred.LowerCaseEqualsASCII("utf-16be") || - preferred.LowerCaseEqualsASCII("utf-16le") || - preferred.LowerCaseEqualsASCII("utf-32") || - preferred.LowerCaseEqualsASCII("utf-32be") || - preferred.LowerCaseEqualsASCII("utf-32le") || - preferred.LowerCaseEqualsASCII("utf-7") || - preferred.LowerCaseEqualsASCII("jis_x0212-1990") || - preferred.LowerCaseEqualsASCII("x-jis0208") || - preferred.LowerCaseEqualsASCII("x-imap4-modified-utf7") || - preferred.LowerCaseEqualsASCII("x-user-defined")) { + if (preferred.LowerCaseEqualsLiteral("utf-16") || + preferred.LowerCaseEqualsLiteral("utf-16be") || + preferred.LowerCaseEqualsLiteral("utf-16le") || + preferred.LowerCaseEqualsLiteral("utf-32") || + preferred.LowerCaseEqualsLiteral("utf-32be") || + preferred.LowerCaseEqualsLiteral("utf-32le") || + preferred.LowerCaseEqualsLiteral("utf-7") || + preferred.LowerCaseEqualsLiteral("jis_x0212-1990") || + preferred.LowerCaseEqualsLiteral("x-jis0208") || + 
preferred.LowerCaseEqualsLiteral("x-imap4-modified-utf7") || + preferred.LowerCaseEqualsLiteral("x-user-defined")) { return PR_FALSE; } res = convManager->GetUnicodeDecoderRaw(preferred.get(), getter_AddRefs(mUnicodeDecoder)); diff --git a/parser/html/nsHtml5StreamParser.cpp b/parser/html/nsHtml5StreamParser.cpp index f0483e4dbf8e..94831d57a070 100644 --- a/parser/html/nsHtml5StreamParser.cpp +++ b/parser/html/nsHtml5StreamParser.cpp @@ -749,6 +749,9 @@ nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest, void nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding) { + // This code needs to stay in sync with + // nsHtml5MetaScanner::tryCharset. Unfortunately, the + // trickery with member fields there leads to some copy-paste reuse. :-( NS_ASSERTION(IsParserThread(), "Wrong thread!"); if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to "confident" in the HTML5 spec return; @@ -758,14 +761,21 @@ nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding) return; // not reparsing even if we wanted to } + nsCAutoString newEncoding; + CopyUTF16toUTF8(*aEncoding, newEncoding); + // XXX spec says only UTF-16 + if (newEncoding.LowerCaseEqualsLiteral("utf-16") || + newEncoding.LowerCaseEqualsLiteral("utf-16be") || + newEncoding.LowerCaseEqualsLiteral("utf-16le")) { + newEncoding.Assign("UTF-8"); + } + nsresult rv = NS_OK; nsCOMPtr<nsICharsetAlias> calias(do_GetService(kCharsetAliasCID, &rv)); if (NS_FAILED(rv)) { NS_NOTREACHED("Charset alias service not available."); return; } - nsCAutoString newEncoding; - CopyUTF16toUTF8(*aEncoding, newEncoding); PRBool eq; rv = calias->Equals(newEncoding, mCharset, &eq); if (NS_FAILED(rv)) { @@ -787,6 +797,21 @@ nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding) return; } + if (preferred.LowerCaseEqualsLiteral("utf-16") || + preferred.LowerCaseEqualsLiteral("utf-16be") || + preferred.LowerCaseEqualsLiteral("utf-16le") || + preferred.LowerCaseEqualsLiteral("utf-32") || + 
preferred.LowerCaseEqualsLiteral("utf-32be") || + preferred.LowerCaseEqualsLiteral("utf-32le") || + preferred.LowerCaseEqualsLiteral("utf-7") || + preferred.LowerCaseEqualsLiteral("jis_x0212-1990") || + preferred.LowerCaseEqualsLiteral("x-jis0208") || + preferred.LowerCaseEqualsLiteral("x-imap4-modified-utf7") || + preferred.LowerCaseEqualsLiteral("x-user-defined")) { + // Not a rough ASCII superset + return; + } + mTreeBuilder->NeedsCharsetSwitchTo(preferred); FlushTreeOpsAndDisarmTimer(); Interrupt(); diff --git a/parser/htmlparser/tests/reftest/bug582788-1-ref.html b/parser/htmlparser/tests/reftest/bug582788-1-ref.html new file mode 100644 index 000000000000..c1f684807a6e --- /dev/null +++ b/parser/htmlparser/tests/reftest/bug582788-1-ref.html @@ -0,0 +1,11 @@ + + + + +Not ISO-10646 + + +

Not ISO-10646

+ + + diff --git a/parser/htmlparser/tests/reftest/bug582788-1.html b/parser/htmlparser/tests/reftest/bug582788-1.html new file mode 100644 index 000000000000..ee31b3de9dca --- /dev/null +++ b/parser/htmlparser/tests/reftest/bug582788-1.html @@ -0,0 +1,11 @@ + + + + +Not ISO-10646 + + +

Not ISO-10646

+ + + diff --git a/parser/htmlparser/tests/reftest/reftest.list b/parser/htmlparser/tests/reftest/reftest.list index ffced6239bb7..98d7834d9039 100644 --- a/parser/htmlparser/tests/reftest/reftest.list +++ b/parser/htmlparser/tests/reftest/reftest.list @@ -1,2 +1,2 @@ == bug566280-1.html bug566280-1-ref.html - +== bug582788-1.html bug582788-1-ref.html