diff --git a/netwerk/streamconv/converters/nsUnknownDecoder.cpp b/netwerk/streamconv/converters/nsUnknownDecoder.cpp index 73358629764f..e8885aa68d04 100644 --- a/netwerk/streamconv/converters/nsUnknownDecoder.cpp +++ b/netwerk/streamconv/converters/nsUnknownDecoder.cpp @@ -48,6 +48,8 @@ #include "nsIPref.h" #include "imgILoader.h" +#include "nsCRT.h" + #include "nsIMIMEService.h" #include "nsIViewSourceChannel.h" @@ -349,53 +351,10 @@ void nsUnknownDecoder::DetermineContentType(nsIRequest* aRequest) return; } - /* - * To prevent a possible attack, we will not consider this to be - * html content if it comes from the local file system and our prefs - * are set right - */ - if (AllowSniffing(aRequest)) { - // Now look for HTML - CBufDescriptor bufDesc((const char*)mBuffer, PR_TRUE, mBufferLen, mBufferLen); - nsCAutoString str(bufDesc); - - PRInt32 offset; - - offset = str.Find("= 0) { - mContentType = TEXT_HTML; - return; - } + if (SniffForHTML(aRequest)) { + return; } - + // We don't know what this is yet. Before we just give up, try // the URI from the request. if (SniffURI(aRequest)) { @@ -419,6 +378,73 @@ PRBool nsUnknownDecoder::SniffForImageMimeType(nsIRequest* aRequest) return PR_TRUE; } +PRBool nsUnknownDecoder::SniffForHTML(nsIRequest* aRequest) +{ + /* + * To prevent a possible attack, we will not consider this to be + * html content if it comes from the local file system and our prefs + * are set right + */ + if (!AllowSniffing(aRequest)) { + return PR_FALSE; + } + + // Now look for HTML. First, we get us a nice nsCAutoString + // containing our data in a readonly-ish manner... 
+ const CBufDescriptor bufDesc((const char*)mBuffer, PR_TRUE, mBufferLen, mBufferLen); + const nsCAutoString str(bufDesc); + + nsCAutoString::const_iterator start, end; + str.BeginReading(start); + str.EndReading(end); + PRUint32 pos = 0; // offset of |start| within |str|; kept in step with the iterator for the Substring() calls below + + // skip leading ASCII whitespace + while (start != end && nsCRT::IsAsciiSpace(*start)) { + ++start; + ++pos; + } + + // did we find something like a start tag? Require '<' as the first non-whitespace char with at least one char after it; note that the last test also advances |start| past the '<' + if (start == end || *start != '<' || ++start == end) { + return PR_FALSE; + } + + // advance pos past the '<' as well, to keep it in sync with |start| + ++pos; + + // If we seem to be SGML or XML (next char is '!' or '?') and we got down here, just pretend we're HTML + if (*start == '!' || *start == '?') { + mContentType = TEXT_HTML; + return PR_TRUE; + } + + nsCaseInsensitiveCStringComparator comparator; + +#define MATCHES_TAG(_tagstr) \ + Substring(str, pos, sizeof(_tagstr) - 1).Equals(_tagstr, comparator) + + if (MATCHES_TAG("html") || + MATCHES_TAG("frameset") || + MATCHES_TAG("body") || + MATCHES_TAG("script") || + MATCHES_TAG("a href") || + MATCHES_TAG("img") || + MATCHES_TAG("table") || + MATCHES_TAG("title") || + MATCHES_TAG("div") || + MATCHES_TAG("applet") || + MATCHES_TAG("meta")) { + + mContentType = TEXT_HTML; + return PR_TRUE; + } + +#undef MATCHES_TAG + + return PR_FALSE; +} + PRBool nsUnknownDecoder::SniffForXML(nsIRequest* aRequest) { // Just like HTML, this should be able to be shut off. diff --git a/netwerk/streamconv/converters/nsUnknownDecoder.h b/netwerk/streamconv/converters/nsUnknownDecoder.h index 7a33b4796379..c97352038f92 100644 --- a/netwerk/streamconv/converters/nsUnknownDecoder.h +++ b/netwerk/streamconv/converters/nsUnknownDecoder.h @@ -89,8 +89,17 @@ protected: // Various sniffer functions. Returning PR_TRUE means that a type // was determined; PR_FALSE means no luck. 
PRBool SniffForImageMimeType(nsIRequest* aRequest); + PRBool SniffForHTML(nsIRequest* aRequest); PRBool SniffForXML(nsIRequest* aRequest); + + // SniffURI guesses at the content type based on the URI (typically + // using the extension) PRBool SniffURI(nsIRequest* aRequest); + + // LastDitchSniff guesses at text/plain vs. application/octet-stream + // by just looking at whether the data contains null bytes, and + // maybe at the fraction of chars with high bit set. Use this only + // as a last-ditch attempt to decide a content type! PRBool LastDitchSniff(nsIRequest* aRequest); /**