prettify HTML-detection a bit; make it a little less likely to detect HTML.

Bug 144672, r=bbaetz, sr=darin
2024-10-10 11:55:49 +00:00 · 2003-04-04 22:46:50 +00:00 · 2003-04-04 22:46:50 +00:00 · 327a0205c8
commit 327a0205c8
parent 36a26eca9c
2 changed files with 81 additions and 46 deletions
--- a/netwerk/streamconv/converters/nsUnknownDecoder.cpp
+++ b/netwerk/streamconv/converters/nsUnknownDecoder.cpp
@ -48,6 +48,8 @@
 #include "nsIPref.h"
 #include "imgILoader.h"

+#include "nsCRT.h"
+
 #include "nsIMIMEService.h"

 #include "nsIViewSourceChannel.h"
@ -349,53 +351,10 @@ void nsUnknownDecoder::DetermineContentType(nsIRequest* aRequest)
    return;
  }

-  /*
-   * To prevent a possible attack, we will not consider this to be
-   * html content if it comes from the local file system and our prefs
-   * are set right
-   */
-  if (AllowSniffing(aRequest)) {
-    // Now look for HTML
-    CBufDescriptor bufDesc((const char*)mBuffer, PR_TRUE, mBufferLen, mBufferLen);
-    nsCAutoString str(bufDesc);
-
-    PRInt32 offset;
-
-    offset = str.Find("<HTML", PR_TRUE);
-    if (offset < 0) {
-      offset = str.Find("<TITLE", PR_TRUE);
-      if (offset < 0) {
-        offset = str.Find("<FRAMESET", PR_TRUE);
-        if (offset < 0) {
-          offset = str.Find("<SCRIPT", PR_TRUE);
-          if (offset < 0) {
-            offset = str.Find("<BODY", PR_TRUE);
-            if (offset < 0) {
-              offset = str.Find("<TABLE", PR_TRUE);
-              if (offset < 0) {
-                offset = str.Find("<DIV", PR_TRUE);
-                if (offset < 0) {
-                  offset = str.Find("<A HREF", PR_TRUE);
-                  if (offset < 0) {
-                    offset = str.Find("<APPLET", PR_TRUE);
-                    if (offset < 0) {
-                      offset = str.Find("<META", PR_TRUE);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    if (offset >= 0) {
-      mContentType = TEXT_HTML;
-      return;
-    }
+  if (SniffForHTML(aRequest)) {
+    return;
  }
-
+  
  // We don't know what this is yet.  Before we just give up, try
  // the URI from the request.
  if (SniffURI(aRequest)) {
@ -419,6 +378,73 @@ PRBool nsUnknownDecoder::SniffForImageMimeType(nsIRequest* aRequest)
  return PR_TRUE;
 }

+PRBool nsUnknownDecoder::SniffForHTML(nsIRequest* aRequest)
+{
+  /*
+   * To prevent a possible attack, we will not consider this to be
+   * html content if it comes from the local file system and our prefs
+   * are set right
+   */
+  if (!AllowSniffing(aRequest)) {
+    return PR_FALSE;
+  }
+  
+  // Now look for HTML.  First, we get us a nice nsCAutoString
+  // containing our data in a readonly-ish manner...
+  const CBufDescriptor bufDesc((const char*)mBuffer, PR_TRUE, mBufferLen, mBufferLen);
+  const nsCAutoString str(bufDesc);
+
+  nsCAutoString::const_iterator start, end;
+  str.BeginReading(start);
+  str.EndReading(end);
+  PRUint32 pos = 0; // for Substring ease
+
+  // skip leading whitespace
+  while (start != end && nsCRT::IsAsciiSpace(*start)) {
+    ++start;
+    ++pos;
+  }
+
+  // did we find something like a start tag?
+  if (start == end || *start != '<' || ++start == end) {
+    return PR_FALSE;
+  }
+
+  // advance pos to keep it in sync with |start|
+  ++pos;
+
+  // If we seem to be SGML or XML and we got down here, just pretend we're HTML
+  if (*start == '!' || *start == '?') {
+    mContentType = TEXT_HTML;
+    return PR_TRUE;
+  }
+
+  nsCaseInsensitiveCStringComparator comparator;
+
+#define MATCHES_TAG(_tagstr) \
+  Substring(str, pos, sizeof(_tagstr) - 1).Equals(_tagstr, comparator)
+  
+  if (MATCHES_TAG("html")     ||
+      MATCHES_TAG("frameset") ||
+      MATCHES_TAG("body")     ||
+      MATCHES_TAG("script")   ||
+      MATCHES_TAG("a href")   ||
+      MATCHES_TAG("img")      ||
+      MATCHES_TAG("table")    ||
+      MATCHES_TAG("title")    ||
+      MATCHES_TAG("div")      ||
+      MATCHES_TAG("applet")   ||
+      MATCHES_TAG("meta")) {
+  
+    mContentType = TEXT_HTML;
+    return PR_TRUE;
+  }
+
+#undef MATCHES_TAG
+  
+  return PR_FALSE;
+}
+
 PRBool nsUnknownDecoder::SniffForXML(nsIRequest* aRequest)
 {
  // Just like HTML, this should be able to be shut off.
--- a/netwerk/streamconv/converters/nsUnknownDecoder.h
+++ b/netwerk/streamconv/converters/nsUnknownDecoder.h
@ -89,8 +89,17 @@ protected:
  // Various sniffer functions.  Returning PR_TRUE means that a type
  // was determined; PR_FALSE means no luck.
  PRBool SniffForImageMimeType(nsIRequest* aRequest);
+  PRBool SniffForHTML(nsIRequest* aRequest);
  PRBool SniffForXML(nsIRequest* aRequest);
+
+  // SniffURI guesses at the content type based on the URI (typically
+  // using the extension)
  PRBool SniffURI(nsIRequest* aRequest);
+
+  // LastDitchSniff guesses at text/plain vs. application/octet-stream
+  // by just looking at whether the data contains null bytes, and
+  // maybe at the fraction of chars with high bit set.  Use this only
+  // as a last-ditch attempt to decide a content type!
  PRBool LastDitchSniff(nsIRequest* aRequest);

  /**