From e7ba49093b3b35fa2253c5438cd2ccf5dae445b5 Mon Sep 17 00:00:00 2001
From: "tony%ponderer.org" <tony%ponderer.org>
Date: Mon, 5 Mar 2007 05:58:05 +0000
Subject: [PATCH] Bug 368998: when normalizing hostnames, we don't properly
 escape non-alphanumerics patch: move host encoding to c++ (url classifier
 utils component) r=bryner

---
 .../components/build/nsToolkitCompsModule.cpp |  2 +-
 .../content/enchash-decrypter.js              | 93 +------------------
 .../public/nsIUrlClassifierUtils.idl          | 10 +-
 .../src/nsUrlClassifierUtils.cpp              | 41 +++++++-
 .../url-classifier/src/nsUrlClassifierUtils.h | 36 +++++++
 .../tests/test_enchash-decrypter.xhtml        | 38 ++++++--
 6 files changed, 119 insertions(+), 101 deletions(-)

diff --git a/toolkit/components/build/nsToolkitCompsModule.cpp b/toolkit/components/build/nsToolkitCompsModule.cpp
index 32358a399b2a..80e473a15174 100644
--- a/toolkit/components/build/nsToolkitCompsModule.cpp
+++ b/toolkit/components/build/nsToolkitCompsModule.cpp
@@ -91,7 +91,7 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsTypeAheadFind)
 NS_GENERIC_FACTORY_SINGLETON_CONSTRUCTOR(nsUrlClassifierDBService,
                                          nsUrlClassifierDBService::GetInstance)
 NS_GENERIC_FACTORY_CONSTRUCTOR(nsUrlClassifierStreamUpdater)
-NS_GENERIC_FACTORY_CONSTRUCTOR(nsUrlClassifierUtils)
+NS_GENERIC_FACTORY_CONSTRUCTOR_INIT(nsUrlClassifierUtils, Init)
 #endif
 
 #ifdef MOZ_FEEDS
diff --git a/toolkit/components/url-classifier/content/enchash-decrypter.js b/toolkit/components/url-classifier/content/enchash-decrypter.js
index bfdda06e0b47..c5643e5299f1 100644
--- a/toolkit/components/url-classifier/content/enchash-decrypter.js
+++ b/toolkit/components/url-classifier/content/enchash-decrypter.js
@@ -53,36 +53,6 @@
 //
 // TODO: accommodate other kinds of perl-but-not-javascript qualifiers
 
-/**
- * A fast, bit-vector map for ascii characters.
- *
- * Internally stores 256 bits in an array of 8 ints.
- * Does quick bit-flicking to lookup needed characters.
- */
-
-/**
- * @param Takes 8 ints to initialize the character map
- */
-function Charmap() {
-  if (arguments.length != 8) {
-    throw G_Error("charmap ctor requires 8 int args");
-  }
-  this.map_ = [];
-  for (var i = 0; i < 8; ++i) {
-    this.map_.push(arguments[i]);
-  }
-}
-
-/**
- * Do a quick lookup to see if the letter is in the map.
- * @param chr String of length 1 (ascii)
- * @return Boolean true if the letter is in the map
- */
-Charmap.prototype.contains = function(chr) {
-  var val = chr.charCodeAt(0);
-  return !!(this.map_[val >> 5] & (1 << (val & 31)));
-}
-
 /**
  * This thing knows how to generate lookup keys and decrypt values found in
  * a table of type enchash.
@@ -94,10 +64,6 @@ function PROT_EnchashDecrypter() {
   this.base64_ = new G_Base64();
   this.streamCipher_ = Cc["@mozilla.org/security/streamcipher;1"]
                        .createInstance(Ci.nsIStreamCipher);
-  // Everything but alpha numerics, - and .
-  this.escapeCharmap_ = new Charmap(
-    0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001,
-    0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
 }
 
 PROT_EnchashDecrypter.DATABASE_SALT = "oU3q.72p";
@@ -141,38 +107,6 @@ PROT_EnchashDecrypter.prototype.lastNChars_ = function(str, n) {
   return str.substr(n);
 }
 
-/**
- * We have to have our own hex-decoder because decodeURIComponent
- * expects UTF-8 (so it will barf on invalid UTF-8 sequences).
- *
- * @param str String to decode
- * 
- * @returns The decoded string
- */
-PROT_EnchashDecrypter.prototype.hexDecode_ = function(str) {
-  var output = [];
-
-  var i = 0;
-  while (i < str.length) {
-    var c = str.charAt(i);
-  
-    if (c == "%" && i + 2 < str.length) {
-
-      var asciiVal = Number("0x" + str.charAt(i + 1) + str.charAt(i + 2));
-      
-      if (!isNaN(asciiVal)) {
-        i += 2;
-        c = String.fromCharCode(asciiVal);
-      }
-    }
-    
-    output[output.length] = c;
-    ++i;
-  }
-  
-  return output.join("");
-}
-
 /**
  * Translate a plaintext enchash value into regular expressions
  *
@@ -237,7 +171,7 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
     return "";
   }
 
-  var unescaped = this.hexDecode_(asciiHost);
+  var unescaped = unescape(asciiHost);
 
   unescaped = unescaped.replace(this.REs_.FIND_DODGY_CHARS_GLOBAL, "")
               .replace(this.REs_.FIND_END_DOTS_GLOBAL, "")
@@ -248,7 +182,9 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
     unescaped = temp;
 
   // Escape everything that's not alphanumeric, hyphen, or dot.
-  var escaped = this.escapeString_(unescaped);
+  var urlUtils = Cc["@mozilla.org/url-classifier/utils;1"]
+                 .getService(Ci.nsIUrlClassifierUtils);
+  var escaped = urlUtils.escapeHostname(unescaped);
 
   if (opt_maxDots) {
     // Limit the number of dots
@@ -272,27 +208,6 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
   return escaped;
 }
 
-/**
- * URL escapes everything except alphanumerics, - and . (dot).  Specifically,
- * escape everything in the escapeCharmap_ defined in the constructor.  This
- * is a little different than escape, encodeURIComponent, and encodeURI.
- */
-PROT_EnchashDecrypter.prototype.escapeString_ = function(unescaped) {
-  var escaped = '';
-  for (var i = 0; i < unescaped.length; ++i) {
-    if (this.escapeCharmap_.contains(unescaped[i])) {
-      var c = unescaped.charCodeAt(i).toString(16);
-      if (c.length == 1) {
-        c = '0' + c;
-      }
-      escaped += '%' + c;
-    } else {
-      escaped += unescaped[i];
-    }
-  }
-  return escaped;
-}
-
 PROT_EnchashDecrypter.prototype.parseIPAddress_ = function(host) {
   if (host.length <= 15) {
 
diff --git a/toolkit/components/url-classifier/public/nsIUrlClassifierUtils.idl b/toolkit/components/url-classifier/public/nsIUrlClassifierUtils.idl
index 0795df7911ec..2e68548c0250 100644
--- a/toolkit/components/url-classifier/public/nsIUrlClassifierUtils.idl
+++ b/toolkit/components/url-classifier/public/nsIUrlClassifierUtils.idl
@@ -39,7 +39,7 @@
  * Some utility methods used by the url classifier.
  */
 
-[scriptable, uuid(9afd3add-eadc-409f-a187-e3bf60e47290)]
+[scriptable, uuid(89ea43b0-a23f-4db2-8d23-6d90dc55f67a)]
 interface nsIUrlClassifierUtils : nsISupports
 {
   /**
@@ -54,4 +54,12 @@ interface nsIUrlClassifierUtils : nsISupports
    *          then specially url-encoded)
    */
   ACString canonicalizeURL(in ACString url);
+
+  /**
+   * When canonicalizing hostnames, the final step is to url escape everything that
+   * is not alphanumeric or hyphen or dot.  The existing methods (escape,
+   * encodeURIComponent and encodeURI) are close, but not exactly what we want,
+   * so we write our own function to do this.
+   */
+  ACString escapeHostname(in ACString hostname);
 };
diff --git a/toolkit/components/url-classifier/src/nsUrlClassifierUtils.cpp b/toolkit/components/url-classifier/src/nsUrlClassifierUtils.cpp
index 2a269abc6a79..5d4a20a26761 100644
--- a/toolkit/components/url-classifier/src/nsUrlClassifierUtils.cpp
+++ b/toolkit/components/url-classifier/src/nsUrlClassifierUtils.cpp
@@ -44,13 +44,26 @@ static char int_to_hex_digit(PRInt32 i)
   return NS_STATIC_CAST(char, ((i < 10) ? (i + '0') : ((i - 10) + 'A')));
 }
 
-
-nsUrlClassifierUtils::nsUrlClassifierUtils()
+nsUrlClassifierUtils::nsUrlClassifierUtils() : mEscapeCharmap(nsnull)
 {
 }
 
+nsresult
+nsUrlClassifierUtils::Init()
+{
+  // Everything but alpha numerics, - and .
+  mEscapeCharmap = new Charmap(0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001,
+                               0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
+  if (!mEscapeCharmap)
+    return NS_ERROR_OUT_OF_MEMORY;
+  return NS_OK;
+}
+
 NS_IMPL_ISUPPORTS1(nsUrlClassifierUtils, nsIUrlClassifierUtils)
 
+/////////////////////////////////////////////////////////////////////////////
+// nsIUrlClassifierUtils
+
 /* ACString canonicalizeURL (in ACString url); */
 NS_IMETHODIMP
 nsUrlClassifierUtils::CanonicalizeURL(const nsACString & url, nsACString & _retval)
@@ -65,6 +78,30 @@ nsUrlClassifierUtils::CanonicalizeURL(const nsACString & url, nsACString & _retv
   return NS_OK;
 }
 
+NS_IMETHODIMP
+nsUrlClassifierUtils::EscapeHostname(const nsACString & hostname,
+                                     nsACString & _retval)
+{
+  const char* curChar = hostname.BeginReading();
+  const char* end = hostname.EndReading();
+  while (curChar != end) {
+    unsigned char c = NS_STATIC_CAST(unsigned char, *curChar);
+    if (mEscapeCharmap->Contains(c)) {
+      _retval.Append('%');
+      _retval.Append(int_to_hex_digit(c / 16));
+      _retval.Append(int_to_hex_digit(c % 16));
+    } else {
+      _retval.Append(*curChar);
+    }
+    ++curChar;
+  }
+  
+  return NS_OK;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// non-interface methods
+
 // This function will encode all "special" characters in typical url
 // encoding, that is %hh where h is a valid hex digit.  See the comment in
 // the header file for details.
diff --git a/toolkit/components/url-classifier/src/nsUrlClassifierUtils.h b/toolkit/components/url-classifier/src/nsUrlClassifierUtils.h
index 9b18b0741dab..d61f3dfbe125 100644
--- a/toolkit/components/url-classifier/src/nsUrlClassifierUtils.h
+++ b/toolkit/components/url-classifier/src/nsUrlClassifierUtils.h
@@ -37,14 +37,48 @@
 #ifndef nsUrlClassifierUtils_h_
 #define nsUrlClassifierUtils_h_
 
+#include "nsAutoPtr.h"
 #include "nsIUrlClassifierUtils.h"
 
 class nsUrlClassifierUtils : public nsIUrlClassifierUtils
 {
+private:
+  /**
+   * A fast, bit-vector map for ascii characters.
+   *
+   * Internally stores 256 bits in an array of 8 ints.
+   * Does quick bit-flicking to lookup needed characters.
+   */
+  class Charmap
+  {
+  public:
+    Charmap(PRUint32 b0, PRUint32 b1, PRUint32 b2, PRUint32 b3,
+            PRUint32 b4, PRUint32 b5, PRUint32 b6, PRUint32 b7)
+    {
+      mMap[0] = b0; mMap[1] = b1; mMap[2] = b2; mMap[3] = b3;
+      mMap[4] = b4; mMap[5] = b5; mMap[6] = b6; mMap[7] = b7;
+    }
+
+    /**
+     * True if c is in the map: word c >> 5, bit c & 31 (unsigned shift).
+     */
+    PRBool Contains(unsigned char c) const
+    {
+      return (mMap[c >> 5] & (PRUint32(1) << (c & 31))) != 0;
+    }
+
+  private:
+    // Store the 256 bits in an array of eight 32-bit ints.
+    PRUint32 mMap[8];
+  };
+
+
 public:
   nsUrlClassifierUtils();
   ~nsUrlClassifierUtils() {}
 
+  nsresult Init();
+
   NS_DECL_ISUPPORTS
   NS_DECL_NSIURLCLASSIFIERUTILS
 
@@ -62,6 +96,8 @@ private:
 
   // Function to tell if we should encode a character.
   PRBool ShouldURLEscape(const unsigned char c) const;
+
+  nsAutoPtr<Charmap> mEscapeCharmap;
 };
 
 #endif // nsUrlClassifierUtils_h_
diff --git a/toolkit/components/url-classifier/tests/test_enchash-decrypter.xhtml b/toolkit/components/url-classifier/tests/test_enchash-decrypter.xhtml
index 472ef9bd63c2..dbb1dfaa355d 100644
--- a/toolkit/components/url-classifier/tests/test_enchash-decrypter.xhtml
+++ b/toolkit/components/url-classifier/tests/test_enchash-decrypter.xhtml
@@ -235,29 +235,43 @@ for (var key in testing) {
      "parseIPAddress broken on " + key + "(got: " + l.parseIPAddress_(key));
 }
 
-// Test escapeString (bug 368998)
+// Test escapeHostname (bug 368998)
 testing = {
   "asdf!@#$a": "asdf%21%40%23%24a",
   "AB CD 12354": "AB%20CD%2012354",
-  "\1\2\3\4\112\177": "%01%02%03%04J%7f",
-  "<>.AS/-+": "%3c%3e.AS%2f-%2b"
+  "\1\2\3\4\112\177": "%01%02%03%04J%7F",
+  "<>.AS/-+": "%3C%3E.AS%2F-%2B"
 };
+var urlUtils = Cc["@mozilla.org/url-classifier/utils;1"]
+               .getService(Ci.nsIUrlClassifierUtils);
 for (var key in testing) {
-  var out = l.escapeString_(key);
+  var out = urlUtils.escapeHostname(key);
   ok(out === testing[key],
      "escapeString broken on " + key + " (got: " + out + ")");
 }
 
-// escapeCharmap_ should be true for non-alphanumeric, non-hyphen, and
-// non-dot chars
+// Test a really long url (~130k).  getCanonicalHost takes about 55ms
+// on my 2.8ghz machine.
+var long_string = "x";
+for (var i = 0; i < 17; ++i) {
+  long_string += long_string;
+}
+var long_hostname_url = "http://" + long_string + "/foo";
+var startTime = Date.now();
+var out = l.getCanonicalHost(long_hostname_url);
+var endTime = Date.now();
+ok(out == long_string, "getCanonicalHost on long string (" +
+                       (endTime - startTime) + "ms)");
+
+// Verify that each character is escaped properly.
 for (var i = 0; i < 256; ++i) {
   var chr = String.fromCharCode(i);
   if ( (chr.toLowerCase() >= 'a' && chr.toLowerCase() <= 'z') ||
        (chr >= '0' && chr <= '9') ||
        '.' == chr || '-' == chr) {
-    ok(!l.escapeCharmap_.contains(chr), 'failed on ' + i);
+    ok(urlUtils.escapeHostname(chr).length == 1, 'failed on ' + i);
   } else {
-    ok(l.escapeCharmap_.contains(chr), 'failed on ' + i);
+    ok(urlUtils.escapeHostname(chr).length == 3, 'failed on ' + i);
   }
 }
 
@@ -320,6 +334,14 @@ for (var key in testing) {
      "getCanonicalUrl broken on: " + key + "(got: " + l.getCanonicalUrl(key) + ")");
 }
 
+// Test for a really long url.  This 130k url takes about 80ms
+// on my 2.8ghz machine.
+startTime = Date.now();
+out = l.getCanonicalUrl(long_hostname_url);
+endTime = Date.now();
+ok(out == long_hostname_url, "getCanonicalUrl on long string (" +
+                       (endTime - startTime) + "ms)");
+
 // Test getlookupkey
 var testing = {};
 testing["www.google.com"] = "AF5638A09FDDDAFF5B7A6013B1BE69A9";