From e7ba49093b3b35fa2253c5438cd2ccf5dae445b5 Mon Sep 17 00:00:00 2001 From: "tony%ponderer.org" Date: Mon, 5 Mar 2007 05:58:05 +0000 Subject: [PATCH] Bug 368998: when normalizing hostnames, we don't properly escape non-alphanumerics patch: move host encoding to c++ (url classifier utils component) r=bryner --- .../components/build/nsToolkitCompsModule.cpp | 2 +- .../content/enchash-decrypter.js | 93 +------------------ .../public/nsIUrlClassifierUtils.idl | 10 +- .../src/nsUrlClassifierUtils.cpp | 41 +++++++- .../url-classifier/src/nsUrlClassifierUtils.h | 36 +++++++ .../tests/test_enchash-decrypter.xhtml | 38 ++++++-- 6 files changed, 119 insertions(+), 101 deletions(-) diff --git a/toolkit/components/build/nsToolkitCompsModule.cpp b/toolkit/components/build/nsToolkitCompsModule.cpp index 32358a399b2a..80e473a15174 100644 --- a/toolkit/components/build/nsToolkitCompsModule.cpp +++ b/toolkit/components/build/nsToolkitCompsModule.cpp @@ -91,7 +91,7 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsTypeAheadFind) NS_GENERIC_FACTORY_SINGLETON_CONSTRUCTOR(nsUrlClassifierDBService, nsUrlClassifierDBService::GetInstance) NS_GENERIC_FACTORY_CONSTRUCTOR(nsUrlClassifierStreamUpdater) -NS_GENERIC_FACTORY_CONSTRUCTOR(nsUrlClassifierUtils) +NS_GENERIC_FACTORY_CONSTRUCTOR_INIT(nsUrlClassifierUtils, Init) #endif #ifdef MOZ_FEEDS diff --git a/toolkit/components/url-classifier/content/enchash-decrypter.js b/toolkit/components/url-classifier/content/enchash-decrypter.js index bfdda06e0b47..c5643e5299f1 100644 --- a/toolkit/components/url-classifier/content/enchash-decrypter.js +++ b/toolkit/components/url-classifier/content/enchash-decrypter.js @@ -53,36 +53,6 @@ // // TODO: accommodate other kinds of perl-but-not-javascript qualifiers -/** - * A fast, bit-vector map for ascii characters. - * - * Internally stores 256 bits in an array of 8 ints. - * Does quick bit-flicking to lookup needed characters. 
- */ - -/** - * @param Takes 8 ints to initialize the character map - */ -function Charmap() { - if (arguments.length != 8) { - throw G_Error("charmap ctor requires 8 int args"); - } - this.map_ = []; - for (var i = 0; i < 8; ++i) { - this.map_.push(arguments[i]); - } -} - -/** - * Do a quick lookup to see if the letter is in the map. - * @param chr String of length 1 (ascii) - * @return Boolean true if the letter is in the map - */ -Charmap.prototype.contains = function(chr) { - var val = chr.charCodeAt(0); - return !!(this.map_[val >> 5] & (1 << (val & 31))); -} - /** * This thing knows how to generate lookup keys and decrypt values found in * a table of type enchash. @@ -94,10 +64,6 @@ function PROT_EnchashDecrypter() { this.base64_ = new G_Base64(); this.streamCipher_ = Cc["@mozilla.org/security/streamcipher;1"] .createInstance(Ci.nsIStreamCipher); - // Everything but alpha numerics, - and . - this.escapeCharmap_ = new Charmap( - 0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001, - 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); } PROT_EnchashDecrypter.DATABASE_SALT = "oU3q.72p"; @@ -141,38 +107,6 @@ PROT_EnchashDecrypter.prototype.lastNChars_ = function(str, n) { return str.substr(n); } -/** - * We have to have our own hex-decoder because decodeURIComponent - * expects UTF-8 (so it will barf on invalid UTF-8 sequences). 
- * - * @param str String to decode - * - * @returns The decoded string - */ -PROT_EnchashDecrypter.prototype.hexDecode_ = function(str) { - var output = []; - - var i = 0; - while (i < str.length) { - var c = str.charAt(i); - - if (c == "%" && i + 2 < str.length) { - - var asciiVal = Number("0x" + str.charAt(i + 1) + str.charAt(i + 2)); - - if (!isNaN(asciiVal)) { - i += 2; - c = String.fromCharCode(asciiVal); - } - } - - output[output.length] = c; - ++i; - } - - return output.join(""); -} - /** * Translate a plaintext enchash value into regular expressions * @@ -237,7 +171,7 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) { return ""; } - var unescaped = this.hexDecode_(asciiHost); + var unescaped = unescape(asciiHost); unescaped = unescaped.replace(this.REs_.FIND_DODGY_CHARS_GLOBAL, "") .replace(this.REs_.FIND_END_DOTS_GLOBAL, "") @@ -248,7 +182,9 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) { unescaped = temp; // Escape everything that's not alphanumeric, hyphen, or dot. - var escaped = this.escapeString_(unescaped); + var urlUtils = Cc["@mozilla.org/url-classifier/utils;1"] + .getService(Ci.nsIUrlClassifierUtils); + var escaped = urlUtils.escapeHostname(unescaped); if (opt_maxDots) { // Limit the number of dots @@ -272,27 +208,6 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) { return escaped; } -/** - * URL escapes everything except alphanumerics, - and . (dot). Specifically, - * escape everything in the escapeCharmap_ defined in the constructor. This - * is a little different than escape, encodeURIComponent, and encodeURI. 
- */ -PROT_EnchashDecrypter.prototype.escapeString_ = function(unescaped) { - var escaped = ''; - for (var i = 0; i < unescaped.length; ++i) { - if (this.escapeCharmap_.contains(unescaped[i])) { - var c = unescaped.charCodeAt(i).toString(16); - if (c.length == 1) { - c = '0' + c; - } - escaped += '%' + c; - } else { - escaped += unescaped[i]; - } - } - return escaped; -} - PROT_EnchashDecrypter.prototype.parseIPAddress_ = function(host) { if (host.length <= 15) { diff --git a/toolkit/components/url-classifier/public/nsIUrlClassifierUtils.idl b/toolkit/components/url-classifier/public/nsIUrlClassifierUtils.idl index 0795df7911ec..2e68548c0250 100644 --- a/toolkit/components/url-classifier/public/nsIUrlClassifierUtils.idl +++ b/toolkit/components/url-classifier/public/nsIUrlClassifierUtils.idl @@ -39,7 +39,7 @@ * Some utility methods used by the url classifier. */ -[scriptable, uuid(9afd3add-eadc-409f-a187-e3bf60e47290)] +[scriptable, uuid(89ea43b0-a23f-4db2-8d23-6d90dc55f67a)] interface nsIUrlClassifierUtils : nsISupports { /** @@ -54,4 +54,12 @@ interface nsIUrlClassifierUtils : nsISupports * then specially url-encoded) */ ACString canonicalizeURL(in ACString url); + + /** + * When canonicalizing hostnames, the final step is to url escape everything that + * is not alphanumeric or hyphen or dot. The existing methods (escape, + * encodeURIComponent and encodeURI) are close, but not exactly what we want + * so we write our own function to do this. + */ + ACString escapeHostname(in ACString hostname); }; diff --git a/toolkit/components/url-classifier/src/nsUrlClassifierUtils.cpp b/toolkit/components/url-classifier/src/nsUrlClassifierUtils.cpp index 2a269abc6a79..5d4a20a26761 100644 --- a/toolkit/components/url-classifier/src/nsUrlClassifierUtils.cpp +++ b/toolkit/components/url-classifier/src/nsUrlClassifierUtils.cpp @@ -44,13 +44,26 @@ static char int_to_hex_digit(PRInt32 i) return NS_STATIC_CAST(char, ((i < 10) ? 
(i + '0') : ((i - 10) + 'A'))); } - -nsUrlClassifierUtils::nsUrlClassifierUtils() +nsUrlClassifierUtils::nsUrlClassifierUtils() : mEscapeCharmap(nsnull) { } +nsresult +nsUrlClassifierUtils::Init() +{ + // Everything but alpha numerics, - and . + mEscapeCharmap = new Charmap(0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); + if (!mEscapeCharmap) + return NS_ERROR_OUT_OF_MEMORY; + return NS_OK; +} + NS_IMPL_ISUPPORTS1(nsUrlClassifierUtils, nsIUrlClassifierUtils) +///////////////////////////////////////////////////////////////////////////// +// nsIUrlClassifierUtils + /* ACString canonicalizeURL (in ACString url); */ NS_IMETHODIMP nsUrlClassifierUtils::CanonicalizeURL(const nsACString & url, nsACString & _retval) @@ -65,6 +78,30 @@ nsUrlClassifierUtils::CanonicalizeURL(const nsACString & url, nsACString & _retv return NS_OK; } +NS_IMETHODIMP +nsUrlClassifierUtils::EscapeHostname(const nsACString & hostname, + nsACString & _retval) +{ + const char* curChar = hostname.BeginReading(); + const char* end = hostname.EndReading(); + while (curChar != end) { + unsigned char c = NS_STATIC_CAST(unsigned char, *curChar); + if (mEscapeCharmap->Contains(c)) { + _retval.Append('%'); + _retval.Append(int_to_hex_digit(c / 16)); + _retval.Append(int_to_hex_digit(c % 16)); + } else { + _retval.Append(*curChar); + } + ++curChar; + } + + return NS_OK; +} + +///////////////////////////////////////////////////////////////////////////// +// non-interface methods + // This function will encode all "special" characters in typical url // encoding, that is %hh where h is a valid hex digit. See the comment in // the header file for details. 
diff --git a/toolkit/components/url-classifier/src/nsUrlClassifierUtils.h b/toolkit/components/url-classifier/src/nsUrlClassifierUtils.h index 9b18b0741dab..d61f3dfbe125 100644 --- a/toolkit/components/url-classifier/src/nsUrlClassifierUtils.h +++ b/toolkit/components/url-classifier/src/nsUrlClassifierUtils.h @@ -37,14 +37,48 @@ #ifndef nsUrlClassifierUtils_h_ #define nsUrlClassifierUtils_h_ +#include "nsAutoPtr.h" #include "nsIUrlClassifierUtils.h" class nsUrlClassifierUtils : public nsIUrlClassifierUtils { +private: + /** + * A fast, bit-vector map for ascii characters. + * + * Internally stores 256 bits in an array of 8 ints. + * Does quick bit-flicking to lookup needed characters. + */ + class Charmap + { + public: + Charmap(PRUint32 b0, PRUint32 b1, PRUint32 b2, PRUint32 b3, + PRUint32 b4, PRUint32 b5, PRUint32 b6, PRUint32 b7) + { + mMap[0] = b0; mMap[1] = b1; mMap[2] = b2; mMap[3] = b3; + mMap[4] = b4; mMap[5] = b5; mMap[6] = b6; mMap[7] = b7; + } + + /** + * Do a quick lookup to see if the letter is in the map. + */ + PRBool Contains(unsigned char c) const + { + return mMap[c >> 5] & (1 << (c & 31)); + } + + private: + // Store the 256 bits in an array of eight 32-bit ints. + PRUint32 mMap[8]; + }; + + public: nsUrlClassifierUtils(); ~nsUrlClassifierUtils() {} + nsresult Init(); + NS_DECL_ISUPPORTS NS_DECL_NSIURLCLASSIFIERUTILS @@ -62,6 +96,8 @@ private: // Function to tell if we should encode a character. 
PRBool ShouldURLEscape(const unsigned char c) const; + + nsAutoPtr mEscapeCharmap; }; #endif // nsUrlClassifierUtils_h_ diff --git a/toolkit/components/url-classifier/tests/test_enchash-decrypter.xhtml b/toolkit/components/url-classifier/tests/test_enchash-decrypter.xhtml index 472ef9bd63c2..dbb1dfaa355d 100644 --- a/toolkit/components/url-classifier/tests/test_enchash-decrypter.xhtml +++ b/toolkit/components/url-classifier/tests/test_enchash-decrypter.xhtml @@ -235,29 +235,43 @@ for (var key in testing) { "parseIPAddress broken on " + key + "(got: " + l.parseIPAddress_(key)); } -// Test escapeString (bug 368998) +// Test escapeHostname (bug 368998) testing = { "asdf!@#$a": "asdf%21%40%23%24a", "AB CD 12354": "AB%20CD%2012354", - "\1\2\3\4\112\177": "%01%02%03%04J%7f", - "<>.AS/-+": "%3c%3e.AS%2f-%2b" + "\1\2\3\4\112\177": "%01%02%03%04J%7F", + "<>.AS/-+": "%3C%3E.AS%2F-%2B" }; +var urlUtils = Cc["@mozilla.org/url-classifier/utils;1"] + .getService(Ci.nsIUrlClassifierUtils); for (var key in testing) { - var out = l.escapeString_(key); + var out = urlUtils.escapeHostname(key); ok(out === testing[key], "escapeString broken on " + key + " (got: " + out + ")"); } -// escapeCharmap_ should be true for non-alphanumeric, non-hyphen, and -// non-dot chars +// Test a really long url (~130k). getCanonicalHost takes about 55ms +// on my 2.8ghz machine. +var long_string = "x"; +for (var i = 0; i < 17; ++i) { + long_string += long_string; +} +var long_hostname_url = "http://" + long_string + "/foo"; +var startTime = Date.now(); +var out = l.getCanonicalHost(long_hostname_url); +var endTime = Date.now(); +ok(out == long_string, "getCanonicalHost on long string (" + + (endTime - startTime) + "ms)"); + +// Verify that each character is escaped properly. for (var i = 0; i < 256; ++i) { var chr = String.fromCharCode(i); if ( (chr.toLowerCase() >= 'a' && chr.toLowerCase() <= 'z') || (chr >= '0' && chr <= '9') || '.' 
== chr || '-' == chr) { - ok(!l.escapeCharmap_.contains(chr), 'failed on ' + i); + ok(urlUtils.escapeHostname(chr).length == 1, 'failed on ' + i); } else { - ok(l.escapeCharmap_.contains(chr), 'failed on ' + i); + ok(urlUtils.escapeHostname(chr).length == 3, 'failed on ' + i); } } @@ -320,6 +334,14 @@ for (var key in testing) { "getCanonicalUrl broken on: " + key + "(got: " + l.getCanonicalUrl(key) + ")"); } +// Test for a really long url. This 130k url takes about 80ms +// on my 2.8ghz machine. +startTime = Date.now(); +out = l.getCanonicalUrl(long_hostname_url); +endTime = Date.now(); +ok(out == long_hostname_url, "getCanonicalUrl on long string (" + + (endTime - startTime) + "ms)"); + // Test getlookupkey var testing = {}; testing["www.google.com"] = "AF5638A09FDDDAFF5B7A6013B1BE69A9";