Bug 368998: when normalizing hostnames, we don't properly escape non-alphanumerics

patch: move host encoding to c++ (url classifier utils component)
r=bryner
This commit is contained in:
tony%ponderer.org 2007-03-05 05:58:05 +00:00
parent 1955e1a9d0
commit e7ba49093b
6 changed files with 119 additions and 101 deletions

View File

@ -91,7 +91,7 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsTypeAheadFind)
NS_GENERIC_FACTORY_SINGLETON_CONSTRUCTOR(nsUrlClassifierDBService,
nsUrlClassifierDBService::GetInstance)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUrlClassifierStreamUpdater)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUrlClassifierUtils)
NS_GENERIC_FACTORY_CONSTRUCTOR_INIT(nsUrlClassifierUtils, Init)
#endif
#ifdef MOZ_FEEDS

View File

@ -53,36 +53,6 @@
//
// TODO: accommodate other kinds of perl-but-not-javascript qualifiers
/**
* A fast, bit-vector map for ascii characters.
*
* Internally stores 256 bits in an array of 8 ints.
* Does quick bit-flicking to lookup needed characters.
*/
/**
* @param Takes 8 ints to initialize the character map
*/
function Charmap() {
  // The map is built from exactly eight words; anything else is a caller bug.
  var wordCount = 8;
  if (arguments.length != wordCount) {
    throw G_Error("charmap ctor requires 8 int args");
  }
  // Copy the words out of the arguments object into a real array.
  this.map_ = Array.prototype.slice.call(arguments, 0, wordCount);
}
/**
* Do a quick lookup to see if the letter is in the map.
* @param chr String of length 1 (ascii)
* @return Boolean true if the letter is in the map
*/
Charmap.prototype.contains = function(chr) {
  // The upper bits of the char code pick the word; the low five bits pick
  // the bit inside that word.
  var code = chr.charCodeAt(0);
  var word = this.map_[code >> 5];
  var mask = 1 << (code & 31);
  return (word & mask) != 0;
}
/**
* This thing knows how to generate lookup keys and decrypt values found in
* a table of type enchash.
@ -94,10 +64,6 @@ function PROT_EnchashDecrypter() {
this.base64_ = new G_Base64();
this.streamCipher_ = Cc["@mozilla.org/security/streamcipher;1"]
.createInstance(Ci.nsIStreamCipher);
// Everything but alpha numerics, - and .
this.escapeCharmap_ = new Charmap(
0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
}
PROT_EnchashDecrypter.DATABASE_SALT = "oU3q.72p";
@ -141,38 +107,6 @@ PROT_EnchashDecrypter.prototype.lastNChars_ = function(str, n) {
return str.substr(n);
}
/**
 * We have to have our own hex-decoder because decodeURIComponent
 * expects UTF-8 (so it will barf on invalid UTF-8 sequences).
 *
 * A "%" is only treated as an escape when it is followed by exactly two
 * hex digits; any other "%" is copied through unchanged.
 *
 * @param str String to decode
 *
 * @returns The decoded string
 */
PROT_EnchashDecrypter.prototype.hexDecode_ = function(str) {
  // Number("0x" + ...) is too lenient for validation: it trims whitespace,
  // so "%4 " would be read as 0x4 and the space would be swallowed.
  var twoHexDigits = /^[0-9a-fA-F]{2}$/;
  var output = [];
  var i = 0;
  while (i < str.length) {
    var c = str.charAt(i);
    if (c == "%" && i + 2 < str.length) {
      var digits = str.substr(i + 1, 2);
      if (twoHexDigits.test(digits)) {
        // Consume the two digits as well as the "%".
        i += 2;
        c = String.fromCharCode(parseInt(digits, 16));
      }
    }
    output.push(c);
    ++i;
  }
  return output.join("");
}
/**
* Translate a plaintext enchash value into regular expressions
*
@ -237,7 +171,7 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
return "";
}
var unescaped = this.hexDecode_(asciiHost);
var unescaped = unescape(asciiHost);
unescaped = unescaped.replace(this.REs_.FIND_DODGY_CHARS_GLOBAL, "")
.replace(this.REs_.FIND_END_DOTS_GLOBAL, "")
@ -248,7 +182,9 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
unescaped = temp;
// Escape everything that's not alphanumeric, hyphen, or dot.
var escaped = this.escapeString_(unescaped);
var urlUtils = Cc["@mozilla.org/url-classifier/utils;1"]
.getService(Ci.nsIUrlClassifierUtils);
var escaped = urlUtils.escapeHostname(unescaped);
if (opt_maxDots) {
// Limit the number of dots
@ -272,27 +208,6 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
return escaped;
}
/**
* URL escapes everything except alphanumerics, - and . (dot). Specifically,
* escape everything in the escapeCharmap_ defined in the constructor. This
* is a little different than escape, encodeURIComponent, and encodeURI.
*/
PROT_EnchashDecrypter.prototype.escapeString_ = function(unescaped) {
  var pieces = [];
  for (var pos = 0; pos < unescaped.length; ++pos) {
    var ch = unescaped[pos];
    if (!this.escapeCharmap_.contains(ch)) {
      // Not in the escape map: copy through untouched.
      pieces.push(ch);
      continue;
    }
    // In the escape map: emit "%" plus the char code as two-or-more hex
    // digits, zero-padded when the code fits in a single digit.
    var hex = unescaped.charCodeAt(pos).toString(16);
    pieces.push('%' + (hex.length == 1 ? '0' + hex : hex));
  }
  return pieces.join('');
}
PROT_EnchashDecrypter.prototype.parseIPAddress_ = function(host) {
if (host.length <= 15) {

View File

@ -39,7 +39,7 @@
* Some utility methods used by the url classifier.
*/
[scriptable, uuid(9afd3add-eadc-409f-a187-e3bf60e47290)]
[scriptable, uuid(89ea43b0-a23f-4db2-8d23-6d90dc55f67a)]
interface nsIUrlClassifierUtils : nsISupports
{
/**
@ -54,4 +54,12 @@ interface nsIUrlClassifierUtils : nsISupports
* then specially url-encoded)
*/
ACString canonicalizeURL(in ACString url);
/**
* When canonicalizing hostnames, the final step is to url escape everything that
* is not alphanumeric or hyphen or dot. The existing methods (escape,
* encodeURIComponent and encodeURI) are close, but not exactly what we want,
* so we write our own function to do this.
*/
ACString escapeHostname(in ACString hostname);
};

View File

@ -44,13 +44,26 @@ static char int_to_hex_digit(PRInt32 i)
return NS_STATIC_CAST(char, ((i < 10) ? (i + '0') : ((i - 10) + 'A')));
}
nsUrlClassifierUtils::nsUrlClassifierUtils()
nsUrlClassifierUtils::nsUrlClassifierUtils() : mEscapeCharmap(nsnull)
{
}
nsresult
nsUrlClassifierUtils::Init()
{
  // Bits are set for everything but alpha numerics, - and .
  Charmap* escapeMap =
    new Charmap(0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001,
                0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
  if (!escapeMap)
    return NS_ERROR_OUT_OF_MEMORY;

  // Store the map in the member only once allocation has succeeded.
  mEscapeCharmap = escapeMap;
  return NS_OK;
}
NS_IMPL_ISUPPORTS1(nsUrlClassifierUtils, nsIUrlClassifierUtils)
/////////////////////////////////////////////////////////////////////////////
// nsIUrlClassifierUtils
/* ACString canonicalizeURL (in ACString url); */
NS_IMETHODIMP
nsUrlClassifierUtils::CanonicalizeURL(const nsACString & url, nsACString & _retval)
@ -65,6 +78,30 @@ nsUrlClassifierUtils::CanonicalizeURL(const nsACString & url, nsACString & _retv
return NS_OK;
}
/**
 * Escapes every byte of |hostname| that is flagged in mEscapeCharmap as
 * "%XX" (two upper-case hex digits); all other bytes are copied as-is.
 *
 * @param hostname  the bytes to escape
 * @param _retval   receives the escaped string; any previous contents are
 *                  discarded
 * @return NS_OK
 */
NS_IMETHODIMP
nsUrlClassifierUtils::EscapeHostname(const nsACString & hostname,
                                     nsACString & _retval)
{
  // The result is built with Append(), so start from an empty string;
  // otherwise a caller reusing a string would get its old contents
  // prefixed to the escaped hostname.
  _retval.Truncate();

  const char* curChar = hostname.BeginReading();
  const char* end = hostname.EndReading();
  for (; curChar != end; ++curChar) {
    // Go through unsigned char so that bytes >= 0x80 index the map as
    // 128..255 even where plain char is signed.
    unsigned char c = NS_STATIC_CAST(unsigned char, *curChar);
    if (mEscapeCharmap->Contains(c)) {
      _retval.Append('%');
      _retval.Append(int_to_hex_digit(c / 16));
      _retval.Append(int_to_hex_digit(c % 16));
    } else {
      _retval.Append(*curChar);
    }
  }

  return NS_OK;
}
/////////////////////////////////////////////////////////////////////////////
// non-interface methods
// This function will encode all "special" characters in typical url
// encoding, that is %hh where h is a valid hex digit. See the comment in
// the header file for details.

View File

@ -37,14 +37,48 @@
#ifndef nsUrlClassifierUtils_h_
#define nsUrlClassifierUtils_h_
#include "nsAutoPtr.h"
#include "nsIUrlClassifierUtils.h"
class nsUrlClassifierUtils : public nsIUrlClassifierUtils
{
private:
/**
* A fast, bit-vector map for ascii characters.
*
* Internally stores 256 bits in an array of 8 ints.
* Does quick bit-flicking to lookup needed characters.
*/
class Charmap
{
public:
  /**
   * Builds the map from eight 32-bit words. Word bN covers the byte values
   * N*32 .. N*32+31, and bit k of that word (counting from the least
   * significant bit) stands for byte value N*32 + k.
   */
  Charmap(PRUint32 b0, PRUint32 b1, PRUint32 b2, PRUint32 b3,
          PRUint32 b4, PRUint32 b5, PRUint32 b6, PRUint32 b7)
  {
    mMap[0] = b0; mMap[1] = b1; mMap[2] = b2; mMap[3] = b3;
    mMap[4] = b4; mMap[5] = b5; mMap[6] = b6; mMap[7] = b7;
  }

  /**
   * Do a quick lookup to see if the letter is in the map.
   *
   * @param c  byte value to look up
   * @return 1 if the bit for |c| is set, 0 otherwise
   */
  PRBool Contains(unsigned char c) const
  {
    // Build the mask from an unsigned 1: shifting a signed 1 left by 31
    // would land in the sign bit. Comparing against 0 makes the result
    // exactly 0 or 1 instead of the raw masked bits.
    return (mMap[c >> 5] & (PRUint32(1) << (c & 31))) != 0;
  }

private:
  // Store the 256 bits in an array of eight 32-bit words.
  PRUint32 mMap[8];
};
public:
nsUrlClassifierUtils();
~nsUrlClassifierUtils() {}
nsresult Init();
NS_DECL_ISUPPORTS
NS_DECL_NSIURLCLASSIFIERUTILS
@ -62,6 +96,8 @@ private:
// Function to tell if we should encode a character.
PRBool ShouldURLEscape(const unsigned char c) const;
nsAutoPtr<Charmap> mEscapeCharmap;
};
#endif // nsUrlClassifierUtils_h_

View File

@ -235,29 +235,43 @@ for (var key in testing) {
"parseIPAddress broken on " + key + "(got: " + l.parseIPAddress_(key));
}
// Test escapeString (bug 368998)
// Test escapeHostname (bug 368998)
testing = {
"asdf!@#$a": "asdf%21%40%23%24a",
"AB CD 12354": "AB%20CD%2012354",
"\1\2\3\4\112\177": "%01%02%03%04J%7f",
"<>.AS/-+": "%3c%3e.AS%2f-%2b"
"\1\2\3\4\112\177": "%01%02%03%04J%7F",
"<>.AS/-+": "%3C%3E.AS%2F-%2B"
};
var urlUtils = Cc["@mozilla.org/url-classifier/utils;1"]
.getService(Ci.nsIUrlClassifierUtils);
for (var key in testing) {
var out = l.escapeString_(key);
var out = urlUtils.escapeHostname(key);
ok(out === testing[key],
"escapeString broken on " + key + " (got: " + out + ")");
}
// escapeCharmap_ should be true for non-alphanumeric, non-hyphen, and
// non-dot chars
// Test a really long url (~130k). getCanonicalHost takes about 55ms
// on my 2.8ghz machine.
var long_string = "x";
for (var i = 0; i < 17; ++i) {
long_string += long_string;
}
var long_hostname_url = "http://" + long_string + "/foo";
var startTime = Date.now();
var out = l.getCanonicalHost(long_hostname_url);
var endTime = Date.now();
ok(out == long_string, "getCanonicalHost on long string (" +
(endTime - startTime) + "ms)");
// Verify that each character is escaped properly.
for (var i = 0; i < 256; ++i) {
var chr = String.fromCharCode(i);
if ( (chr.toLowerCase() >= 'a' && chr.toLowerCase() <= 'z') ||
(chr >= '0' && chr <= '9') ||
'.' == chr || '-' == chr) {
ok(!l.escapeCharmap_.contains(chr), 'failed on ' + i);
ok(urlUtils.escapeHostname(chr).length == 1, 'failed on ' + i);
} else {
ok(l.escapeCharmap_.contains(chr), 'failed on ' + i);
ok(urlUtils.escapeHostname(chr).length == 3, 'failed on ' + i);
}
}
@ -320,6 +334,14 @@ for (var key in testing) {
"getCanonicalUrl broken on: " + key + "(got: " + l.getCanonicalUrl(key) + ")");
}
// Test for a really long url. This 130k url takes about 80ms
// on my 2.8ghz machine.
startTime = Date.now();
out = l.getCanonicalUrl(long_hostname_url);
endTime = Date.now();
ok(out == long_hostname_url, "getCanonicalUrl on long string (" +
(endTime - startTime) + "ms)");
// Test getlookupkey
var testing = {};
testing["www.google.com"] = "AF5638A09FDDDAFF5B7A6013B1BE69A9";