mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-01-26 06:38:36 +00:00
Bug 368998: when normalizing hostnames, we don't properly escape non-alphanumerics
patch: move host encoding to c++ (url classifier utils component) r=bryner
This commit is contained in:
parent
1955e1a9d0
commit
e7ba49093b
@ -91,7 +91,7 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsTypeAheadFind)
|
||||
NS_GENERIC_FACTORY_SINGLETON_CONSTRUCTOR(nsUrlClassifierDBService,
|
||||
nsUrlClassifierDBService::GetInstance)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUrlClassifierStreamUpdater)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUrlClassifierUtils)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR_INIT(nsUrlClassifierUtils, Init)
|
||||
#endif
|
||||
|
||||
#ifdef MOZ_FEEDS
|
||||
|
@ -53,36 +53,6 @@
|
||||
//
|
||||
// TODO: accommodate other kinds of perl-but-not-javascript qualifiers
|
||||
|
||||
/**
|
||||
* A fast, bit-vector map for ascii characters.
|
||||
*
|
||||
* Internally stores 256 bits in an array of 8 ints.
|
||||
* Does quick bit-flicking to lookup needed characters.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @param Takes 8 ints to initialize the character map
|
||||
*/
|
||||
function Charmap() {
|
||||
if (arguments.length != 8) {
|
||||
throw G_Error("charmap ctor requires 8 int args");
|
||||
}
|
||||
this.map_ = [];
|
||||
for (var i = 0; i < 8; ++i) {
|
||||
this.map_.push(arguments[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Do a quick lookup to see if the letter is in the map.
|
||||
* @param chr String of length 1 (ascii)
|
||||
* @return Boolean true if the letter is in the map
|
||||
*/
|
||||
Charmap.prototype.contains = function(chr) {
|
||||
var val = chr.charCodeAt(0);
|
||||
return !!(this.map_[val >> 5] & (1 << (val & 31)));
|
||||
}
|
||||
|
||||
/**
|
||||
* This thing knows how to generate lookup keys and decrypt values found in
|
||||
* a table of type enchash.
|
||||
@ -94,10 +64,6 @@ function PROT_EnchashDecrypter() {
|
||||
this.base64_ = new G_Base64();
|
||||
this.streamCipher_ = Cc["@mozilla.org/security/streamcipher;1"]
|
||||
.createInstance(Ci.nsIStreamCipher);
|
||||
// Everything but alpha numerics, - and .
|
||||
this.escapeCharmap_ = new Charmap(
|
||||
0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001,
|
||||
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
}
|
||||
|
||||
PROT_EnchashDecrypter.DATABASE_SALT = "oU3q.72p";
|
||||
@ -141,38 +107,6 @@ PROT_EnchashDecrypter.prototype.lastNChars_ = function(str, n) {
|
||||
return str.substr(n);
|
||||
}
|
||||
|
||||
/**
|
||||
* We have to have our own hex-decoder because decodeURIComponent
|
||||
* expects UTF-8 (so it will barf on invalid UTF-8 sequences).
|
||||
*
|
||||
* @param str String to decode
|
||||
*
|
||||
* @returns The decoded string
|
||||
*/
|
||||
PROT_EnchashDecrypter.prototype.hexDecode_ = function(str) {
|
||||
var output = [];
|
||||
|
||||
var i = 0;
|
||||
while (i < str.length) {
|
||||
var c = str.charAt(i);
|
||||
|
||||
if (c == "%" && i + 2 < str.length) {
|
||||
|
||||
var asciiVal = Number("0x" + str.charAt(i + 1) + str.charAt(i + 2));
|
||||
|
||||
if (!isNaN(asciiVal)) {
|
||||
i += 2;
|
||||
c = String.fromCharCode(asciiVal);
|
||||
}
|
||||
}
|
||||
|
||||
output[output.length] = c;
|
||||
++i;
|
||||
}
|
||||
|
||||
return output.join("");
|
||||
}
|
||||
|
||||
/**
|
||||
* Translate a plaintext enchash value into regular expressions
|
||||
*
|
||||
@ -237,7 +171,7 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
|
||||
return "";
|
||||
}
|
||||
|
||||
var unescaped = this.hexDecode_(asciiHost);
|
||||
var unescaped = unescape(asciiHost);
|
||||
|
||||
unescaped = unescaped.replace(this.REs_.FIND_DODGY_CHARS_GLOBAL, "")
|
||||
.replace(this.REs_.FIND_END_DOTS_GLOBAL, "")
|
||||
@ -248,7 +182,9 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
|
||||
unescaped = temp;
|
||||
|
||||
// Escape everything that's not alphanumeric, hyphen, or dot.
|
||||
var escaped = this.escapeString_(unescaped);
|
||||
var urlUtils = Cc["@mozilla.org/url-classifier/utils;1"]
|
||||
.getService(Ci.nsIUrlClassifierUtils);
|
||||
var escaped = urlUtils.escapeHostname(unescaped);
|
||||
|
||||
if (opt_maxDots) {
|
||||
// Limit the number of dots
|
||||
@ -272,27 +208,6 @@ PROT_EnchashDecrypter.prototype.getCanonicalHost = function(str, opt_maxDots) {
|
||||
return escaped;
|
||||
}
|
||||
|
||||
/**
|
||||
* URL escapes everything except alphanumerics, - and . (dot). Specifically,
|
||||
* escape everything in the escapeCharmap_ defined in the constructor. This
|
||||
* is a little different than escape, encodeURIComponent, and encodeURI.
|
||||
*/
|
||||
PROT_EnchashDecrypter.prototype.escapeString_ = function(unescaped) {
|
||||
var escaped = '';
|
||||
for (var i = 0; i < unescaped.length; ++i) {
|
||||
if (this.escapeCharmap_.contains(unescaped[i])) {
|
||||
var c = unescaped.charCodeAt(i).toString(16);
|
||||
if (c.length == 1) {
|
||||
c = '0' + c;
|
||||
}
|
||||
escaped += '%' + c;
|
||||
} else {
|
||||
escaped += unescaped[i];
|
||||
}
|
||||
}
|
||||
return escaped;
|
||||
}
|
||||
|
||||
PROT_EnchashDecrypter.prototype.parseIPAddress_ = function(host) {
|
||||
if (host.length <= 15) {
|
||||
|
||||
|
@ -39,7 +39,7 @@
|
||||
* Some utility methods used by the url classifier.
|
||||
*/
|
||||
|
||||
[scriptable, uuid(9afd3add-eadc-409f-a187-e3bf60e47290)]
|
||||
[scriptable, uuid(89ea43b0-a23f-4db2-8d23-6d90dc55f67a)]
|
||||
interface nsIUrlClassifierUtils : nsISupports
|
||||
{
|
||||
/**
|
||||
@ -54,4 +54,12 @@ interface nsIUrlClassifierUtils : nsISupports
|
||||
* then specially url-encoded)
|
||||
*/
|
||||
ACString canonicalizeURL(in ACString url);
|
||||
|
||||
/**
|
||||
* When canonicalizing hostnames, the final step is to url escape everything that
|
||||
* is not alphanumeric or hyphen or dot. The existing methods (escape,
|
||||
* encodeURIComponent, and encodeURI) are close, but not exactly what we want,
|
||||
* so we write our own function to do this.
|
||||
*/
|
||||
ACString escapeHostname(in ACString hostname);
|
||||
};
|
||||
|
@ -44,13 +44,26 @@ static char int_to_hex_digit(PRInt32 i)
|
||||
return NS_STATIC_CAST(char, ((i < 10) ? (i + '0') : ((i - 10) + 'A')));
|
||||
}
|
||||
|
||||
|
||||
nsUrlClassifierUtils::nsUrlClassifierUtils()
|
||||
nsUrlClassifierUtils::nsUrlClassifierUtils() : mEscapeCharmap(nsnull)
|
||||
{
|
||||
}
|
||||
|
||||
nsresult
|
||||
nsUrlClassifierUtils::Init()
|
||||
{
|
||||
// Everything but alpha numerics, - and .
|
||||
mEscapeCharmap = new Charmap(0xffffffff, 0xfc009fff, 0xf8000001, 0xf8000001,
|
||||
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
if (!mEscapeCharmap)
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
NS_IMPL_ISUPPORTS1(nsUrlClassifierUtils, nsIUrlClassifierUtils)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// nsIUrlClassifierUtils
|
||||
|
||||
/* ACString canonicalizeURL (in ACString url); */
|
||||
NS_IMETHODIMP
|
||||
nsUrlClassifierUtils::CanonicalizeURL(const nsACString & url, nsACString & _retval)
|
||||
@ -65,6 +78,30 @@ nsUrlClassifierUtils::CanonicalizeURL(const nsACString & url, nsACString & _retv
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUrlClassifierUtils::EscapeHostname(const nsACString & hostname,
|
||||
nsACString & _retval)
|
||||
{
|
||||
const char* curChar = hostname.BeginReading();
|
||||
const char* end = hostname.EndReading();
|
||||
while (curChar != end) {
|
||||
unsigned char c = NS_STATIC_CAST(unsigned char, *curChar);
|
||||
if (mEscapeCharmap->Contains(c)) {
|
||||
_retval.Append('%');
|
||||
_retval.Append(int_to_hex_digit(c / 16));
|
||||
_retval.Append(int_to_hex_digit(c % 16));
|
||||
} else {
|
||||
_retval.Append(*curChar);
|
||||
}
|
||||
++curChar;
|
||||
}
|
||||
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// non-interface methods
|
||||
|
||||
// This function will encode all "special" characters in typical url
|
||||
// encoding, that is %hh where h is a valid hex digit. See the comment in
|
||||
// the header file for details.
|
||||
|
@ -37,14 +37,48 @@
|
||||
#ifndef nsUrlClassifierUtils_h_
|
||||
#define nsUrlClassifierUtils_h_
|
||||
|
||||
#include "nsAutoPtr.h"
|
||||
#include "nsIUrlClassifierUtils.h"
|
||||
|
||||
class nsUrlClassifierUtils : public nsIUrlClassifierUtils
|
||||
{
|
||||
private:
|
||||
/**
|
||||
* A fast, bit-vector map for ascii characters.
|
||||
*
|
||||
* Internally stores 256 bits in an array of 8 ints.
|
||||
* Does quick bit-flicking to lookup needed characters.
|
||||
*/
|
||||
class Charmap
|
||||
{
|
||||
public:
|
||||
Charmap(PRUint32 b0, PRUint32 b1, PRUint32 b2, PRUint32 b3,
|
||||
PRUint32 b4, PRUint32 b5, PRUint32 b6, PRUint32 b7)
|
||||
{
|
||||
mMap[0] = b0; mMap[1] = b1; mMap[2] = b2; mMap[3] = b3;
|
||||
mMap[4] = b4; mMap[5] = b5; mMap[6] = b6; mMap[7] = b7;
|
||||
}
|
||||
|
||||
/**
|
||||
* Do a quick lookup to see if the letter is in the map.
|
||||
*/
|
||||
PRBool Contains(unsigned char c) const
|
||||
{
|
||||
return mMap[c >> 5] & (1 << (c & 31));
|
||||
}
|
||||
|
||||
private:
|
||||
// Store the 256 bits in an array of eight 32-bit ints.
|
||||
PRUint32 mMap[8];
|
||||
};
|
||||
|
||||
|
||||
public:
|
||||
nsUrlClassifierUtils();
|
||||
~nsUrlClassifierUtils() {}
|
||||
|
||||
nsresult Init();
|
||||
|
||||
NS_DECL_ISUPPORTS
|
||||
NS_DECL_NSIURLCLASSIFIERUTILS
|
||||
|
||||
@ -62,6 +96,8 @@ private:
|
||||
|
||||
// Function to tell if we should encode a character.
|
||||
PRBool ShouldURLEscape(const unsigned char c) const;
|
||||
|
||||
nsAutoPtr<Charmap> mEscapeCharmap;
|
||||
};
|
||||
|
||||
#endif // nsUrlClassifierUtils_h_
|
||||
|
@ -235,29 +235,43 @@ for (var key in testing) {
|
||||
"parseIPAddress broken on " + key + "(got: " + l.parseIPAddress_(key));
|
||||
}
|
||||
|
||||
// Test escapeString (bug 368998)
|
||||
// Test escapeHostname (bug 368998)
|
||||
testing = {
|
||||
"asdf!@#$a": "asdf%21%40%23%24a",
|
||||
"AB CD 12354": "AB%20CD%2012354",
|
||||
"\1\2\3\4\112\177": "%01%02%03%04J%7f",
|
||||
"<>.AS/-+": "%3c%3e.AS%2f-%2b"
|
||||
"\1\2\3\4\112\177": "%01%02%03%04J%7F",
|
||||
"<>.AS/-+": "%3C%3E.AS%2F-%2B"
|
||||
};
|
||||
var urlUtils = Cc["@mozilla.org/url-classifier/utils;1"]
|
||||
.getService(Ci.nsIUrlClassifierUtils);
|
||||
for (var key in testing) {
|
||||
var out = l.escapeString_(key);
|
||||
var out = urlUtils.escapeHostname(key);
|
||||
ok(out === testing[key],
|
||||
"escapeString broken on " + key + " (got: " + out + ")");
|
||||
}
|
||||
|
||||
// escapeCharmap_ should be true for non-alphanumeric, non-hyphen, and
|
||||
// non-dot chars
|
||||
// Test a really long url (~130k). getCanonicalHost takes about 55ms
|
||||
// on my 2.8ghz machine.
|
||||
var long_string = "x";
|
||||
for (var i = 0; i < 17; ++i) {
|
||||
long_string += long_string;
|
||||
}
|
||||
var long_hostname_url = "http://" + long_string + "/foo";
|
||||
var startTime = Date.now();
|
||||
var out = l.getCanonicalHost(long_hostname_url);
|
||||
var endTime = Date.now();
|
||||
ok(out == long_string, "getCanonicalHost on long string (" +
|
||||
(endTime - startTime) + "ms)");
|
||||
|
||||
// Verify that each character is escaped properly.
|
||||
for (var i = 0; i < 256; ++i) {
|
||||
var chr = String.fromCharCode(i);
|
||||
if ( (chr.toLowerCase() >= 'a' && chr.toLowerCase() <= 'z') ||
|
||||
(chr >= '0' && chr <= '9') ||
|
||||
'.' == chr || '-' == chr) {
|
||||
ok(!l.escapeCharmap_.contains(chr), 'failed on ' + i);
|
||||
ok(urlUtils.escapeHostname(chr).length == 1, 'failed on ' + i);
|
||||
} else {
|
||||
ok(l.escapeCharmap_.contains(chr), 'failed on ' + i);
|
||||
ok(urlUtils.escapeHostname(chr).length == 3, 'failed on ' + i);
|
||||
}
|
||||
}
|
||||
|
||||
@ -320,6 +334,14 @@ for (var key in testing) {
|
||||
"getCanonicalUrl broken on: " + key + "(got: " + l.getCanonicalUrl(key) + ")");
|
||||
}
|
||||
|
||||
// Test for a really long url. This 130k url takes about 80ms
|
||||
// on my 2.8ghz machine.
|
||||
startTime = Date.now();
|
||||
out = l.getCanonicalUrl(long_hostname_url);
|
||||
endTime = Date.now();
|
||||
ok(out == long_hostname_url, "getCanonicalUrl on long string (" +
|
||||
(endTime - startTime) + "ms)");
|
||||
|
||||
// Test getlookupkey
|
||||
var testing = {};
|
||||
testing["www.google.com"] = "AF5638A09FDDDAFF5B7A6013B1BE69A9";
|
||||
|
Loading…
x
Reference in New Issue
Block a user