mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-09 03:15:11 +00:00
bug #64592: r=waterson, sr=vidur. Fix |nsCRT::HashCode| and |nsCRT::BufferHashCode| to use a better hash (in this case, the same scheme as |PL_HashString|).
This commit is contained in:
parent
79b1bcdc60
commit
cd9daaab74
@ -522,47 +522,127 @@ PRUnichar* nsCRT::strndup(const PRUnichar* str, PRUint32 len)
|
||||
return rslt;
|
||||
}
|
||||
|
||||
/**
|
||||
* |nsCRT::HashCode| is identical to |PL_HashString|, which tests
|
||||
* (http://bugzilla.mozilla.org/showattachment.cgi?attach_id=26596)
|
||||
* show to be the best hash among several other choices.
|
||||
*
|
||||
* We re-implement it here rather than calling it for two reasons:
|
||||
* (1) in this interface, we also calculate the length of the
|
||||
* string being hashed; and (2) the narrow and wide and `buffer' versions here
|
||||
* will hash equivalent strings to the same value, e.g., "Hello" and L"Hello".
|
||||
*/
|
||||
PRUint32 nsCRT::HashCode(const char* str, PRUint32* resultingStrLen)
|
||||
{
|
||||
PRUint32 hc = 0, len = 0;
|
||||
if (str) {
|
||||
char ch;
|
||||
while ((ch = *str++) != 0) {
|
||||
// FYI: hc = hc*37 + ch
|
||||
hc = ((hc << 5) + (hc << 2) + hc) + ch;
|
||||
len++;
|
||||
}
|
||||
}
|
||||
if (resultingStrLen)
|
||||
*resultingStrLen = len;
|
||||
return hc;
|
||||
PRUint32 h = 0;
|
||||
const char* s = str;
|
||||
|
||||
unsigned char c;
|
||||
while ( (c = *s++) )
|
||||
h = (h>>28) ^ (h<<4) ^ c;
|
||||
|
||||
if ( resultingStrLen )
|
||||
*resultingStrLen = (s-str)-1;
|
||||
return h;
|
||||
}
|
||||
|
||||
/**
 * Hash a null-terminated UTF-16 (and/or UCS-2) string.
 *
 * The string is converted to UTF-8 on the fly, one character at a time, and
 * each resulting byte is fed to the same rotate-and-xor step the narrow
 * |HashCode| uses.  The intent is that a wide string and its UTF-8 encoding
 * hash to the same value.
 *
 * Ill-formed input is thrown out rather than hashed: a low surrogate with no
 * preceding high surrogate is skipped, and a high surrogate that is not
 * followed by a low surrogate is dropped together with the word that
 * followed it.  A high surrogate at the very end of the string is dropped.
 *
 * @param str              the null-terminated string to hash.  A null pointer
 *                         is accepted and treated like the empty string.
 * @param resultingStrLen  optional out-parameter; when non-null it receives
 *                         the number of |PRUnichar|s (UTF-16 words, not
 *                         characters) that precede the terminator.
 * @return                 the 32-bit hash; 0 for a null or empty string.
 */
PRUint32 nsCRT::HashCode(const PRUnichar* str, PRUint32* resultingStrLen)
{
    // lead-byte marker and lead-byte shift, indexed by the length (1..4) of
    //  the UTF-8 sequence; entry 0 is unused
  static const PRUint16 sBytePrefix[5] = { 0x0000, 0x0000, 0x00C0, 0x00E0, 0x00F0 };
  static const PRUint16 sShift[5]      = { 0, 0, 6, 12, 18 };

  PRUint32 h = 0;
  PRUint32 len = 0;

  if ( str )
    {
      const PRUnichar* s = str;

      PRUint16 W1 = 0;      // the first UTF-16 word in a two word tuple (0 when none is pending)
      PRUint32 U = 0;       // the current char as UCS-4
      int code_length = 0;  // the number of bytes in the UTF-8 sequence for the current char

      PRUint16 W;
      while ( (W = *s++) )
        {
            /*
             * On the fly, decoding from UTF-16 (and/or UCS-2) into UTF-8 as per
             *  http://www.ietf.org/rfc/rfc2781.txt
             *  http://www.ietf.org/rfc/rfc2279.txt
             */

          if ( !W1 )
            {
              if ( W < 0xD800 || 0xDFFF < W )
                {
                    // an ordinary (non-surrogate) character
                  U = W;
                  if ( W <= 0x007F )
                    code_length = 1;
                  else if ( W <= 0x07FF )
                    code_length = 2;
                  else
                    code_length = 3;
                }
              else if ( /* 0xD800 <= W && */ W <= 0xDBFF )
                W1 = W;   // high surrogate: remember it and wait for its partner
                // else, a low surrogate with no partner: thrown out
            }
          else
            {
                // as required by the standard, this code is careful to
                //  throw out illegal sequences

              if ( 0xDC00 <= W && W <= 0xDFFF )
                {
                    // ten payload bits from each word, then the 0x10000 offset
                    //  (RFC 2781, section 2.2); the result is always in
                    //  0x10000..0x10FFFF, i.e., exactly four UTF-8 bytes
                  U = PRUint32( ((W1&0x03FF)<<10) | (W&0x03FF) ) + 0x00010000;
                  code_length = 4;
                }
              W1 = 0;
            }

          if ( code_length > 0 )
            {
                /*
                 * Unlike the algorithm in http://www.ietf.org/rfc/rfc2279.txt
                 *  we must calculate the bytes in left to right order so that
                 *  our hash result matches what the narrow version would calculate
                 *  on an already UTF-8 string.
                 */

                // hash the first (and often, only) byte
              h = (h>>28) ^ (h<<4) ^ (sBytePrefix[code_length] | (U>>sShift[code_length]));

                // an unrolled loop for hashing any remaining bytes in this sequence
              switch ( code_length )
                {   // falling through in each case
                  case 4:   h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>12) & 0x003F));
                  case 3:   h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>6 ) & 0x003F));
                  case 2:   h = (h>>28) ^ (h<<4) ^ (0x80 | ( U      & 0x003F));
                  default:  break;
                }

                // nothing is pending until the next complete character is decoded
              code_length = 0;
            }
        }

        // |s| was advanced one past the terminator, hence the -1
      len = PRUint32(s - str) - 1;
    }

  if ( resultingStrLen )
    *resultingStrLen = len;
  return h;
}
|
||||
|
||||
/**
 * Hash exactly |len| bytes starting at |s|, using the same rotate-and-xor
 * step as the narrow |nsCRT::HashCode|.  Unlike |HashCode|, no terminator is
 * looked for: a zero byte inside the buffer is hashed like any other byte.
 *
 * @param s    the first byte of the buffer; not read when |len| is 0.
 * @param len  the number of bytes to hash.
 * @return     the 32-bit hash; 0 when |len| is 0.
 */
PRUint32 nsCRT::BufferHashCode(const char* s, PRUint32 len)
{
  PRUint32 h = 0;
  const char* done = s + len;

  while ( s < done )
    h = (h>>28) ^ (h<<4) ^ PRUint8(*s++); // cast to unsigned to prevent possible sign extension

  return h;
}
|
||||
|
||||
PRInt32 nsCRT::atoi( const PRUnichar *string )
|
||||
|
Loading…
Reference in New Issue
Block a user