bug #64592: r=waterson, sr=vidur. Fix |nsCRT::HashCode| and |nsCRT::BufferHashCode| to use a better hash (in this case, the same scheme as |PL_HashString|).

2024-10-09 03:15:11 +00:00 · 2001-03-11 02:15:36 +00:00 · 2001-03-11 02:15:36 +00:00 · cd9daaab74
commit cd9daaab74
parent 79b1bcdc60
1 changed files with 111 additions and 31 deletions
--- a/xpcom/ds/nsCRT.cpp
+++ b/xpcom/ds/nsCRT.cpp
@ -522,47 +522,127 @@ PRUnichar* nsCRT::strndup(const PRUnichar* str, PRUint32 len)
  return rslt;
 }

+  /**
+   * |nsCRT::HashCode| is identical to |PL_HashString|, which tests
+   *  (http://bugzilla.mozilla.org/showattachment.cgi?attach_id=26596)
+   *  show to be the best hash among several other choices.  We re-implement
+   *  it here rather than calling it because (1) this interface also reports the
+   *  length of the string being hashed; and (2) the narrow, wide and `buffer'
+   *  versions must hash equivalent strings, e.g., "Hello" and L"Hello", alike.
+   *
+   * A null |str| hashes to 0; |resultingStrLen|, if non-null, receives the length.
+   */
 PRUint32 nsCRT::HashCode(const char* str, PRUint32* resultingStrLen)
 {
-  PRUint32 hc = 0, len = 0;
-  if (str) {
-    char ch;
-    while ((ch = *str++) != 0) {
-      // FYI: hc = hc*37 + ch
-      hc = ((hc << 5) + (hc << 2) + hc) + ch;
-      len++;
-    }
-  }
-  if (resultingStrLen)
-    *resultingStrLen = len;
-  return hc;
+  PRUint32 h = 0;
+  PRUint32 len = 0;   // doubles as the index of the next byte to hash
+
+    // as before this change, a null |str| hashes (and measures) like ""
+  if ( str )
+    for ( unsigned char c; (c = str[len]) != 0; ++len )
+      h = (h>>28) ^ (h<<4) ^ c;   // |c| is unsigned, so no sign extension
+  if ( resultingStrLen )
+    *resultingStrLen = len;
+  return h;
 }

 PRUint32 nsCRT::HashCode(const PRUnichar* str, PRUint32* resultingStrLen)
 {
-  PRUint32 hc = 0, len = 0;
-  if (str) {
-    PRUnichar ch;
-    while ((ch = *str++) != 0) {
-      // FYI: hc = hc*37 + ch
-      hc = ((hc << 5) + (hc << 2) + hc) + ch;
-      len++;
-    }
+    // Hashes the UTF-8 encoding of |str| (without materializing it), so the
+    //  result matches the narrow |HashCode| on the same text in UTF-8.
+    //  |resultingStrLen|, if non-null, receives the length in UTF-16 words.
+  PRUint32 h = 0;
+  const PRUnichar* s = str;
+
+  if ( str ) {           // as before this change, a null |str| hashes like an empty string
+    PRUint16 W1 = 0;      // the first UTF-16 word in a two word tuple
+    PRUint32 U = 0;       // the current char as UCS-4
+    int code_length = 0;  // the number of bytes in the UTF-8 sequence for the current char
+
+    PRUint16 W;
+    while ( (W = *s++) )
+      {
+          /*
+           * On the fly, decoding from UTF-16 (and/or UCS-2) into UTF-8 as per
+           *  http://www.ietf.org/rfc/rfc2781.txt
+           *  http://www.ietf.org/rfc/rfc2279.txt
+           */
+
+        if ( !W1 )
+          {
+            if ( W < 0xD800 || 0xDFFF < W )
+              {
+                U = W;
+                if ( W <= 0x007F )
+                  code_length = 1;
+                else if ( W <= 0x07FF )
+                  code_length = 2;
+                else
+                  code_length = 3;
+              }
+            else if ( /* 0xD800 <= W && */ W <= 0xDBFF )
+              W1 = W;
+              // else: a low surrogate with no high surrogate before it is skipped
+          }
+        else
+          {
+              // as required by the standard, this code is careful to
+              //  throw out illegal sequences: a high surrogate followed by
+              //  anything but a low surrogate is dropped, along with that follower
+
+            if ( 0xDC00 <= W && W <= 0xDFFF )
+              {
+                  // RFC 2781: ten bits from each word, offset by 0x10000; the
+                  //  result is at most 0x10FFFF, i.e., always four bytes of UTF-8
+                U = PRUint32( (W1&0x03FF)<<10 | (W&0x03FF) ) + 0x10000;
+                code_length = 4;
+              }
+            W1 = 0;
+          }
+
+
+        if ( code_length > 0 )
+          {
+            static const PRUint16 sBytePrefix[5]  = { 0x0000, 0x0000, 0x00C0, 0x00E0, 0x00F0 };
+            static const PRUint16 sShift[5]       = { 0, 0, 6, 12, 18 };
+
+              /*
+               * Unlike the algorithm in http://www.ietf.org/rfc/rfc2279.txt
+               *  we must calculate the bytes in left to right order so that
+               *  our hash result matches what the narrow version would calculate
+               *  on an already UTF-8 string.
+               */
+
+              // hash the first (and often only) byte
+            h = (h>>28) ^ (h<<4) ^ (sBytePrefix[code_length] | (U>>sShift[code_length]));
+
+              // an unrolled loop for hashing any remaining bytes in this sequence
+            switch ( code_length )
+              {  // falling through in each case
+                case 4:   h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>12) & 0x003F));
+                case 3:   h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>6 ) & 0x003F));
+                case 2:   h = (h>>28) ^ (h<<4) ^ (0x80 | ( U      & 0x003F));
+                default:  code_length = 0;
+                  break;
+              }
+          }
+      }
  }
-  if (resultingStrLen)
-    *resultingStrLen = len;
-  return hc;
+
+  if ( resultingStrLen )
+    *resultingStrLen = str ? (s-str)-1 : 0;
+  return h;
 }

-PRUint32 nsCRT::BufferHashCode(const char* buf, PRUint32 len)
+PRUint32 nsCRT::BufferHashCode(const char* s, PRUint32 len)
 {
-  PRUint32 hc = 0;
-  for (PRUint32 i = 0; i < len; i++) {
-    char ch = *buf++;
-    // FYI: hc = hc*37 + ch
-    hc = ((hc << 5) + (hc << 2) + hc) + ch;
-  }
-  return hc;
+    // Same per-byte step as |nsCRT::HashCode|, but over exactly |len| bytes
+    //  of |s|, so embedded NULs are hashed rather than ending the input.
+  PRUint32 hash = 0;
+  for ( PRUint32 i = 0; i < len; ++i )
+    hash = (hash>>28) ^ (hash<<4) ^ PRUint8(s[i]); // unsigned, so bytes >= 0x80 are not sign extended
+
+  return hash;
 }

 PRInt32 nsCRT::atoi( const PRUnichar *string )