Implement an in-place (no copy) CompareUTF8toUTF16, and use it to make the atom

hashtable lookups zero-copy. Patch by jst, bug 314465 (with lots of the discussion in bug 277479), r=bsmedberg,dbaron,brendan (on the PLDHashTable keyhash value assumptions), sr=bzbarsky, moa=shaver.
2024-11-29 07:42:04 +00:00 · 2005-11-04 19:52:18 +00:00 · 2005-11-04 19:52:18 +00:00 · aee1056ad7
commit aee1056ad7
parent e7a8486231
6 changed files with 753 additions and 132 deletions
--- a/xpcom/ds/nsAtomTable.cpp
+++ b/xpcom/ds/nsAtomTable.cpp
@ -43,7 +43,6 @@
 #include "nsCRT.h"
 #include "pldhash.h"
 #include "prenv.h"
-#include "nsVoidArray.h"

 #define PL_ARENA_CONST_ALIGN_MASK 3
 #include "plarena.h"
@ -94,65 +93,140 @@ private:
  const nsStaticAtom* mStaticAtom;
 };

-// the atomtableentry can contain either an AtomImpl or a
-// nsStaticAtomWrapper, indicated by the first bit of PtrBits
-typedef unsigned long PtrBits;
+// The |key| pointer in the various PLDHashTable callbacks we use is an
+// AtomTableEntry*.  These pointers can come from two places: either a
+// (probably stack-allocated) string key being passed to PL_DHashTableOperate,
+// or an actual entry in the atom table. PLDHashTable reserves the keyHash
+// values 0 and 1 for internal use, which means that the *PLDHashTable code*
+// will never pass an entry whose keyhash is 0 or 1 to our hooks. That means we
+// can use those values to tell whether an AtomTableEntry is a string key
+// created by a PLDHashTable code caller or an actual live AtomTableEntry used
+// by our PLDHashTable.
+//
+// Evil? Yes, but kinda neat too :-)
+//
+// An AtomTableEntry is a UTF-8 string key if keyHash is 0, in that
+// case mBits points to a UTF-8 encoded char *. If keyHash is 1 the
+// AtomTableEntry is a UTF-16 encoded string key and mBits points to a
+// UTF-16 encoded PRUnichar *.
+//
+// If keyHash is any other value (> 1), the AtomTableEntry is an
+// actual live entry in the table, and then mBits & ~0x1 in the
+// AtomTableEntry points to an AtomImpl or a nsStaticAtomWrapper,
+// indicated by the first bit of PtrBits.
+typedef PRUword PtrBits;

 struct AtomTableEntry : public PLDHashEntryHdr {
-  // mAtom & 0x1 means (mAtom & ~0x1) points to an nsStaticAtomWrapper
-  // else it points to an nsAtomImpl
-  PtrBits mAtom;
+  // If keyHash > 1, mBits & 0x1 means (mBits & ~0x1) points to an
+  // nsStaticAtomWrapper else it points to an nsAtomImpl
+  PtrBits mBits;
+
+  inline AtomTableEntry(const char *aString)
+    : mBits(PtrBits(aString))
+  {
+    keyHash = 0;
+  }
+
+  inline AtomTableEntry(const PRUnichar *aString)
+    : mBits(PtrBits(aString))
+  {
+    keyHash = 1;
+  }

  inline PRBool IsStaticAtom() const {
-    return (mAtom & 0x1) != 0;
+    NS_ASSERTION(keyHash > 1,
+                 "IsStaticAtom() called on non-atom AtomTableEntry!");
+    return (mBits & 0x1) != 0;
  }
-  
+
+  inline PRBool IsUTF8String() const {
+    return keyHash == 0;
+  }
+
+  inline PRBool IsUTF16String() const {
+    return keyHash == 1;
+  }
+
  inline void SetAtomImpl(AtomImpl* aAtom) {
+    NS_ASSERTION(keyHash > 1,
+                 "SetAtomImpl() called on non-atom AtomTableEntry!");
    NS_ASSERTION(aAtom, "Setting null atom");
-    mAtom = PtrBits(aAtom);
+    mBits = PtrBits(aAtom);
  }

  inline void SetStaticAtomWrapper(nsStaticAtomWrapper* aAtom) {
+    NS_ASSERTION(keyHash > 1,
+                 "SetStaticAtomWrapper() called on non-atom AtomTableEntry!");
    NS_ASSERTION(aAtom, "Setting null atom");
    NS_ASSERTION((PtrBits(aAtom) & ~0x1) == PtrBits(aAtom),
                 "Pointers must align or this is broken");
-    
-    mAtom = PtrBits(aAtom) | 0x1;
+
+    mBits = PtrBits(aAtom) | 0x1;
  }
  
  inline void ClearAtom() {
-    mAtom = nsnull;
+    mBits = nsnull;
  }

  inline PRBool HasValue() const {
-    return (mAtom & ~0x1) != 0;
+    NS_ASSERTION(keyHash > 1,
+                 "HasValue() called on non-atom AtomTableEntry!");
+    return (mBits & ~0x1) != 0;
  }

  // these accessors assume that you already know the type
  inline AtomImpl *GetAtomImpl() const {
+    NS_ASSERTION(keyHash > 1,
+                 "GetAtomImpl() called on non-atom AtomTableEntry!");
    NS_ASSERTION(!IsStaticAtom(), "This is a static atom, not an AtomImpl");
-    return (AtomImpl*) (mAtom & ~0x1);
+    return (AtomImpl*) (mBits & ~0x1);
  }
  
  inline nsStaticAtomWrapper *GetStaticAtomWrapper() const {
+    NS_ASSERTION(keyHash > 1,
+                 "GetStaticAtomWrapper() called on non-atom AtomTableEntry!");
    NS_ASSERTION(IsStaticAtom(), "This is an AtomImpl, not a static atom");
-    return (nsStaticAtomWrapper*) (mAtom & ~0x1);
+    return (nsStaticAtomWrapper*) (mBits & ~0x1);
  }

  inline const nsStaticAtom* GetStaticAtom() const {
+    NS_ASSERTION(keyHash > 1,
+                 "GetStaticAtom() called on non-atom AtomTableEntry!");
    return GetStaticAtomWrapper()->GetStaticAtom();
  }

  // type-agnostic accessors

  // get the string buffer
-  inline const char* get() const {
+  inline const char* getAtomString() const {
+    NS_ASSERTION(keyHash > 1,
+                 "getAtomString() called on non-atom AtomTableEntry!");
+
    return IsStaticAtom() ? GetStaticAtom()->mString : GetAtomImpl()->mString;
  }

+  // get the string buffer
+  inline const char* getUTF8String() const {
+    NS_ASSERTION(keyHash == 0,
+                 "getUTF8String() called on non-UTF8 AtomTableEntry!");
+
+    return (char *)mBits;
+  }
+
+  // get the string buffer
+  inline const PRUnichar* getUTF16String() const {
+    NS_ASSERTION(keyHash == 1,
+                 "getUTF16String() called on non-UTF16 AtomTableEntry!");
+
+    return (PRUnichar *)mBits;
+  }
+
  // get an addreffed nsIAtom - not using already_AddRef'ed atom
  // because the callers are not (and should not be) using nsCOMPtr
  inline nsIAtom* GetAtom() const {
+    NS_ASSERTION(keyHash > 1,
+                 "GetAtom() called on non-atom AtomTableEntry!");
+
    nsIAtom* result;
    
    if (IsStaticAtom())
@ -171,17 +245,44 @@ AtomTableGetKey(PLDHashTable *table, PLDHashEntryHdr *entry)
 {
  AtomTableEntry *he = NS_STATIC_CAST(AtomTableEntry*, entry);
  NS_ASSERTION(he->HasValue(), "Empty atom. how did that happen?");
-  return he->get();
+  return he;
+}
+
+PR_STATIC_CALLBACK(PLDHashNumber)
+AtomTableGetHash(PLDHashTable *table, const void *key)
+{
+  // Hash hook for the atom table.  |key| is interpreted as an
+  // AtomTableEntry acting as a string key: a UTF-8 key is hashed with
+  // nsCRT::HashCode(), a UTF-16 key with nsCRT::HashCodeAsUTF8().
+  const AtomTableEntry *keyEntry =
+    NS_STATIC_CAST(const AtomTableEntry*, key);
+
+  if (!keyEntry->IsUTF16String()) {
+    // Anything that is not a UTF-16 key is expected to be a UTF-8 key.
+    NS_ASSERTION(keyEntry->IsUTF8String(),
+                 "AtomTableGetHash() called on non-string-key AtomTableEntry!");
+
+    return nsCRT::HashCode(keyEntry->getUTF8String());
+  }
+
+  return nsCRT::HashCodeAsUTF8(keyEntry->getUTF16String());
 }

 PR_STATIC_CALLBACK(PRBool)
-AtomTableMatchKey(PLDHashTable *table,
-                  const PLDHashEntryHdr *entry,
+AtomTableMatchKey(PLDHashTable *table, const PLDHashEntryHdr *entry,
                  const void *key)
 {
  const AtomTableEntry *he = NS_STATIC_CAST(const AtomTableEntry*, entry);
-  const char* keyStr = NS_STATIC_CAST(const char*, key);
-  return nsCRT::strcmp(keyStr, he->get()) == 0;
+  const AtomTableEntry *strKey = NS_STATIC_CAST(const AtomTableEntry*, key);
+
+  const char *atomString = he->getAtomString();
+
+  if (strKey->IsUTF16String()) {
+    return
+      CompareUTF8toUTF16(nsDependentCString(atomString),
+                         nsDependentString(strKey->getUTF16String())) == 0;
+  }
+
+  if (strKey->IsUTF8String()) {
+    return strcmp(atomString, strKey->getUTF8String()) == 0;
+  }
+
+  return strcmp(atomString, strKey->getAtomString()) == 0;
 }

 PR_STATIC_CALLBACK(void)
@ -189,8 +290,6 @@ AtomTableClearEntry(PLDHashTable *table, PLDHashEntryHdr *entry)
 {
  AtomTableEntry *he = NS_STATIC_CAST(AtomTableEntry*, entry);
  
-  he->keyHash = 0;
-
  if (!he->IsStaticAtom()) {
    AtomImpl *atom = he->GetAtomImpl();
    // Normal |AtomImpl| atoms are deleted when their refcount hits 0, and
@ -199,8 +298,11 @@ AtomTableClearEntry(PLDHashTable *table, PLDHashEntryHdr *entry)
    // |PermanentAtomImpl| permanent atoms ignore their refcount and are
    // deleted when they are removed from the table at table destruction.
    // In other words, they are owned by the atom table.
-    if (atom->IsPermanent())
+    if (atom->IsPermanent()) {
+      he->keyHash = 0;
+
      delete NS_STATIC_CAST(PermanentAtomImpl*, atom);
+    }
  }
  else {
    he->GetStaticAtomWrapper()->~nsStaticAtomWrapper();
@ -213,7 +315,7 @@ static const PLDHashTableOps AtomTableOps = {
  PL_DHashAllocTable,
  PL_DHashFreeTable,
  AtomTableGetKey,
-  PL_DHashStringKey,
+  AtomTableGetHash,
  AtomTableMatchKey,
  PL_DHashMoveEntryStub,
  AtomTableClearEntry,
@ -260,7 +362,8 @@ void PromoteToPermanent(AtomImpl* aAtom)
  aAtom = new (aAtom) PermanentAtomImpl();
 }

-void NS_PurgeAtomTable()
+void
+NS_PurgeAtomTable()
 {
  if (gAtomTable.ops) {
 #ifdef DEBUG
@ -295,7 +398,8 @@ AtomImpl::~AtomImpl()
  // don't want to remove them twice.  See comment above in
  // |AtomTableClearEntry|.
  if (!IsPermanent()) {
-    PL_DHashTableOperate(&gAtomTable, mString, PL_DHASH_REMOVE);
+    AtomTableEntry key(mString);
+    PL_DHashTableOperate(&gAtomTable, &key, PL_DHASH_REMOVE);
    if (gAtomTable.entryCount == 0) {
      PL_DHashTableFinish(&gAtomTable);
      NS_ASSERTION(gAtomTable.entryCount == 0,
@ -388,7 +492,8 @@ AtomImpl::EqualsUTF8(const nsACString& aString, PRBool* aResult)
 NS_IMETHODIMP
 AtomImpl::Equals(const nsAString& aString, PRBool* aResult)
 {
-  *aResult = NS_ConvertUTF16toUTF8(aString).Equals(mString);
+  *aResult = CompareUTF8toUTF16(nsDependentCString(mString),
+                                PromiseFlatString(aString)) == 0;
  return NS_OK;
 }

@ -445,21 +550,12 @@ nsStaticAtomWrapper::EqualsUTF8(const nsACString& aString, PRBool* aResult)
 NS_IMETHODIMP
 nsStaticAtomWrapper::Equals(const nsAString& aString, PRBool* aResult)
 {
-  *aResult = NS_ConvertUCS2toUTF8(aString).Equals(mStaticAtom->mString);
+  *aResult = CompareUTF8toUTF16(nsDependentCString(mStaticAtom->mString),
+                                PromiseFlatString(aString)) == 0;
  return NS_OK;
 }
 //----------------------------------------------------------------------

-NS_COM nsIAtom* NS_NewAtom(const char* isolatin1)
-{
-  return NS_NewAtom(nsDependentCString(isolatin1));
-}
-
-NS_COM nsIAtom* NS_NewPermanentAtom(const char* isolatin1)
-{
-  return NS_NewPermanentAtom(NS_ConvertASCIItoUCS2(isolatin1));
-}
-
 static nsStaticAtomWrapper*
 WrapStaticAtom(const nsStaticAtom* aAtom)
 {
@ -480,7 +576,8 @@ WrapStaticAtom(const nsStaticAtom* aAtom)
  return wrapper;
 }

-static AtomTableEntry* GetAtomHashEntry(const char* aString)
+static inline AtomTableEntry*
+GetAtomHashEntry(const char* aString)
 {
  if (!gAtomTable.ops &&
      !PL_DHashTableInit(&gAtomTable, &AtomTableOps, 0,
@ -488,10 +585,25 @@ static AtomTableEntry* GetAtomHashEntry(const char* aString)
    gAtomTable.ops = nsnull;
    return nsnull;
  }
+
+  AtomTableEntry key(aString);
  return NS_STATIC_CAST(AtomTableEntry*,
-                        PL_DHashTableOperate(&gAtomTable,
-                                             aString,
-                                             PL_DHASH_ADD));
+                        PL_DHashTableOperate(&gAtomTable, &key, PL_DHASH_ADD));
+}
+
+static inline AtomTableEntry*
+GetAtomHashEntry(const PRUnichar* aString)
+{
+  // Look up (adding if necessary) the table entry for a UTF-16 string.
+  // The global atom table is initialized on first use; if that fails,
+  // its ops pointer is reset and null is returned.
+  if (!gAtomTable.ops) {
+    PRBool initialized = PL_DHashTableInit(&gAtomTable, &AtomTableOps, 0,
+                                           sizeof(AtomTableEntry), 2048);
+    if (!initialized) {
+      gAtomTable.ops = nsnull;
+      return nsnull;
+    }
+  }
+
+  // Wrap the caller's buffer in a stack-allocated UTF-16 string key; no
+  // conversion or copy of the string is made for the lookup.
+  AtomTableEntry utf16Key(aString);
+  PLDHashEntryHdr *hdr =
+    PL_DHashTableOperate(&gAtomTable, &utf16Key, PL_DHASH_ADD);
+  return NS_STATIC_CAST(AtomTableEntry*, hdr);
 }

 NS_COM nsresult
@ -499,7 +611,7 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount)
 {
  // this does two things:
  // 1) wraps each static atom in a wrapper, if necessary
-  // 2) initializes the address pointed to by each mAtom slot
+  // 2) initializes the address pointed to by each mBits slot
  
  for (PRUint32 i=0; i<aAtomCount; i++) {
    NS_ASSERTION(nsCRT::IsAscii(aAtoms[i].mString),
@ -509,7 +621,7 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount)
    
    if (he->HasValue() && aAtoms[i].mAtom) {
      // there already is an atom with this name in the table.. but we
-      // still have to update mAtom
+      // still have to update mBits
      if (!he->IsStaticAtom() && !he->GetAtomImpl()->IsPermanent()) {
        // since we wanted to create a static atom but there is
        // already one there, we convert it to a non-refcounting
@ -536,23 +648,25 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount)
  return NS_OK;
 }

-NS_COM nsIAtom* NS_NewAtom( const nsAString& aString )
+NS_COM nsIAtom*
+NS_NewAtom(const char* aUTF8String)
 {
-  NS_ConvertUCS2toUTF8 utf8String(aString);
+  AtomTableEntry *he = GetAtomHashEntry(aUTF8String);

-  return NS_NewAtom(utf8String);
-}
+  if (!he) {
+    return nsnull;
+  }

-NS_COM
-nsIAtom*
-NS_NewAtom( const nsACString& aString )
-{
-  AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aString).get());
+  NS_ASSERTION(!he->IsUTF8String() && !he->IsUTF16String(),
+               "Atom hash entry is string?  Should be atom!");

  if (he->HasValue())
    return he->GetAtom();

-  AtomImpl* atom = new (aString) AtomImpl();
+  // MSVC.NET doesn't like passing a temporary nsDependentCString() to
+  // operator new, so declare one as a local instead.
+  nsDependentCString str(aUTF8String);
+  AtomImpl* atom = new (str) AtomImpl();
  he->SetAtomImpl(atom);
  if (!atom) {
    PL_DHashTableRawRemove(&gAtomTable, he);
@ -563,15 +677,50 @@ NS_NewAtom( const nsACString& aString )
  return atom;
 }

-NS_COM nsIAtom* NS_NewPermanentAtom( const nsAString& aString )
+NS_COM nsIAtom*
+NS_NewAtom(const nsACString& aUTF8String)
 {
-  return NS_NewPermanentAtom(NS_ConvertUCS2toUTF8(aString));
+  return NS_NewAtom(PromiseFlatCString(aUTF8String).get());
 }

-NS_COM
-nsIAtom* NS_NewPermanentAtom( const nsACString& aString )
+// Return an addreffed atom for the null-terminated UTF-16 string
+// |aUTF16String|, creating the atom if the table has no value for it yet.
+// Returns null if no table entry could be obtained or the atom could not
+// be allocated.
+NS_COM nsIAtom*
+NS_NewAtom(const PRUnichar* aUTF16String)
 {
-  AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aString).get());
+  AtomTableEntry *he = GetAtomHashEntry(aUTF16String);
+
+  // GetAtomHashEntry() returns null when the atom table cannot be
+  // initialized; bail out instead of dereferencing it, exactly as the
+  // |const char*| overload of NS_NewAtom() does.
+  if (!he) {
+    return nsnull;
+  }
+
+  NS_ASSERTION(!he->IsUTF8String() && !he->IsUTF16String(),
+               "Atom hash entry is string?  Should be atom!");
+
+  if (he->HasValue())
+    return he->GetAtom();
+
+  // MSVC.NET doesn't like passing a temporary NS_ConvertUTF16toUTF8() to
+  // operator new, so declare one as a local instead.
+  NS_ConvertUTF16toUTF8 str(aUTF16String);
+  AtomImpl* atom = new (str) AtomImpl();
+  he->SetAtomImpl(atom);
+  if (!atom) {
+    // Allocation failed: drop the entry that was just added for this key.
+    PL_DHashTableRawRemove(&gAtomTable, he);
+    return nsnull;
+  }
+
+  NS_ADDREF(atom);
+  return atom;
+}
+
+NS_COM nsIAtom*
+NS_NewAtom(const nsAString& aUTF16String)
+{ // Flatten the abstract string and forward its buffer to another NS_NewAtom() overload.
+  return NS_NewAtom(PromiseFlatString(aUTF16String).get());
+}
+
+NS_COM nsIAtom*
+NS_NewPermanentAtom(const char* aUTF8String)
+{ // Wrap the C string in an nsDependentCString and forward to the nsACString overload.
+  return NS_NewPermanentAtom(nsDependentCString(aUTF8String));
+}
+
+NS_COM nsIAtom*
+NS_NewPermanentAtom(const nsACString& aUTF8String)
+{
+  AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aUTF8String).get());

  if (he->HasValue() && he->IsStaticAtom())
    return he->GetStaticAtomWrapper();
@ -587,7 +736,7 @@ nsIAtom* NS_NewPermanentAtom( const nsACString& aString )
    }
  } else {
    // otherwise, make a new atom
-    atom = new (aString) PermanentAtomImpl();
+    atom = new (aUTF8String) PermanentAtomImpl();
    he->SetAtomImpl(atom);
    if ( !atom ) {
      PL_DHashTableRawRemove(&gAtomTable, he);
@ -599,17 +748,20 @@ nsIAtom* NS_NewPermanentAtom( const nsACString& aString )
  return atom;
 }

-NS_COM nsIAtom* NS_NewAtom( const PRUnichar* str )
+NS_COM nsIAtom*
+NS_NewPermanentAtom(const nsAString& aUTF16String)
 {
-  return NS_NewAtom(NS_ConvertUCS2toUTF8(str));
+  return NS_NewPermanentAtom(NS_ConvertUTF16toUTF8(aUTF16String));
 }

-NS_COM nsIAtom* NS_NewPermanentAtom( const PRUnichar* str )
+NS_COM nsIAtom*
+NS_NewPermanentAtom(const PRUnichar* aUTF16String)
 {
-  return NS_NewPermanentAtom(nsDependentString(str));
+  return NS_NewPermanentAtom(NS_ConvertUTF16toUTF8(aUTF16String));
 }

-NS_COM nsrefcnt NS_GetNumberOfAtoms(void)
+NS_COM nsrefcnt
+NS_GetNumberOfAtoms(void)
 {
  return gAtomTable.entryCount;
 }
--- a/xpcom/ds/nsCRT.cpp
+++ b/xpcom/ds/nsCRT.cpp
@ -300,6 +300,100 @@ PRUint32 nsCRT::HashCode(const PRUnichar* str, PRUint32* resultingStrLen)
  return h;
 }

+// Hash the null-terminated UTF-16 string |str| as if it had first been
+// converted to UTF-8 and every resulting byte folded into the hash in order,
+// but without materializing the UTF-8 copy.  If |resultingStrLen| is non-null
+// it receives the number of PRUnichar units that precede the terminator.
+//
+// Ill-formed input is not hashed: a low surrogate with no preceding high
+// surrogate is skipped, and a high surrogate that is not followed by a low
+// surrogate is dropped together with the unit that follows it (both cases
+// raise NS_ERROR in DEBUG builds).
+PRUint32 nsCRT::HashCodeAsUTF8(const PRUnichar* str, PRUint32* resultingStrLen)
+{
+  PRUint32 h = 0;
+  const PRUnichar* s = str;
+
+  {
+    PRUint16 W1 = 0;      // the first UTF-16 word in a two word tuple
+    PRUint32 U = 0;       // the current char as UCS-4
+    int code_length = 0;  // the number of bytes in the UTF-8 sequence for the current char
+
+    PRUint16 W;
+    while ( (W = *s++) )
+      {
+          /*
+           * On the fly, decoding from UTF-16 (and/or UCS-2) into UTF-8 as per
+           *  http://www.ietf.org/rfc/rfc2781.txt
+           *  http://www.ietf.org/rfc/rfc2279.txt
+           */
+
+        if ( !W1 )
+          {
+            if ( W < 0xD800 || 0xDFFF < W )
+              {
+                U = W;
+                if ( W <= 0x007F )
+                  code_length = 1;
+                else if ( W <= 0x07FF )
+                  code_length = 2;
+                else
+                  code_length = 3;
+              }
+            else if ( /* 0xD800 <= W1 && */ W <= 0xDBFF )
+              W1 = W;
+#ifdef DEBUG
+            else NS_ERROR("Got low surrogate but no previous high surrogate");
+#endif
+          }
+        else
+          {
+              // as required by the standard, this code is careful to
+              //  throw out illegal sequences
+
+            if ( 0xDC00 <= W && W <= 0xDFFF )
+              {
+                  // Each surrogate carries 10 payload bits; combine them and
+                  //  add the 0x10000 offset to recover the code point.  The
+                  //  result is always in U+10000..U+10FFFF, which takes
+                  //  exactly four bytes in UTF-8.
+                U = 0x00010000 + PRUint32( (W1&0x03FF)<<10 | (W&0x03FF) );
+                code_length = 4;
+              }
+#ifdef DEBUG
+            else NS_ERROR("High surrogate not followed by low surrogate");
+#endif
+            W1 = 0;
+          }
+
+
+        if ( code_length > 0 )
+          {
+              // Indexed by code_length (1..4); entry 0 is unused.
+            static const PRUint16 sBytePrefix[5]  = { 0x0000, 0x0000, 0x00C0, 0x00E0, 0x00F0 };
+            static const PRUint16 sShift[5]       = { 0, 0, 6, 12, 18 };
+
+              /*
+               * Unlike the algorithm in http://www.ietf.org/rfc/rfc2279.txt
+               *  we must calculate the bytes in left to right order so that
+               *  our hash result matches what the narrow version would calculate
+               *  on an already UTF-8 string.
+               */
+
+              // hash the first (and often, only, byte)
+            h = (h>>28) ^ (h<<4) ^ (sBytePrefix[code_length] | (U>>sShift[code_length]));
+
+              // an unrolled loop for hashing any remaining bytes in this sequence
+            switch ( code_length )
+              {  // falling through in each case
+                case 4:   h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>12) & 0x003F));
+                case 3:   h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>6 ) & 0x003F));
+                case 2:   h = (h>>28) ^ (h<<4) ^ (0x80 | ( U      & 0x003F));
+                default:  code_length = 0;  // reset so a skipped unit hashes nothing
+                  break;
+              }
+          }
+      }
+  }
+
+    // |s| has been advanced one past the terminator, hence the -1.
+  if ( resultingStrLen )
+    *resultingStrLen = (s-str)-1;
+  return h;
+}
+
 PRUint32 nsCRT::BufferHashCode(const PRUnichar* s, PRUint32 len)
 {
  PRUint32 h = 0;
--- a/xpcom/ds/nsCRT.h
+++ b/xpcom/ds/nsCRT.h
@ -228,6 +228,12 @@ public:
  static PRUint32 HashCode(const PRUnichar* str,
                           PRUint32* resultingStrLen = nsnull);

+  // Computes a hashcode for a ucs2 string that returns the same thing
+  // as the HashCode method taking a |char*| would if the string were
+  // converted to UTF8.  Returns the string length as an added bonus.
+  static PRUint32 HashCodeAsUTF8(const PRUnichar* str,
+                                 PRUint32* resultingStrLen = nsnull);
+
  // Computes the hashcode for a buffer with a specified length.
  static PRUint32 BufferHashCode(const PRUnichar* str, PRUint32 strLen);

--- a/xpcom/string/public/nsReadableUtils.h
+++ b/xpcom/string/public/nsReadableUtils.h
@ -364,4 +364,16 @@ NS_COM const nsAFlatString& EmptyString();
 NS_COM const nsAFlatCString& EmptyCString();


+   /**
+   * Compare a UTF-8 string to a UTF-16 string.
+   *
+   * Returns 0 if the strings are equal, -1 if aUTF8String is less
+   * than aUTF16String, and 1 in the reverse case.  In case of fatal
+   * error (e.g. the strings are not valid UTF8 and UTF16 respectively),
+   * this method will return PR_INT32_MIN.
+   */
+NS_COM PRInt32
+CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String,
+                   const nsASingleFragmentString& aUTF16String);
+
 #endif // !defined(nsReadableUtils_h___)
--- a/xpcom/string/public/nsUTF8Utils.h
+++ b/xpcom/string/public/nsUTF8Utils.h
@ -35,7 +35,6 @@
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
-
 #ifndef nsUTF8Utils_h_
 #define nsUTF8Utils_h_

@ -60,6 +59,349 @@ class UTF8traits
 #define NS_ALWAYS_INLINE
 #endif

+/**
+ * Extract the next UCS-4 character from the buffer and return it.  The
+ * pointer passed in is advanced to the start of the next character in the
+ * buffer.  If non-null, the parameters err and overlong are filled in to
+ * indicate that the character was represented by an overlong sequence, or
+ * that an error occurred.
+ */
+
+class UTF8CharEnumerator
+{
+public:
+  static PRUint32 NextChar(const char **buffer, const char *end,
+                           PRBool *err = nsnull, PRBool* overlong = nsnull)
+  {
+    NS_ASSERTION(buffer && *buffer, "null buffer!");
+    // Decode through a local cursor; *buffer is only written back on success.
+    const char *p = *buffer;
+    // No input left: flag an error (if requested) and return 0.
+    if (p >= end)
+      {
+        if (err)
+          *err = PR_TRUE;
+
+        return 0;
+      }
+
+    char c = *p++;
+    // A byte that UTF8traits classifies as ASCII is returned as-is.
+    if ( UTF8traits::isASCII(c) )
+      {
+        if (err)
+          *err = PR_FALSE;
+        if (overlong)
+          *overlong = PR_FALSE;
+        *buffer = p;
+        return c;
+      }
+
+    PRUint32 ucs4;
+    PRUint32 minUcs4;
+    PRInt32 state = 0;
+    // Classify the lead byte; failure means |c| cannot start a 2-6 byte sequence.
+    if (!CalcState(c, ucs4, minUcs4, state)) {
+        NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
+        if (err)
+          *err = PR_TRUE;
+        return 0;
+    }
+    // |state| counts the continuation bytes still to be read.
+    while ( state-- )
+      {
+        if (p == end) // input ends in the middle of a multibyte sequence
+          {
+            if (err)
+              *err = PR_TRUE;
+
+            return 0;
+          }
+
+        c = *p++;
+
+        if (!AddByte(c, state, ucs4))
+          {
+            NS_ERROR("not a UTF8 string");
+            if (err)
+              *err = PR_TRUE;
+            return 0;
+          }
+      }
+    // Success: report status, then publish the advanced cursor.
+    if (err)
+      *err = PR_FALSE;
+    if (overlong)
+      *overlong = ucs4 < minUcs4; // value is below the minimum for this sequence length
+    *buffer = p;
+    return ucs4;
+  }
+  // Iterator overload: same decoding, but |iter| is advanced as bytes are read, even on error.
+  static PRUint32 NextChar(nsACString::const_iterator& iter,
+                           const nsACString::const_iterator& end,
+                           PRBool *err = nsnull, PRBool *overlong = nsnull)
+  {
+    if ( iter == end )
+      {
+        NS_ERROR("No input to work with");
+        if (err)
+          *err = PR_TRUE;
+
+        return 0;
+      }
+
+    char c = *iter++;
+    // A byte that UTF8traits classifies as ASCII is returned as-is.
+    if ( UTF8traits::isASCII(c) )
+      {
+        if (err)
+          *err = PR_FALSE;
+        if (overlong)
+          *overlong = PR_FALSE;
+        return c;
+      }
+
+    PRUint32 ucs4;
+    PRUint32 minUcs4;
+    PRInt32 state = 0;
+    // Classify the lead byte; failure means |c| cannot start a 2-6 byte sequence.
+    if (!CalcState(c, ucs4, minUcs4, state)) {
+        NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
+        if (err)
+          *err = PR_TRUE;
+        return 0;
+    }
+    // |state| counts the continuation bytes still to be read.
+    while ( state-- )
+      {
+        if (iter == end)
+          {
+            NS_ERROR("Buffer ended in the middle of a multibyte sequence");
+            if (err)
+              *err = PR_TRUE;
+
+            return 0;
+          }
+
+        c = *iter++;
+
+        if (!AddByte(c, state, ucs4))
+          {
+            NS_ERROR("not a UTF8 string");
+            if (err)
+              *err = PR_TRUE;
+            return 0;
+          }
+      }
+
+    if (err)
+      *err = PR_FALSE;
+    if (overlong)
+      *overlong = ucs4 < minUcs4; // value is below the minimum for this sequence length
+    return ucs4;
+  }
+
+private: // helpers shared by both NextChar() overloads
+  static PRBool CalcState(char c, PRUint32& ucs4, PRUint32& minUcs4,
+                          PRInt32& state)
+  { // From lead byte |c|: seed |ucs4|, set continuation count |state| and overlong floor |minUcs4|.
+    if ( UTF8traits::is2byte(c) )
+      {
+        ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
+        state = 1;
+        minUcs4 = 0x00000080;
+      }
+    else if ( UTF8traits::is3byte(c) )
+      {
+        ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
+        state = 2;
+        minUcs4 = 0x00000800;
+      }
+    else if ( UTF8traits::is4byte(c) )
+      {
+        ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
+        state = 3;
+        minUcs4 = 0x00010000;
+      }
+    else if ( UTF8traits::is5byte(c) )
+      {
+        ucs4 = (PRUint32(c) << 24) & 0x03000000L;
+        state = 4;
+        minUcs4 = 0x00200000;
+      }
+    else if ( UTF8traits::is6byte(c) )
+      {
+        ucs4 = (PRUint32(c) << 30) & 0x40000000L;
+        state = 5;
+        minUcs4 = 0x04000000;
+      }
+    else
+      {
+        return PR_FALSE; // not a recognized multibyte lead byte
+      }
+
+    return PR_TRUE;
+  }
+  // OR the low six bits of continuation byte |c| into |ucs4| at bit offset state*6.
+  static PRBool AddByte(char c, PRInt32 state, PRUint32& ucs4)
+  {
+    if ( UTF8traits::isInSeq(c) )
+      {
+        PRInt32 shift = state * 6; // |state| is the number of continuation bytes still to follow
+        ucs4 |= (PRUint32(c) & 0x3F) << shift;
+        return PR_TRUE;
+      }
+
+    return PR_FALSE;
+  }
+};
+
+
+/**
+ * Extract the next UCS-4 character from the buffer and return it.  The
+ * pointer passed in is advanced to the start of the next character in the
+ * buffer.  If non-null, the err parameter is filled in if an error occurs.
+ */
+
+
+class UTF16CharEnumerator
+{
+public:
+  // Decode one character from the UTF-16 range [*buffer, end).
+  //
+  // On success the UCS-4 value is returned, *err (if non-null) is set to
+  // PR_FALSE and *buffer is advanced past the one or two units consumed.
+  // On failure (no input, an unpaired surrogate, or the range ending right
+  // after a high surrogate) 0 is returned, *err is set to PR_TRUE and
+  // *buffer is left untouched.
+  static PRUint32 NextChar(const PRUnichar **buffer, const PRUnichar *end,
+                           PRBool *err = nsnull)
+  {
+    NS_ASSERTION(buffer && *buffer, "null buffer!");
+
+    const PRUnichar *p = *buffer;
+
+    if (p >= end)
+      {
+        NS_ERROR("No input to work with");
+        if (err)
+          *err = PR_TRUE;
+
+        return 0;
+      }
+
+    PRUnichar c = *p++;
+
+    if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
+      {
+        if (err)
+          *err = PR_FALSE;
+        *buffer = p;
+        return c;
+      }
+    else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+      {
+        // |p| has already been advanced past the high surrogate, so it is
+        // the cursor to test here.  (*buffer still points at the high
+        // surrogate and was checked against |end| above; testing it would
+        // never detect a truncated pair and would let the read below run
+        // past |end|.)
+        if (p == end)
+          {
+            NS_ERROR("Unexpected end of buffer after high surrogate");
+            if (err)
+              *err = PR_TRUE;
+
+            return 0;
+          }
+
+        // D800- DBFF - High Surrogate
+        // N = (H- D800) *400 + 10000 + ...
+        PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
+
+        c = *p++;
+
+        if (0xDC00 == (0xFC00 & c))
+          {
+            // DC00- DFFF - Low Surrogate
+            // N += ( L - DC00 )
+            ucs4 |= (0x03FF & c);
+            if (err)
+              *err = PR_FALSE;
+            *buffer = p;
+            return ucs4;
+          }
+        else
+          {
+            NS_ERROR("got a High Surrogate but no low surrogate");
+            // output nothing.
+          }
+      }
+    else // U+DC00 - U+DFFF
+      {
+        // DC00- DFFF - Low Surrogate
+        NS_ERROR("got a low Surrogate but no high surrogate");
+        // output nothing.
+      }
+
+    if (err)
+      *err = PR_TRUE;
+    return 0;
+  }
+
+  // Iterator overload of NextChar().  Decoding and error reporting are the
+  // same as above, but |iter| is advanced as units are read, including on
+  // the error paths.
+  static PRUint32 NextChar(nsAString::const_iterator& iter,
+                           const nsAString::const_iterator& end,
+                           PRBool *err = nsnull)
+  {
+    if (iter == end)
+      {
+        if (err)
+          *err = PR_TRUE;
+
+        return 0;
+      }
+
+    PRUnichar c = *iter++;
+
+    if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
+      {
+        if (err)
+          *err = PR_FALSE;
+        return c;
+      }
+    else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+      {
+        if (iter == end)
+          {
+            if (err)
+              *err = PR_TRUE;
+
+            return 0;
+          }
+
+        // D800- DBFF - High Surrogate
+        // N = (H- D800) *400 + 10000 + ...
+        PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
+
+        c = *iter++;
+
+        if (0xDC00 == (0xFC00 & c))
+          {
+            // DC00- DFFF - Low Surrogate
+            // N += ( L - DC00 )
+            ucs4 |= (0x03FF & c);
+            if (err)
+              *err = PR_FALSE;
+            return ucs4;
+          }
+        else
+          {
+            NS_ERROR("got a High Surrogate but no low surrogate");
+            // output nothing.
+          }
+      }
+    else // U+DC00 - U+DFFF
+      {
+        // DC00- DFFF - Low Surrogate
+        NS_ERROR("got a low Surrogate but no high surrogate");
+        // output nothing.
+      }
+
+    if (err)
+      *err = PR_TRUE;
+    return 0;
+  }
+};
+
+
 /**
 * A character sink (see |copy_string| in nsAlgorithm.h) for converting
 * UTF-8 to UTF-16
@ -87,75 +429,18 @@ class ConvertUTF8toUTF16
        buffer_type* out = mBuffer;
        for ( ; p != end /* && *p */; )
          {
-            char c = *p++;
+            PRBool overlong, err;
+            PRUint32 ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err,
+                                                         &overlong);

-            if ( UTF8traits::isASCII(c) )
+            if ( err )
              {
-                *out++ = buffer_type(c);
-                continue;
-              }
-
-            PRUint32 ucs4;
-            PRUint32 minUcs4;
-            PRInt32 state = 0;
-
-            if ( UTF8traits::is2byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
-                state = 1;
-                minUcs4 = 0x00000080;
-              }
-            else if ( UTF8traits::is3byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
-                state = 2;
-                minUcs4 = 0x00000800;
-              }
-            else if ( UTF8traits::is4byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
-                state = 3;
-                minUcs4 = 0x00010000;
-              }
-            else if ( UTF8traits::is5byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 24) & 0x03000000L;
-                state = 4;
-                minUcs4 = 0x00200000;
-              }
-            else if ( UTF8traits::is6byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 30) & 0x40000000L;
-                state = 5;
-                minUcs4 = 0x04000000;
-              }
-            else
-              {
-                NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
                mErrorEncountered = PR_TRUE;
                mBuffer = out;
                return N;
              }

-            while ( state-- )
-              {
-                c = *p++;
-
-                if ( UTF8traits::isInSeq(c) )
-                  {
-                    PRInt32 shift = state * 6;
-                    ucs4 |= (PRUint32(c) & 0x3F) << shift;
-                  }
-                else
-                  {
-                    NS_ERROR("not a UTF8 string");
-                    mErrorEncountered = PR_TRUE;
-                    mBuffer = out;
-                    return N;
-                  }
-              }
-
-            if ( ucs4 < minUcs4 )
+            if ( overlong )
              {
                // Overlong sequence
                *out++ = UCS2_REPLACEMENT_CHAR;
--- a/xpcom/string/src/nsReadableUtils.cpp
+++ b/xpcom/string/src/nsReadableUtils.cpp
@ -1081,16 +1081,88 @@ StringEndsWith( const nsACString& aSource, const nsACString& aSubstring,

 static const PRUnichar empty_buffer[1] = { '\0' };

-NS_COM const nsAFlatString& EmptyString()
+NS_COM
+const nsAFlatString&
+EmptyString()
  {
    static const nsDependentString sEmpty(empty_buffer);

    return sEmpty;
  }

-NS_COM const nsAFlatCString& EmptyCString()
+NS_COM
+const nsAFlatCString&
+EmptyCString()
  {
    static const nsDependentCString sEmpty((const char *)empty_buffer);

    return sEmpty;
  }
+
+NS_COM PRInt32
+CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String,
+                   const nsASingleFragmentString& aUTF16String)
+  {
+    // Walks both strings in lock step, comparing one character at a time
+    // without converting either string.  Returns 0 when they are equal, -1
+    // or 1 at the first differing character (or when one string is a
+    // proper prefix of the other), and PR_INT32_MIN when either decoder
+    // reports an error.
+    static const PRUint32 NOT_ASCII = PRUint32(~0x7F);
+
+    const char *narrowPos, *narrowEnd;
+    aUTF8String.BeginReading(narrowPos);
+    aUTF8String.EndReading(narrowEnd);
+
+    const PRUnichar *widePos, *wideEnd;
+    aUTF16String.BeginReading(widePos);
+    aUTF16String.EndReading(wideEnd);
+
+    while (narrowPos != narrowEnd && widePos != wideEnd)
+      {
+        // Go through PRUint8 so that a byte with the high bit set is not
+        // sign-extended when it is widened to PRUint32.
+        PRUint32 narrowChar = (PRUint8)*narrowPos;
+
+        if (!(narrowChar & NOT_ASCII))
+          {
+            // ASCII byte: compare it directly against the current UTF-16
+            // unit and step both cursors by one.
+            if (narrowChar != *widePos)
+              return narrowChar > *widePos ? 1 : -1;
+
+            ++narrowPos;
+            ++widePos;
+            continue;
+          }
+
+        // Multibyte sequence: decode one full character from each side.
+        // The enumerators advance the cursors themselves on success.
+        PRBool err;
+        narrowChar = UTF8CharEnumerator::NextChar(&narrowPos, narrowEnd, &err);
+        if (err)
+          return PR_INT32_MIN;
+
+        PRUint32 wideChar = UTF16CharEnumerator::NextChar(&widePos, wideEnd,
+                                                          &err);
+        if (err)
+          return PR_INT32_MIN;
+
+        if (narrowChar != wideChar)
+          return narrowChar < wideChar ? -1 : 1;
+      }
+
+    // We got to the end of the UTF16 string, but not to the end of the
+    // UTF8 string: the UTF8 string is the longer one.
+    if (narrowPos != narrowEnd)
+      return 1;
+
+    // We got to the end of the UTF8 string, but not to the end of the
+    // UTF16 string: the UTF16 string is the longer one.
+    if (widePos != wideEnd)
+      return -1;
+
+    // Both strings ended together, so they match.
+    return 0;
+  }