From aee1056ad7c2218e58cd3bd2eaf1a541ad97beb0 Mon Sep 17 00:00:00 2001 From: "bzbarsky%mit.edu" Date: Fri, 4 Nov 2005 19:52:18 +0000 Subject: [PATCH] Implement a in-place (no copy) CompareUTF8toUTF16, and use it to make the atom hashtable lookups zero-copy. Patch by jst, bug 314465 (with lots of the discussion in bug 277479), r=bsmedberg,dbaron,brendan (on the PLDHashTable keyhash value assumptions), sr=bzbarsky, moa=shaver. --- xpcom/ds/nsAtomTable.cpp | 286 +++++++++++++----- xpcom/ds/nsCRT.cpp | 94 ++++++ xpcom/ds/nsCRT.h | 6 + xpcom/string/public/nsReadableUtils.h | 12 + xpcom/string/public/nsUTF8Utils.h | 411 ++++++++++++++++++++++---- xpcom/string/src/nsReadableUtils.cpp | 76 ++++- 6 files changed, 753 insertions(+), 132 deletions(-) diff --git a/xpcom/ds/nsAtomTable.cpp b/xpcom/ds/nsAtomTable.cpp index fc92dec81bb7..26a1f9cf208b 100644 --- a/xpcom/ds/nsAtomTable.cpp +++ b/xpcom/ds/nsAtomTable.cpp @@ -43,7 +43,6 @@ #include "nsCRT.h" #include "pldhash.h" #include "prenv.h" -#include "nsVoidArray.h" #define PL_ARENA_CONST_ALIGN_MASK 3 #include "plarena.h" @@ -94,65 +93,140 @@ private: const nsStaticAtom* mStaticAtom; }; -// the atomtableentry can contain either an AtomImpl or a -// nsStaticAtomWrapper, indicated by the first bit of PtrBits -typedef unsigned long PtrBits; +// The |key| pointer in the various PLDHashTable callbacks we use is an +// AtomTableEntry*. These pointers can come from two places: either a +// (probably stack-allocated) string key being passed to PL_DHashTableOperate, +// or an actual entry in the atom table. PLDHashTable reserves the keyHash +// values 0 and 1 for internal use, which means that the *PLDHashTable code* +// will never pass an entry whose keyhash is 0 or 1 to our hooks. That means we +// can use those values to tell whether an AtomTableEntry is a string key +// created by a PLDHashTable code caller or an actual live AtomTableEntry used +// by our PLDHashTable. +// +// Evil? 
Yes, but kinda neat too :-) +// +// An AtomTableEntry is a UTF-8 string key if keyHash is 0, in that +// case mBits points to a UTF-8 encoded char *. If keyHash is 1 the +// AtomTableEntry is a UTF-16 encoded string key and mBits points to a +// UTF-16 encoded PRUnichar *. +// +// If keyHash is any other value (> 1), the AtomTableEntry is an +// actual live entry in the table, and then mBits & ~0x1 in the +// AtomTableEntry points to an AtomImpl or a nsStaticAtomWrapper, +// indicated by the first bit of PtrBits. +typedef PRUword PtrBits; struct AtomTableEntry : public PLDHashEntryHdr { - // mAtom & 0x1 means (mAtom & ~0x1) points to an nsStaticAtomWrapper - // else it points to an nsAtomImpl - PtrBits mAtom; + // If keyHash > 1, mBits & 0x1 means (mBits & ~0x1) points to an + // nsStaticAtomWrapper else it points to an nsAtomImpl + PtrBits mBits; + + inline AtomTableEntry(const char *aString) + : mBits(PtrBits(aString)) + { + keyHash = 0; + } + + inline AtomTableEntry(const PRUnichar *aString) + : mBits(PtrBits(aString)) + { + keyHash = 1; + } inline PRBool IsStaticAtom() const { - return (mAtom & 0x1) != 0; + NS_ASSERTION(keyHash > 1, + "IsStaticAtom() called on non-atom AtomTableEntry!"); + return (mBits & 0x1) != 0; } - + + inline PRBool IsUTF8String() const { + return keyHash == 0; + } + + inline PRBool IsUTF16String() const { + return keyHash == 1; + } + inline void SetAtomImpl(AtomImpl* aAtom) { + NS_ASSERTION(keyHash > 1, + "SetAtomImpl() called on non-atom AtomTableEntry!"); NS_ASSERTION(aAtom, "Setting null atom"); - mAtom = PtrBits(aAtom); + mBits = PtrBits(aAtom); } inline void SetStaticAtomWrapper(nsStaticAtomWrapper* aAtom) { + NS_ASSERTION(keyHash > 1, + "SetStaticAtomWrapper() called on non-atom AtomTableEntry!"); NS_ASSERTION(aAtom, "Setting null atom"); NS_ASSERTION((PtrBits(aAtom) & ~0x1) == PtrBits(aAtom), "Pointers must align or this is broken"); - - mAtom = PtrBits(aAtom) | 0x1; + + mBits = PtrBits(aAtom) | 0x1; } inline void ClearAtom() { - 
mAtom = nsnull; + mBits = nsnull; } inline PRBool HasValue() const { - return (mAtom & ~0x1) != 0; + NS_ASSERTION(keyHash > 1, + "HasValue() called on non-atom AtomTableEntry!"); + return (mBits & ~0x1) != 0; } // these accessors assume that you already know the type inline AtomImpl *GetAtomImpl() const { + NS_ASSERTION(keyHash > 1, + "GetAtomImpl() called on non-atom AtomTableEntry!"); NS_ASSERTION(!IsStaticAtom(), "This is a static atom, not an AtomImpl"); - return (AtomImpl*) (mAtom & ~0x1); + return (AtomImpl*) (mBits & ~0x1); } inline nsStaticAtomWrapper *GetStaticAtomWrapper() const { + NS_ASSERTION(keyHash > 1, + "GetStaticAtomWrapper() called on non-atom AtomTableEntry!"); NS_ASSERTION(IsStaticAtom(), "This is an AtomImpl, not a static atom"); - return (nsStaticAtomWrapper*) (mAtom & ~0x1); + return (nsStaticAtomWrapper*) (mBits & ~0x1); } inline const nsStaticAtom* GetStaticAtom() const { + NS_ASSERTION(keyHash > 1, + "GetStaticAtom() called on non-atom AtomTableEntry!"); return GetStaticAtomWrapper()->GetStaticAtom(); } // type-agnostic accessors // get the string buffer - inline const char* get() const { + inline const char* getAtomString() const { + NS_ASSERTION(keyHash > 1, + "getAtomString() called on non-atom AtomTableEntry!"); + return IsStaticAtom() ? 
GetStaticAtom()->mString : GetAtomImpl()->mString; } + // get the string buffer + inline const char* getUTF8String() const { + NS_ASSERTION(keyHash == 0, + "getUTF8String() called on non-UTF8 AtomTableEntry!"); + + return (char *)mBits; + } + + // get the string buffer + inline const PRUnichar* getUTF16String() const { + NS_ASSERTION(keyHash == 1, + "getUTF16String() called on non-UTF16 AtomTableEntry!"); + + return (PRUnichar *)mBits; + } + // get an addreffed nsIAtom - not using already_AddRef'ed atom // because the callers are not (and should not be) using nsCOMPtr inline nsIAtom* GetAtom() const { + NS_ASSERTION(keyHash > 1, + "GetAtom() called on non-atom AtomTableEntry!"); + nsIAtom* result; if (IsStaticAtom()) @@ -171,17 +245,44 @@ AtomTableGetKey(PLDHashTable *table, PLDHashEntryHdr *entry) { AtomTableEntry *he = NS_STATIC_CAST(AtomTableEntry*, entry); NS_ASSERTION(he->HasValue(), "Empty atom. how did that happen?"); - return he->get(); + return he; +} + +PR_STATIC_CALLBACK(PLDHashNumber) +AtomTableGetHash(PLDHashTable *table, const void *key) +{ + const AtomTableEntry *e = NS_STATIC_CAST(const AtomTableEntry*, key); + + if (e->IsUTF16String()) { + return nsCRT::HashCodeAsUTF8(e->getUTF16String()); + } + + NS_ASSERTION(e->IsUTF8String(), + "AtomTableGetHash() called on non-string-key AtomTableEntry!"); + + return nsCRT::HashCode(e->getUTF8String()); } PR_STATIC_CALLBACK(PRBool) -AtomTableMatchKey(PLDHashTable *table, - const PLDHashEntryHdr *entry, +AtomTableMatchKey(PLDHashTable *table, const PLDHashEntryHdr *entry, const void *key) { const AtomTableEntry *he = NS_STATIC_CAST(const AtomTableEntry*, entry); - const char* keyStr = NS_STATIC_CAST(const char*, key); - return nsCRT::strcmp(keyStr, he->get()) == 0; + const AtomTableEntry *strKey = NS_STATIC_CAST(const AtomTableEntry*, key); + + const char *atomString = he->getAtomString(); + + if (strKey->IsUTF16String()) { + return + CompareUTF8toUTF16(nsDependentCString(atomString), + 
nsDependentString(strKey->getUTF16String())) == 0; + } + + if (strKey->IsUTF8String()) { + return strcmp(atomString, strKey->getUTF8String()) == 0; + } + + return strcmp(atomString, strKey->getAtomString()) == 0; } PR_STATIC_CALLBACK(void) @@ -189,8 +290,6 @@ AtomTableClearEntry(PLDHashTable *table, PLDHashEntryHdr *entry) { AtomTableEntry *he = NS_STATIC_CAST(AtomTableEntry*, entry); - he->keyHash = 0; - if (!he->IsStaticAtom()) { AtomImpl *atom = he->GetAtomImpl(); // Normal |AtomImpl| atoms are deleted when their refcount hits 0, and @@ -199,8 +298,11 @@ AtomTableClearEntry(PLDHashTable *table, PLDHashEntryHdr *entry) // |PermanentAtomImpl| permanent atoms ignore their refcount and are // deleted when they are removed from the table at table destruction. // In other words, they are owned by the atom table. - if (atom->IsPermanent()) + if (atom->IsPermanent()) { + he->keyHash = 0; + delete NS_STATIC_CAST(PermanentAtomImpl*, atom); + } } else { he->GetStaticAtomWrapper()->~nsStaticAtomWrapper(); @@ -213,7 +315,7 @@ static const PLDHashTableOps AtomTableOps = { PL_DHashAllocTable, PL_DHashFreeTable, AtomTableGetKey, - PL_DHashStringKey, + AtomTableGetHash, AtomTableMatchKey, PL_DHashMoveEntryStub, AtomTableClearEntry, @@ -260,7 +362,8 @@ void PromoteToPermanent(AtomImpl* aAtom) aAtom = new (aAtom) PermanentAtomImpl(); } -void NS_PurgeAtomTable() +void +NS_PurgeAtomTable() { if (gAtomTable.ops) { #ifdef DEBUG @@ -295,7 +398,8 @@ AtomImpl::~AtomImpl() // don't want to remove them twice. See comment above in // |AtomTableClearEntry|. 
if (!IsPermanent()) { - PL_DHashTableOperate(&gAtomTable, mString, PL_DHASH_REMOVE); + AtomTableEntry key(mString); + PL_DHashTableOperate(&gAtomTable, &key, PL_DHASH_REMOVE); if (gAtomTable.entryCount == 0) { PL_DHashTableFinish(&gAtomTable); NS_ASSERTION(gAtomTable.entryCount == 0, @@ -388,7 +492,8 @@ AtomImpl::EqualsUTF8(const nsACString& aString, PRBool* aResult) NS_IMETHODIMP AtomImpl::Equals(const nsAString& aString, PRBool* aResult) { - *aResult = NS_ConvertUTF16toUTF8(aString).Equals(mString); + *aResult = CompareUTF8toUTF16(nsDependentCString(mString), + PromiseFlatString(aString)) == 0; return NS_OK; } @@ -445,21 +550,12 @@ nsStaticAtomWrapper::EqualsUTF8(const nsACString& aString, PRBool* aResult) NS_IMETHODIMP nsStaticAtomWrapper::Equals(const nsAString& aString, PRBool* aResult) { - *aResult = NS_ConvertUCS2toUTF8(aString).Equals(mStaticAtom->mString); + *aResult = CompareUTF8toUTF16(nsDependentCString(mStaticAtom->mString), + PromiseFlatString(aString)) == 0; return NS_OK; } //---------------------------------------------------------------------- -NS_COM nsIAtom* NS_NewAtom(const char* isolatin1) -{ - return NS_NewAtom(nsDependentCString(isolatin1)); -} - -NS_COM nsIAtom* NS_NewPermanentAtom(const char* isolatin1) -{ - return NS_NewPermanentAtom(NS_ConvertASCIItoUCS2(isolatin1)); -} - static nsStaticAtomWrapper* WrapStaticAtom(const nsStaticAtom* aAtom) { @@ -480,7 +576,8 @@ WrapStaticAtom(const nsStaticAtom* aAtom) return wrapper; } -static AtomTableEntry* GetAtomHashEntry(const char* aString) +static inline AtomTableEntry* +GetAtomHashEntry(const char* aString) { if (!gAtomTable.ops && !PL_DHashTableInit(&gAtomTable, &AtomTableOps, 0, @@ -488,10 +585,25 @@ static AtomTableEntry* GetAtomHashEntry(const char* aString) gAtomTable.ops = nsnull; return nsnull; } + + AtomTableEntry key(aString); return NS_STATIC_CAST(AtomTableEntry*, - PL_DHashTableOperate(&gAtomTable, - aString, - PL_DHASH_ADD)); + PL_DHashTableOperate(&gAtomTable, &key, PL_DHASH_ADD)); 
+} + +static inline AtomTableEntry* +GetAtomHashEntry(const PRUnichar* aString) +{ + if (!gAtomTable.ops && + !PL_DHashTableInit(&gAtomTable, &AtomTableOps, 0, + sizeof(AtomTableEntry), 2048)) { + gAtomTable.ops = nsnull; + return nsnull; + } + + AtomTableEntry key(aString); + return NS_STATIC_CAST(AtomTableEntry*, + PL_DHashTableOperate(&gAtomTable, &key, PL_DHASH_ADD)); } NS_COM nsresult @@ -499,7 +611,7 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount) { // this does two things: // 1) wraps each static atom in a wrapper, if necessary - // 2) initializes the address pointed to by each mAtom slot + // 2) initializes the address pointed to by each mBits slot for (PRUint32 i=0; iHasValue() && aAtoms[i].mAtom) { // there already is an atom with this name in the table.. but we - // still have to update mAtom + // still have to update mBits if (!he->IsStaticAtom() && !he->GetAtomImpl()->IsPermanent()) { // since we wanted to create a static atom but there is // already one there, we convert it to a non-refcounting @@ -536,23 +648,25 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount) return NS_OK; } -NS_COM nsIAtom* NS_NewAtom( const nsAString& aString ) +NS_COM nsIAtom* +NS_NewAtom(const char* aUTF8String) { - NS_ConvertUCS2toUTF8 utf8String(aString); + AtomTableEntry *he = GetAtomHashEntry(aUTF8String); - return NS_NewAtom(utf8String); -} + if (!he) { + return nsnull; + } -NS_COM -nsIAtom* -NS_NewAtom( const nsACString& aString ) -{ - AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aString).get()); + NS_ASSERTION(!he->IsUTF8String() && !he->IsUTF16String(), + "Atom hash entry is string? Should be atom!"); if (he->HasValue()) return he->GetAtom(); - AtomImpl* atom = new (aString) AtomImpl(); + // MSVC.NET doesn't like passing a temporary nsDependentCString() to + // operator new, so declare one as a local instead. 
+ nsDependentCString str(aUTF8String); + AtomImpl* atom = new (str) AtomImpl(); he->SetAtomImpl(atom); if (!atom) { PL_DHashTableRawRemove(&gAtomTable, he); @@ -563,15 +677,50 @@ NS_NewAtom( const nsACString& aString ) return atom; } -NS_COM nsIAtom* NS_NewPermanentAtom( const nsAString& aString ) +NS_COM nsIAtom* +NS_NewAtom(const nsACString& aUTF8String) { - return NS_NewPermanentAtom(NS_ConvertUCS2toUTF8(aString)); + return NS_NewAtom(PromiseFlatCString(aUTF8String).get()); } -NS_COM -nsIAtom* NS_NewPermanentAtom( const nsACString& aString ) +NS_COM nsIAtom* +NS_NewAtom(const PRUnichar* aUTF16String) { - AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aString).get()); + AtomTableEntry *he = GetAtomHashEntry(aUTF16String); + + if (!he || he->HasValue()) // GetAtomHashEntry() yields nsnull if the table can't be initialized + return he ? he->GetAtom() : nsnull; + + // MSVC.NET doesn't like passing a temporary NS_ConvertUTF16toUTF8() to + // operator new, so declare one as a local instead. + NS_ConvertUTF16toUTF8 str(aUTF16String); + AtomImpl* atom = new (str) AtomImpl(); + he->SetAtomImpl(atom); + if (!atom) { + PL_DHashTableRawRemove(&gAtomTable, he); + return nsnull; + } + + NS_ADDREF(atom); + return atom; +} + +NS_COM nsIAtom* +NS_NewAtom(const nsAString& aUTF16String) +{ + return NS_NewAtom(PromiseFlatString(aUTF16String).get()); +} + +NS_COM nsIAtom* +NS_NewPermanentAtom(const char* aUTF8String) +{ + return NS_NewPermanentAtom(nsDependentCString(aUTF8String)); +} + +NS_COM nsIAtom* +NS_NewPermanentAtom(const nsACString& aUTF8String) +{ + AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aUTF8String).get()); if (he->HasValue() && he->IsStaticAtom()) return he->GetStaticAtomWrapper(); @@ -587,7 +736,7 @@ nsIAtom* NS_NewPermanentAtom( const nsACString& aString ) } } else { // otherwise, make a new atom - atom = new (aString) PermanentAtomImpl(); + atom = new (aUTF8String) PermanentAtomImpl(); he->SetAtomImpl(atom); if ( !atom ) { PL_DHashTableRawRemove(&gAtomTable, he); @@ -599,17 +748,20 @@ nsIAtom* NS_NewPermanentAtom( const 
nsACString& aString ) return atom; } -NS_COM nsIAtom* NS_NewAtom( const PRUnichar* str ) +NS_COM nsIAtom* +NS_NewPermanentAtom(const nsAString& aUTF16String) { - return NS_NewAtom(NS_ConvertUCS2toUTF8(str)); + return NS_NewPermanentAtom(NS_ConvertUTF16toUTF8(aUTF16String)); } -NS_COM nsIAtom* NS_NewPermanentAtom( const PRUnichar* str ) +NS_COM nsIAtom* +NS_NewPermanentAtom(const PRUnichar* aUTF16String) { - return NS_NewPermanentAtom(nsDependentString(str)); + return NS_NewPermanentAtom(NS_ConvertUTF16toUTF8(aUTF16String)); } -NS_COM nsrefcnt NS_GetNumberOfAtoms(void) +NS_COM nsrefcnt +NS_GetNumberOfAtoms(void) { return gAtomTable.entryCount; } diff --git a/xpcom/ds/nsCRT.cpp b/xpcom/ds/nsCRT.cpp index 600af37e80f8..ca716f4e6d53 100644 --- a/xpcom/ds/nsCRT.cpp +++ b/xpcom/ds/nsCRT.cpp @@ -300,6 +300,100 @@ PRUint32 nsCRT::HashCode(const PRUnichar* str, PRUint32* resultingStrLen) return h; } +PRUint32 nsCRT::HashCodeAsUTF8(const PRUnichar* str, PRUint32* resultingStrLen) +{ + PRUint32 h = 0; + const PRUnichar* s = str; + + { + PRUint16 W1 = 0; // the first UTF-16 word in a two word tuple + PRUint32 U = 0; // the current char as UCS-4 + int code_length = 0; // the number of bytes in the UTF-8 sequence for the current char + + PRUint16 W; + while ( (W = *s++) ) + { + /* + * On the fly, decoding from UTF-16 (and/or UCS-2) into UTF-8 as per + * http://www.ietf.org/rfc/rfc2781.txt + * http://www.ietf.org/rfc/rfc2279.txt + */ + + if ( !W1 ) + { + if ( W < 0xD800 || 0xDFFF < W ) + { + U = W; + if ( W <= 0x007F ) + code_length = 1; + else if ( W <= 0x07FF ) + code_length = 2; + else + code_length = 3; + } + else if ( /* 0xD800 <= W1 && */ W <= 0xDBFF ) + W1 = W; +#ifdef DEBUG + else NS_ERROR("Got low surrogate but no previous high surrogate"); +#endif + } + else + { + // as required by the standard, this code is careful to + // throw out illegal sequences + + if ( 0xDC00 <= W && W <= 0xDFFF ) + { + U = 0x10000 + PRUint32( (W1&0x03FF)<<10 | (W&0x03FF) ); // RFC 2781: 10 payload bits per surrogate, offset by 0x10000 + if ( U <= 0x001FFFFF ) + 
code_length = 4; + else if ( U <= 0x3FFFFFF ) + code_length = 5; + else + code_length = 6; + } +#ifdef DEBUG + else NS_ERROR("High surrogate not followed by low surrogate"); +#endif + W1 = 0; + } + + + if ( code_length > 0 ) + { + static const PRUint16 sBytePrefix[7] = { 0x0000, 0x0000, 0x00C0, 0x00E0, 0x00F0, 0x00F8, 0x00FC }; + static const PRUint16 sShift[7] = { 0, 0, 6, 12, 18, 24, 30 }; + + /* + * Unlike the algorithm in http://www.ietf.org/rfc/rfc2279.txt + * we must calculate the bytes in left to right order so that + * our hash result matches what the narrow version would calculate + * on an already UTF-8 string. + */ + + // hash the first (and often, only, byte) + h = (h>>28) ^ (h<<4) ^ (sBytePrefix[code_length] | (U>>sShift[code_length])); + + // an unrolled loop for hashing any remaining bytes in this sequence + switch ( code_length ) + { // falling through in each case + case 6: h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>24) & 0x003F)); + case 5: h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>18) & 0x003F)); + case 4: h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>12) & 0x003F)); + case 3: h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>6 ) & 0x003F)); + case 2: h = (h>>28) ^ (h<<4) ^ (0x80 | ( U & 0x003F)); + default: code_length = 0; + break; + } + } + } + } + + if ( resultingStrLen ) + *resultingStrLen = (s-str)-1; + return h; +} + PRUint32 nsCRT::BufferHashCode(const PRUnichar* s, PRUint32 len) { PRUint32 h = 0; diff --git a/xpcom/ds/nsCRT.h b/xpcom/ds/nsCRT.h index 0287b1ec4e0c..c620ab097d38 100644 --- a/xpcom/ds/nsCRT.h +++ b/xpcom/ds/nsCRT.h @@ -228,6 +228,12 @@ public: static PRUint32 HashCode(const PRUnichar* str, PRUint32* resultingStrLen = nsnull); + // Computes a hashcode for a ucs2 string that returns the same thing + // as the HashCode method taking a |char*| would if the string were + // converted to UTF8. Returns the string length as an added bonus. 
+ static PRUint32 HashCodeAsUTF8(const PRUnichar* str, + PRUint32* resultingStrLen = nsnull); + // Computes the hashcode for a buffer with a specified length. static PRUint32 BufferHashCode(const PRUnichar* str, PRUint32 strLen); diff --git a/xpcom/string/public/nsReadableUtils.h b/xpcom/string/public/nsReadableUtils.h index f20f9dbb2ce5..799e846566cb 100755 --- a/xpcom/string/public/nsReadableUtils.h +++ b/xpcom/string/public/nsReadableUtils.h @@ -364,4 +364,16 @@ NS_COM const nsAFlatString& EmptyString(); NS_COM const nsAFlatCString& EmptyCString(); + /** + * Compare a UTF-8 string to a UTF-16 string. + * + * Returns 0 if the strings are equal, -1 if aUTF8String is less + * than aUTF16String, and 1 in the reverse case. In case of fatal + * error (eg the strings are not valid UTF8 and UTF16 respectively), + * this method will return PR_INT32_MIN. + */ +NS_COM PRInt32 +CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String, + const nsASingleFragmentString& aUTF16String); + #endif // !defined(nsReadableUtils_h___) diff --git a/xpcom/string/public/nsUTF8Utils.h b/xpcom/string/public/nsUTF8Utils.h index 194b59cca958..6511389f3328 100644 --- a/xpcom/string/public/nsUTF8Utils.h +++ b/xpcom/string/public/nsUTF8Utils.h @@ -35,7 +35,6 @@ * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ - #ifndef nsUTF8Utils_h_ #define nsUTF8Utils_h_ @@ -60,6 +59,349 @@ class UTF8traits #define NS_ALWAYS_INLINE #endif +/** + * Extract the next UCS-4 character from the buffer and return it. The + * pointer passed in is advanced to the start of the next character in the + * buffer. If non-null, the parameters err and overlong are filled in to + * indicate that the character was represented by an overlong sequence, or + * that an error occurred. 
+ */ + +class UTF8CharEnumerator +{ +public: + static PRUint32 NextChar(const char **buffer, const char *end, + PRBool *err = nsnull, PRBool* overlong = nsnull) + { + NS_ASSERTION(buffer && *buffer, "null buffer!"); + + const char *p = *buffer; + + if (p >= end) + { + if (err) + *err = PR_TRUE; + + return 0; + } + + char c = *p++; + + if ( UTF8traits::isASCII(c) ) + { + if (err) + *err = PR_FALSE; + if (overlong) + *overlong = PR_FALSE; + *buffer = p; + return c; + } + + PRUint32 ucs4; + PRUint32 minUcs4; + PRInt32 state = 0; + + if (!CalcState(c, ucs4, minUcs4, state)) { + NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); + if (err) + *err = PR_TRUE; + return 0; + } + + while ( state-- ) + { + if (p == end) + { + if (err) + *err = PR_TRUE; + + return 0; + } + + c = *p++; + + if (!AddByte(c, state, ucs4)) + { + NS_ERROR("not a UTF8 string"); + if (err) + *err = PR_TRUE; + return 0; + } + } + + if (err) + *err = PR_FALSE; + if (overlong) + *overlong = ucs4 < minUcs4; + *buffer = p; + return ucs4; + } + + static PRUint32 NextChar(nsACString::const_iterator& iter, + const nsACString::const_iterator& end, + PRBool *err = nsnull, PRBool *overlong = nsnull) + { + if ( iter == end ) + { + NS_ERROR("No input to work with"); + if (err) + *err = PR_TRUE; + + return 0; + } + + char c = *iter++; + + if ( UTF8traits::isASCII(c) ) + { + if (err) + *err = PR_FALSE; + if (overlong) + *overlong = PR_FALSE; + return c; + } + + PRUint32 ucs4; + PRUint32 minUcs4; + PRInt32 state = 0; + + if (!CalcState(c, ucs4, minUcs4, state)) { + NS_ERROR("Not a UTF-8 string. 
This code should only be used for converting from known UTF-8 strings."); + if (err) + *err = PR_TRUE; + return 0; + } + + while ( state-- ) + { + if (iter == end) + { + NS_ERROR("Buffer ended in the middle of a multibyte sequence"); + if (err) + *err = PR_TRUE; + + return 0; + } + + c = *iter++; + + if (!AddByte(c, state, ucs4)) + { + NS_ERROR("not a UTF8 string"); + if (err) + *err = PR_TRUE; + return 0; + } + } + + if (err) + *err = PR_FALSE; + if (overlong) + *overlong = ucs4 < minUcs4; + return ucs4; + } + +private: + static PRBool CalcState(char c, PRUint32& ucs4, PRUint32& minUcs4, + PRInt32& state) + { + if ( UTF8traits::is2byte(c) ) + { + ucs4 = (PRUint32(c) << 6) & 0x000007C0L; + state = 1; + minUcs4 = 0x00000080; + } + else if ( UTF8traits::is3byte(c) ) + { + ucs4 = (PRUint32(c) << 12) & 0x0000F000L; + state = 2; + minUcs4 = 0x00000800; + } + else if ( UTF8traits::is4byte(c) ) + { + ucs4 = (PRUint32(c) << 18) & 0x001F0000L; + state = 3; + minUcs4 = 0x00010000; + } + else if ( UTF8traits::is5byte(c) ) + { + ucs4 = (PRUint32(c) << 24) & 0x03000000L; + state = 4; + minUcs4 = 0x00200000; + } + else if ( UTF8traits::is6byte(c) ) + { + ucs4 = (PRUint32(c) << 30) & 0x40000000L; + state = 5; + minUcs4 = 0x04000000; + } + else + { + return PR_FALSE; + } + + return PR_TRUE; + } + + static PRBool AddByte(char c, PRInt32 state, PRUint32& ucs4) + { + if ( UTF8traits::isInSeq(c) ) + { + PRInt32 shift = state * 6; + ucs4 |= (PRUint32(c) & 0x3F) << shift; + return PR_TRUE; + } + + return PR_FALSE; + } +}; + + +/** + * Extract the next UCS-4 character from the buffer and return it. The + * pointer passed in is advanced to the start of the next character in the + * buffer. If non-null, the err parameter is filled in if an error occurs. 
+ */ + + +class UTF16CharEnumerator +{ +public: + static PRUint32 NextChar(const PRUnichar **buffer, const PRUnichar *end, + PRBool *err = nsnull) + { + NS_ASSERTION(buffer && *buffer, "null buffer!"); + + const PRUnichar *p = *buffer; + + if (p >= end) + { + NS_ERROR("No input to work with"); + if (err) + *err = PR_TRUE; + + return 0; + } + + PRUnichar c = *p++; + + if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF + { + if (err) + *err = PR_FALSE; + *buffer = p; + return c; + } + else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + { + if (p == end) // p is already past the high surrogate; *buffer still points at it + { + NS_ERROR("Unexpected end of buffer after high surrogate"); + if (err) + *err = PR_TRUE; + + return 0; + } + + // D800- DBFF - High Surrogate + // N = (H- D800) *400 + 10000 + ... + PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10); + + c = *p++; + + if (0xDC00 == (0xFC00 & c)) + { + // DC00- DFFF - Low Surrogate + // N += ( L - DC00 ) + ucs4 |= (0x03FF & c); + if (err) + *err = PR_FALSE; + *buffer = p; + return ucs4; + } + else + { + NS_ERROR("got a High Surrogate but no low surrogate"); + // output nothing. + } + } + else // U+DC00 - U+DFFF + { + // DC00- DFFF - Low Surrogate + NS_ERROR("got a low Surrogate but no high surrogate"); + // output nothing. + } + + if (err) + *err = PR_TRUE; + return 0; + } + + static PRUint32 NextChar(nsAString::const_iterator& iter, + const nsAString::const_iterator& end, + PRBool *err = nsnull) + { + if (iter == end) + { + if (err) + *err = PR_TRUE; + + return 0; + } + + PRUnichar c = *iter++; + + if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF + { + if (err) + *err = PR_FALSE; + return c; + } + else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + { + if (iter == end) + { + if (err) + *err = PR_TRUE; + + return 0; + } + + // D800- DBFF - High Surrogate + // N = (H- D800) *400 + 10000 + ... 
+ PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10); + + c = *iter++; + + if (0xDC00 == (0xFC00 & c)) + { + // DC00- DFFF - Low Surrogate + // N += ( L - DC00 ) + ucs4 |= (0x03FF & c); + if (err) + *err = PR_FALSE; + return ucs4; + } + else + { + NS_ERROR("got a High Surrogate but no low surrogate"); + // output nothing. + } + } + else // U+DC00 - U+DFFF + { + // DC00- DFFF - Low Surrogate + NS_ERROR("got a low Surrogate but no high surrogate"); + // output nothing. + } + + if (err) + *err = PR_TRUE; + return 0; + } +}; + + /** * A character sink (see |copy_string| in nsAlgorithm.h) for converting * UTF-8 to UTF-16 @@ -87,75 +429,18 @@ class ConvertUTF8toUTF16 buffer_type* out = mBuffer; for ( ; p != end /* && *p */; ) { - char c = *p++; + PRBool overlong, err; + PRUint32 ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err, + &overlong); - if ( UTF8traits::isASCII(c) ) + if ( err ) { - *out++ = buffer_type(c); - continue; - } - - PRUint32 ucs4; - PRUint32 minUcs4; - PRInt32 state = 0; - - if ( UTF8traits::is2byte(c) ) - { - ucs4 = (PRUint32(c) << 6) & 0x000007C0L; - state = 1; - minUcs4 = 0x00000080; - } - else if ( UTF8traits::is3byte(c) ) - { - ucs4 = (PRUint32(c) << 12) & 0x0000F000L; - state = 2; - minUcs4 = 0x00000800; - } - else if ( UTF8traits::is4byte(c) ) - { - ucs4 = (PRUint32(c) << 18) & 0x001F0000L; - state = 3; - minUcs4 = 0x00010000; - } - else if ( UTF8traits::is5byte(c) ) - { - ucs4 = (PRUint32(c) << 24) & 0x03000000L; - state = 4; - minUcs4 = 0x00200000; - } - else if ( UTF8traits::is6byte(c) ) - { - ucs4 = (PRUint32(c) << 30) & 0x40000000L; - state = 5; - minUcs4 = 0x04000000; - } - else - { - NS_ERROR("Not a UTF-8 string. 
This code should only be used for converting from known UTF-8 strings."); mErrorEncountered = PR_TRUE; mBuffer = out; return N; } - while ( state-- ) - { - c = *p++; - - if ( UTF8traits::isInSeq(c) ) - { - PRInt32 shift = state * 6; - ucs4 |= (PRUint32(c) & 0x3F) << shift; - } - else - { - NS_ERROR("not a UTF8 string"); - mErrorEncountered = PR_TRUE; - mBuffer = out; - return N; - } - } - - if ( ucs4 < minUcs4 ) + if ( overlong ) { // Overlong sequence *out++ = UCS2_REPLACEMENT_CHAR; diff --git a/xpcom/string/src/nsReadableUtils.cpp b/xpcom/string/src/nsReadableUtils.cpp index 3d56e4fd3686..c10def95e752 100755 --- a/xpcom/string/src/nsReadableUtils.cpp +++ b/xpcom/string/src/nsReadableUtils.cpp @@ -1081,16 +1081,88 @@ StringEndsWith( const nsACString& aSource, const nsACString& aSubstring, static const PRUnichar empty_buffer[1] = { '\0' }; -NS_COM const nsAFlatString& EmptyString() +NS_COM +const nsAFlatString& +EmptyString() { static const nsDependentString sEmpty(empty_buffer); return sEmpty; } -NS_COM const nsAFlatCString& EmptyCString() +NS_COM +const nsAFlatCString& +EmptyCString() { static const nsDependentCString sEmpty((const char *)empty_buffer); return sEmpty; } + +NS_COM PRInt32 +CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String, + const nsASingleFragmentString& aUTF16String) + { + static const PRUint32 NOT_ASCII = PRUint32(~0x7F); + + const char *u8, *u8end; + aUTF8String.BeginReading(u8); + aUTF8String.EndReading(u8end); + + const PRUnichar *u16, *u16end; + aUTF16String.BeginReading(u16); + aUTF16String.EndReading(u16end); + + while (u8 != u8end && u16 != u16end) + { + // Cast away the signedness of *u8 to prevent signextension when + // converting to PRUint32 + PRUint32 c8_32 = (PRUint8)*u8; + + if (c8_32 & NOT_ASCII) + { + PRBool err; + c8_32 = UTF8CharEnumerator::NextChar(&u8, u8end, &err); + if (err) + return PR_INT32_MIN; + + PRUint32 c16_32 = UTF16CharEnumerator::NextChar(&u16, u16end, + &err); + if (err) + return PR_INT32_MIN; + + 
if (c8_32 != c16_32) + return c8_32 < c16_32 ? -1 : 1; + } + else + { + if (c8_32 != *u16) + return c8_32 > *u16 ? 1 : -1; + + ++u8; + ++u16; + } + } + + if (u8 != u8end) + { + // We get to the end of the UTF16 string, but not to the end of + // the UTF8 string. The UTF8 string is longer than the UTF16 + // string + + return 1; + } + + if (u16 != u16end) + { + // We get to the end of the UTF8 string, but not to the end of + // the UTF16 string. The UTF16 string is longer than the UTF8 + // string + + return -1; + } + + // The two strings match. + + return 0; + }