mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-29 07:42:04 +00:00
Implement an in-place (no copy) CompareUTF8toUTF16, and use it to make the atom
hashtable lookups zero-copy. Patch by jst, bug 314465 (with lots of the discussion in bug 277479), r=bsmedberg,dbaron,brendan (on the PLDHashTable keyhash value assumptions), sr=bzbarsky, moa=shaver.
This commit is contained in:
parent
e7a8486231
commit
aee1056ad7
@ -43,7 +43,6 @@
|
||||
#include "nsCRT.h"
|
||||
#include "pldhash.h"
|
||||
#include "prenv.h"
|
||||
#include "nsVoidArray.h"
|
||||
|
||||
#define PL_ARENA_CONST_ALIGN_MASK 3
|
||||
#include "plarena.h"
|
||||
@ -94,65 +93,140 @@ private:
|
||||
const nsStaticAtom* mStaticAtom;
|
||||
};
|
||||
|
||||
// the atomtableentry can contain either an AtomImpl or a
|
||||
// nsStaticAtomWrapper, indicated by the first bit of PtrBits
|
||||
typedef unsigned long PtrBits;
|
||||
// The |key| pointer in the various PLDHashTable callbacks we use is an
|
||||
// AtomTableEntry*. These pointers can come from two places: either a
|
||||
// (probably stack-allocated) string key being passed to PL_DHashTableOperate,
|
||||
// or an actual entry in the atom table. PLDHashTable reserves the keyHash
|
||||
// values 0 and 1 for internal use, which means that the *PLDHashTable code*
|
||||
// will never pass an entry whose keyhash is 0 or 1 to our hooks. That means we
|
||||
// can use those values to tell whether an AtomTableEntry is a string key
|
||||
// created by a PLDHashTable code caller or an actual live AtomTableEntry used
|
||||
// by our PLDHashTable.
|
||||
//
|
||||
// Evil? Yes, but kinda neat too :-)
|
||||
//
|
||||
// An AtomTableEntry is a UTF-8 string key if keyHash is 0, in that
|
||||
// case mBits points to a UTF-8 encoded char *. If keyHash is 1 the
|
||||
// AtomTableEntry is a UTF-16 encoded string key and mBits points to a
|
||||
// UTF-16 encoded PRUnichar *.
|
||||
//
|
||||
// If keyHash is any other value (> 1), the AtomTableEntry is an
|
||||
// actual live entry in the table, and then mBits & ~0x1 in the
|
||||
// AtomTableEntry points to an AtomImpl or a nsStaticAtomWrapper,
|
||||
// indicated by the first bit of PtrBits.
|
||||
typedef PRUword PtrBits;
|
||||
|
||||
struct AtomTableEntry : public PLDHashEntryHdr {
|
||||
// mAtom & 0x1 means (mAtom & ~0x1) points to an nsStaticAtomWrapper
|
||||
// else it points to an nsAtomImpl
|
||||
PtrBits mAtom;
|
||||
// If keyHash > 1, mBits & 0x1 means (mBits & ~0x1) points to an
|
||||
// nsStaticAtomWrapper else it points to an nsAtomImpl
|
||||
PtrBits mBits;
|
||||
|
||||
inline AtomTableEntry(const char *aString)
|
||||
: mBits(PtrBits(aString))
|
||||
{
|
||||
keyHash = 0;
|
||||
}
|
||||
|
||||
inline AtomTableEntry(const PRUnichar *aString)
|
||||
: mBits(PtrBits(aString))
|
||||
{
|
||||
keyHash = 1;
|
||||
}
|
||||
|
||||
inline PRBool IsStaticAtom() const {
|
||||
return (mAtom & 0x1) != 0;
|
||||
NS_ASSERTION(keyHash > 1,
|
||||
"IsStaticAtom() called on non-atom AtomTableEntry!");
|
||||
return (mBits & 0x1) != 0;
|
||||
}
|
||||
|
||||
|
||||
inline PRBool IsUTF8String() const {
|
||||
return keyHash == 0;
|
||||
}
|
||||
|
||||
inline PRBool IsUTF16String() const {
|
||||
return keyHash == 1;
|
||||
}
|
||||
|
||||
inline void SetAtomImpl(AtomImpl* aAtom) {
|
||||
NS_ASSERTION(keyHash > 1,
|
||||
"SetAtomImpl() called on non-atom AtomTableEntry!");
|
||||
NS_ASSERTION(aAtom, "Setting null atom");
|
||||
mAtom = PtrBits(aAtom);
|
||||
mBits = PtrBits(aAtom);
|
||||
}
|
||||
|
||||
inline void SetStaticAtomWrapper(nsStaticAtomWrapper* aAtom) {
|
||||
NS_ASSERTION(keyHash > 1,
|
||||
"SetStaticAtomWrapper() called on non-atom AtomTableEntry!");
|
||||
NS_ASSERTION(aAtom, "Setting null atom");
|
||||
NS_ASSERTION((PtrBits(aAtom) & ~0x1) == PtrBits(aAtom),
|
||||
"Pointers must align or this is broken");
|
||||
|
||||
mAtom = PtrBits(aAtom) | 0x1;
|
||||
|
||||
mBits = PtrBits(aAtom) | 0x1;
|
||||
}
|
||||
|
||||
inline void ClearAtom() {
|
||||
mAtom = nsnull;
|
||||
mBits = nsnull;
|
||||
}
|
||||
|
||||
inline PRBool HasValue() const {
|
||||
return (mAtom & ~0x1) != 0;
|
||||
NS_ASSERTION(keyHash > 1,
|
||||
"HasValue() called on non-atom AtomTableEntry!");
|
||||
return (mBits & ~0x1) != 0;
|
||||
}
|
||||
|
||||
// these accessors assume that you already know the type
|
||||
inline AtomImpl *GetAtomImpl() const {
|
||||
NS_ASSERTION(keyHash > 1,
|
||||
"GetAtomImpl() called on non-atom AtomTableEntry!");
|
||||
NS_ASSERTION(!IsStaticAtom(), "This is a static atom, not an AtomImpl");
|
||||
return (AtomImpl*) (mAtom & ~0x1);
|
||||
return (AtomImpl*) (mBits & ~0x1);
|
||||
}
|
||||
|
||||
inline nsStaticAtomWrapper *GetStaticAtomWrapper() const {
|
||||
NS_ASSERTION(keyHash > 1,
|
||||
"GetStaticAtomWrapper() called on non-atom AtomTableEntry!");
|
||||
NS_ASSERTION(IsStaticAtom(), "This is an AtomImpl, not a static atom");
|
||||
return (nsStaticAtomWrapper*) (mAtom & ~0x1);
|
||||
return (nsStaticAtomWrapper*) (mBits & ~0x1);
|
||||
}
|
||||
|
||||
inline const nsStaticAtom* GetStaticAtom() const {
|
||||
NS_ASSERTION(keyHash > 1,
|
||||
"GetStaticAtom() called on non-atom AtomTableEntry!");
|
||||
return GetStaticAtomWrapper()->GetStaticAtom();
|
||||
}
|
||||
|
||||
// type-agnostic accessors
|
||||
|
||||
// get the string buffer
|
||||
inline const char* get() const {
|
||||
inline const char* getAtomString() const {
|
||||
NS_ASSERTION(keyHash > 1,
|
||||
"getAtomString() called on non-atom AtomTableEntry!");
|
||||
|
||||
return IsStaticAtom() ? GetStaticAtom()->mString : GetAtomImpl()->mString;
|
||||
}
|
||||
|
||||
// get the string buffer
|
||||
inline const char* getUTF8String() const {
|
||||
NS_ASSERTION(keyHash == 0,
|
||||
"getUTF8String() called on non-UTF8 AtomTableEntry!");
|
||||
|
||||
return (char *)mBits;
|
||||
}
|
||||
|
||||
// get the string buffer
|
||||
inline const PRUnichar* getUTF16String() const {
|
||||
NS_ASSERTION(keyHash == 1,
|
||||
"getUTF16String() called on non-UTF16 AtomTableEntry!");
|
||||
|
||||
return (PRUnichar *)mBits;
|
||||
}
|
||||
|
||||
// get an addreffed nsIAtom - not using already_AddRef'ed atom
|
||||
// because the callers are not (and should not be) using nsCOMPtr
|
||||
inline nsIAtom* GetAtom() const {
|
||||
NS_ASSERTION(keyHash > 1,
|
||||
"GetAtom() called on non-atom AtomTableEntry!");
|
||||
|
||||
nsIAtom* result;
|
||||
|
||||
if (IsStaticAtom())
|
||||
@ -171,17 +245,44 @@ AtomTableGetKey(PLDHashTable *table, PLDHashEntryHdr *entry)
|
||||
{
|
||||
AtomTableEntry *he = NS_STATIC_CAST(AtomTableEntry*, entry);
|
||||
NS_ASSERTION(he->HasValue(), "Empty atom. how did that happen?");
|
||||
return he->get();
|
||||
return he;
|
||||
}
|
||||
|
||||
PR_STATIC_CALLBACK(PLDHashNumber)
|
||||
AtomTableGetHash(PLDHashTable *table, const void *key)
|
||||
{
|
||||
const AtomTableEntry *e = NS_STATIC_CAST(const AtomTableEntry*, key);
|
||||
|
||||
if (e->IsUTF16String()) {
|
||||
return nsCRT::HashCodeAsUTF8(e->getUTF16String());
|
||||
}
|
||||
|
||||
NS_ASSERTION(e->IsUTF8String(),
|
||||
"AtomTableGetHash() called on non-string-key AtomTableEntry!");
|
||||
|
||||
return nsCRT::HashCode(e->getUTF8String());
|
||||
}
|
||||
|
||||
PR_STATIC_CALLBACK(PRBool)
|
||||
AtomTableMatchKey(PLDHashTable *table,
|
||||
const PLDHashEntryHdr *entry,
|
||||
AtomTableMatchKey(PLDHashTable *table, const PLDHashEntryHdr *entry,
|
||||
const void *key)
|
||||
{
|
||||
const AtomTableEntry *he = NS_STATIC_CAST(const AtomTableEntry*, entry);
|
||||
const char* keyStr = NS_STATIC_CAST(const char*, key);
|
||||
return nsCRT::strcmp(keyStr, he->get()) == 0;
|
||||
const AtomTableEntry *strKey = NS_STATIC_CAST(const AtomTableEntry*, key);
|
||||
|
||||
const char *atomString = he->getAtomString();
|
||||
|
||||
if (strKey->IsUTF16String()) {
|
||||
return
|
||||
CompareUTF8toUTF16(nsDependentCString(atomString),
|
||||
nsDependentString(strKey->getUTF16String())) == 0;
|
||||
}
|
||||
|
||||
if (strKey->IsUTF8String()) {
|
||||
return strcmp(atomString, strKey->getUTF8String()) == 0;
|
||||
}
|
||||
|
||||
return strcmp(atomString, strKey->getAtomString()) == 0;
|
||||
}
|
||||
|
||||
PR_STATIC_CALLBACK(void)
|
||||
@ -189,8 +290,6 @@ AtomTableClearEntry(PLDHashTable *table, PLDHashEntryHdr *entry)
|
||||
{
|
||||
AtomTableEntry *he = NS_STATIC_CAST(AtomTableEntry*, entry);
|
||||
|
||||
he->keyHash = 0;
|
||||
|
||||
if (!he->IsStaticAtom()) {
|
||||
AtomImpl *atom = he->GetAtomImpl();
|
||||
// Normal |AtomImpl| atoms are deleted when their refcount hits 0, and
|
||||
@ -199,8 +298,11 @@ AtomTableClearEntry(PLDHashTable *table, PLDHashEntryHdr *entry)
|
||||
// |PermanentAtomImpl| permanent atoms ignore their refcount and are
|
||||
// deleted when they are removed from the table at table destruction.
|
||||
// In other words, they are owned by the atom table.
|
||||
if (atom->IsPermanent())
|
||||
if (atom->IsPermanent()) {
|
||||
he->keyHash = 0;
|
||||
|
||||
delete NS_STATIC_CAST(PermanentAtomImpl*, atom);
|
||||
}
|
||||
}
|
||||
else {
|
||||
he->GetStaticAtomWrapper()->~nsStaticAtomWrapper();
|
||||
@ -213,7 +315,7 @@ static const PLDHashTableOps AtomTableOps = {
|
||||
PL_DHashAllocTable,
|
||||
PL_DHashFreeTable,
|
||||
AtomTableGetKey,
|
||||
PL_DHashStringKey,
|
||||
AtomTableGetHash,
|
||||
AtomTableMatchKey,
|
||||
PL_DHashMoveEntryStub,
|
||||
AtomTableClearEntry,
|
||||
@ -260,7 +362,8 @@ void PromoteToPermanent(AtomImpl* aAtom)
|
||||
aAtom = new (aAtom) PermanentAtomImpl();
|
||||
}
|
||||
|
||||
void NS_PurgeAtomTable()
|
||||
void
|
||||
NS_PurgeAtomTable()
|
||||
{
|
||||
if (gAtomTable.ops) {
|
||||
#ifdef DEBUG
|
||||
@ -295,7 +398,8 @@ AtomImpl::~AtomImpl()
|
||||
// don't want to remove them twice. See comment above in
|
||||
// |AtomTableClearEntry|.
|
||||
if (!IsPermanent()) {
|
||||
PL_DHashTableOperate(&gAtomTable, mString, PL_DHASH_REMOVE);
|
||||
AtomTableEntry key(mString);
|
||||
PL_DHashTableOperate(&gAtomTable, &key, PL_DHASH_REMOVE);
|
||||
if (gAtomTable.entryCount == 0) {
|
||||
PL_DHashTableFinish(&gAtomTable);
|
||||
NS_ASSERTION(gAtomTable.entryCount == 0,
|
||||
@ -388,7 +492,8 @@ AtomImpl::EqualsUTF8(const nsACString& aString, PRBool* aResult)
|
||||
NS_IMETHODIMP
|
||||
AtomImpl::Equals(const nsAString& aString, PRBool* aResult)
|
||||
{
|
||||
*aResult = NS_ConvertUTF16toUTF8(aString).Equals(mString);
|
||||
*aResult = CompareUTF8toUTF16(nsDependentCString(mString),
|
||||
PromiseFlatString(aString)) == 0;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
@ -445,21 +550,12 @@ nsStaticAtomWrapper::EqualsUTF8(const nsACString& aString, PRBool* aResult)
|
||||
NS_IMETHODIMP
|
||||
nsStaticAtomWrapper::Equals(const nsAString& aString, PRBool* aResult)
|
||||
{
|
||||
*aResult = NS_ConvertUCS2toUTF8(aString).Equals(mStaticAtom->mString);
|
||||
*aResult = CompareUTF8toUTF16(nsDependentCString(mStaticAtom->mString),
|
||||
PromiseFlatString(aString)) == 0;
|
||||
return NS_OK;
|
||||
}
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
NS_COM nsIAtom* NS_NewAtom(const char* isolatin1)
|
||||
{
|
||||
return NS_NewAtom(nsDependentCString(isolatin1));
|
||||
}
|
||||
|
||||
NS_COM nsIAtom* NS_NewPermanentAtom(const char* isolatin1)
|
||||
{
|
||||
return NS_NewPermanentAtom(NS_ConvertASCIItoUCS2(isolatin1));
|
||||
}
|
||||
|
||||
static nsStaticAtomWrapper*
|
||||
WrapStaticAtom(const nsStaticAtom* aAtom)
|
||||
{
|
||||
@ -480,7 +576,8 @@ WrapStaticAtom(const nsStaticAtom* aAtom)
|
||||
return wrapper;
|
||||
}
|
||||
|
||||
static AtomTableEntry* GetAtomHashEntry(const char* aString)
|
||||
static inline AtomTableEntry*
|
||||
GetAtomHashEntry(const char* aString)
|
||||
{
|
||||
if (!gAtomTable.ops &&
|
||||
!PL_DHashTableInit(&gAtomTable, &AtomTableOps, 0,
|
||||
@ -488,10 +585,25 @@ static AtomTableEntry* GetAtomHashEntry(const char* aString)
|
||||
gAtomTable.ops = nsnull;
|
||||
return nsnull;
|
||||
}
|
||||
|
||||
AtomTableEntry key(aString);
|
||||
return NS_STATIC_CAST(AtomTableEntry*,
|
||||
PL_DHashTableOperate(&gAtomTable,
|
||||
aString,
|
||||
PL_DHASH_ADD));
|
||||
PL_DHashTableOperate(&gAtomTable, &key, PL_DHASH_ADD));
|
||||
}
|
||||
|
||||
static inline AtomTableEntry*
|
||||
GetAtomHashEntry(const PRUnichar* aString)
|
||||
{
|
||||
if (!gAtomTable.ops &&
|
||||
!PL_DHashTableInit(&gAtomTable, &AtomTableOps, 0,
|
||||
sizeof(AtomTableEntry), 2048)) {
|
||||
gAtomTable.ops = nsnull;
|
||||
return nsnull;
|
||||
}
|
||||
|
||||
AtomTableEntry key(aString);
|
||||
return NS_STATIC_CAST(AtomTableEntry*,
|
||||
PL_DHashTableOperate(&gAtomTable, &key, PL_DHASH_ADD));
|
||||
}
|
||||
|
||||
NS_COM nsresult
|
||||
@ -499,7 +611,7 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount)
|
||||
{
|
||||
// this does two things:
|
||||
// 1) wraps each static atom in a wrapper, if necessary
|
||||
// 2) initializes the address pointed to by each mAtom slot
|
||||
// 2) initializes the address pointed to by each mBits slot
|
||||
|
||||
for (PRUint32 i=0; i<aAtomCount; i++) {
|
||||
NS_ASSERTION(nsCRT::IsAscii(aAtoms[i].mString),
|
||||
@ -509,7 +621,7 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount)
|
||||
|
||||
if (he->HasValue() && aAtoms[i].mAtom) {
|
||||
// there already is an atom with this name in the table.. but we
|
||||
// still have to update mAtom
|
||||
// still have to update mBits
|
||||
if (!he->IsStaticAtom() && !he->GetAtomImpl()->IsPermanent()) {
|
||||
// since we wanted to create a static atom but there is
|
||||
// already one there, we convert it to a non-refcounting
|
||||
@ -536,23 +648,25 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount)
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
NS_COM nsIAtom* NS_NewAtom( const nsAString& aString )
|
||||
NS_COM nsIAtom*
|
||||
NS_NewAtom(const char* aUTF8String)
|
||||
{
|
||||
NS_ConvertUCS2toUTF8 utf8String(aString);
|
||||
AtomTableEntry *he = GetAtomHashEntry(aUTF8String);
|
||||
|
||||
return NS_NewAtom(utf8String);
|
||||
}
|
||||
if (!he) {
|
||||
return nsnull;
|
||||
}
|
||||
|
||||
NS_COM
|
||||
nsIAtom*
|
||||
NS_NewAtom( const nsACString& aString )
|
||||
{
|
||||
AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aString).get());
|
||||
NS_ASSERTION(!he->IsUTF8String() && !he->IsUTF16String(),
|
||||
"Atom hash entry is string? Should be atom!");
|
||||
|
||||
if (he->HasValue())
|
||||
return he->GetAtom();
|
||||
|
||||
AtomImpl* atom = new (aString) AtomImpl();
|
||||
// MSVC.NET doesn't like passing a temporary nsDependentCString() to
|
||||
// operator new, so declare one as a local instead.
|
||||
nsDependentCString str(aUTF8String);
|
||||
AtomImpl* atom = new (str) AtomImpl();
|
||||
he->SetAtomImpl(atom);
|
||||
if (!atom) {
|
||||
PL_DHashTableRawRemove(&gAtomTable, he);
|
||||
@ -563,15 +677,50 @@ NS_NewAtom( const nsACString& aString )
|
||||
return atom;
|
||||
}
|
||||
|
||||
NS_COM nsIAtom* NS_NewPermanentAtom( const nsAString& aString )
|
||||
NS_COM nsIAtom*
|
||||
NS_NewAtom(const nsACString& aUTF8String)
|
||||
{
|
||||
return NS_NewPermanentAtom(NS_ConvertUCS2toUTF8(aString));
|
||||
return NS_NewAtom(PromiseFlatCString(aUTF8String).get());
|
||||
}
|
||||
|
||||
NS_COM
|
||||
nsIAtom* NS_NewPermanentAtom( const nsACString& aString )
|
||||
NS_COM nsIAtom*
|
||||
NS_NewAtom(const PRUnichar* aUTF16String)
|
||||
{
|
||||
AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aString).get());
|
||||
AtomTableEntry *he = GetAtomHashEntry(aUTF16String);
|
||||
|
||||
if (he->HasValue())
|
||||
return he->GetAtom();
|
||||
|
||||
// MSVC.NET doesn't like passing a temporary NS_ConvertUTF16toUTF8() to
|
||||
// operator new, so declare one as a local instead.
|
||||
NS_ConvertUTF16toUTF8 str(aUTF16String);
|
||||
AtomImpl* atom = new (str) AtomImpl();
|
||||
he->SetAtomImpl(atom);
|
||||
if (!atom) {
|
||||
PL_DHashTableRawRemove(&gAtomTable, he);
|
||||
return nsnull;
|
||||
}
|
||||
|
||||
NS_ADDREF(atom);
|
||||
return atom;
|
||||
}
|
||||
|
||||
NS_COM nsIAtom*
|
||||
NS_NewAtom(const nsAString& aUTF16String)
|
||||
{
|
||||
return NS_NewAtom(PromiseFlatString(aUTF16String).get());
|
||||
}
|
||||
|
||||
NS_COM nsIAtom*
|
||||
NS_NewPermanentAtom(const char* aUTF8String)
|
||||
{
|
||||
return NS_NewPermanentAtom(nsDependentCString(aUTF8String));
|
||||
}
|
||||
|
||||
NS_COM nsIAtom*
|
||||
NS_NewPermanentAtom(const nsACString& aUTF8String)
|
||||
{
|
||||
AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aUTF8String).get());
|
||||
|
||||
if (he->HasValue() && he->IsStaticAtom())
|
||||
return he->GetStaticAtomWrapper();
|
||||
@ -587,7 +736,7 @@ nsIAtom* NS_NewPermanentAtom( const nsACString& aString )
|
||||
}
|
||||
} else {
|
||||
// otherwise, make a new atom
|
||||
atom = new (aString) PermanentAtomImpl();
|
||||
atom = new (aUTF8String) PermanentAtomImpl();
|
||||
he->SetAtomImpl(atom);
|
||||
if ( !atom ) {
|
||||
PL_DHashTableRawRemove(&gAtomTable, he);
|
||||
@ -599,17 +748,20 @@ nsIAtom* NS_NewPermanentAtom( const nsACString& aString )
|
||||
return atom;
|
||||
}
|
||||
|
||||
NS_COM nsIAtom* NS_NewAtom( const PRUnichar* str )
|
||||
NS_COM nsIAtom*
|
||||
NS_NewPermanentAtom(const nsAString& aUTF16String)
|
||||
{
|
||||
return NS_NewAtom(NS_ConvertUCS2toUTF8(str));
|
||||
return NS_NewPermanentAtom(NS_ConvertUTF16toUTF8(aUTF16String));
|
||||
}
|
||||
|
||||
NS_COM nsIAtom* NS_NewPermanentAtom( const PRUnichar* str )
|
||||
NS_COM nsIAtom*
|
||||
NS_NewPermanentAtom(const PRUnichar* aUTF16String)
|
||||
{
|
||||
return NS_NewPermanentAtom(nsDependentString(str));
|
||||
return NS_NewPermanentAtom(NS_ConvertUTF16toUTF8(aUTF16String));
|
||||
}
|
||||
|
||||
NS_COM nsrefcnt NS_GetNumberOfAtoms(void)
|
||||
NS_COM nsrefcnt
|
||||
NS_GetNumberOfAtoms(void)
|
||||
{
|
||||
return gAtomTable.entryCount;
|
||||
}
|
||||
|
@ -300,6 +300,100 @@ PRUint32 nsCRT::HashCode(const PRUnichar* str, PRUint32* resultingStrLen)
|
||||
return h;
|
||||
}
|
||||
|
||||
/**
 * Hash a null-terminated UTF-16 string as if it had first been converted to
 * UTF-8, without allocating the converted string.  Each UTF-16 character is
 * decoded to UCS-4 and its UTF-8 bytes are fed, in left-to-right order,
 * through the rotate-and-xor step h = (h>>28) ^ (h<<4) ^ byte.
 *
 * @param str              null-terminated UTF-16 string to hash.
 * @param resultingStrLen  if non-null, receives the length of |str| in
 *                         UTF-16 code units (not characters, not UTF-8
 *                         bytes), excluding the terminator.
 * @return the hash value.
 *
 * Ill-formed input is skipped rather than hashed: a lone low surrogate is
 * dropped, and a high surrogate that is not followed by a low surrogate is
 * dropped together with the code unit that follows it.  Debug builds
 * additionally raise NS_ERROR in those cases.
 */
PRUint32 nsCRT::HashCodeAsUTF8(const PRUnichar* str, PRUint32* resultingStrLen)
{
  PRUint32 h = 0;
  const PRUnichar* s = str;

  {
    PRUint16 W1 = 0;      // pending high surrogate, or 0 if none is pending
    PRUint32 U = 0;       // the current char as UCS-4
    int code_length = 0;  // number of bytes in the UTF-8 sequence for U

    PRUint16 W;
    while ( (W = *s++) )
      {
        /*
         * On the fly, decoding from UTF-16 (and/or UCS-2) into UTF-8 as per
         * http://www.ietf.org/rfc/rfc2781.txt
         * http://www.ietf.org/rfc/rfc2279.txt
         */

        if ( !W1 )
          {
            if ( W < 0xD800 || 0xDFFF < W )
              {
                // A BMP character that is not a surrogate: 1 to 3 UTF-8 bytes.
                U = W;
                if ( W <= 0x007F )
                  code_length = 1;
                else if ( W <= 0x07FF )
                  code_length = 2;
                else
                  code_length = 3;
              }
            else if ( /* 0xD800 <= W && */ W <= 0xDBFF )
              W1 = W;   // high surrogate: remember it and wait for its partner
#ifdef DEBUG
            else NS_ERROR("Got low surrogate but no previous high surrogate");
#endif
          }
        else
          {
            // as required by the standard, this code is careful to
            // throw out illegal sequences

            if ( 0xDC00 <= W && W <= 0xDFFF )
              {
                // RFC 2781, section 2.2: each surrogate contributes exactly
                // ten payload bits, and the combined 20-bit value is offset
                // by 0x10000.  The result is always in U+10000..U+10FFFF,
                // which UTF-8 encodes in exactly four bytes.
                U = 0x00010000 +
                    PRUint32( ((W1 & 0x03FF) << 10) | (W & 0x03FF) );
                code_length = 4;
              }
#ifdef DEBUG
            else NS_ERROR("High surrogate not followed by low surrogate");
#endif
            W1 = 0;
          }


        if ( code_length > 0 )
          {
            // Indexed by code_length (1..4): lead-byte marker bits and the
            // right shift that isolates the lead byte's payload.
            static const PRUint16 sBytePrefix[5] = { 0x0000, 0x0000, 0x00C0, 0x00E0, 0x00F0 };
            static const PRUint16 sShift[5]      = { 0, 0, 6, 12, 18 };

            /*
             * Unlike the algorithm in http://www.ietf.org/rfc/rfc2279.txt
             * we must calculate the bytes in left to right order so that
             * our hash result matches what the narrow version would calculate
             * on an already UTF-8 string.
             */

            // hash the first (and often, only, byte)
            h = (h>>28) ^ (h<<4) ^ (sBytePrefix[code_length] | (U>>sShift[code_length]));

            // an unrolled loop for hashing any remaining bytes in this sequence
            switch ( code_length )
              {  // falling through in each case
                case 4:   h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>12) & 0x003F));
                case 3:   h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>6 ) & 0x003F));
                case 2:   h = (h>>28) ^ (h<<4) ^ (0x80 | ( U      & 0x003F));
                default:  code_length = 0;   // reset for the next character
                  break;
              }
          }
      }
  }

  if ( resultingStrLen )
    *resultingStrLen = (s-str)-1;   // |s| has stepped past the terminator
  return h;
}
|
||||
|
||||
PRUint32 nsCRT::BufferHashCode(const PRUnichar* s, PRUint32 len)
|
||||
{
|
||||
PRUint32 h = 0;
|
||||
|
@ -228,6 +228,12 @@ public:
|
||||
static PRUint32 HashCode(const PRUnichar* str,
|
||||
PRUint32* resultingStrLen = nsnull);
|
||||
|
||||
// Computes a hashcode for a ucs2 string that returns the same thing
|
||||
// as the HashCode method taking a |char*| would if the string were
|
||||
// converted to UTF8. Returns the string length as an added bonus.
|
||||
static PRUint32 HashCodeAsUTF8(const PRUnichar* str,
|
||||
PRUint32* resultingStrLen = nsnull);
|
||||
|
||||
// Computes the hashcode for a buffer with a specified length.
|
||||
static PRUint32 BufferHashCode(const PRUnichar* str, PRUint32 strLen);
|
||||
|
||||
|
@ -364,4 +364,16 @@ NS_COM const nsAFlatString& EmptyString();
|
||||
NS_COM const nsAFlatCString& EmptyCString();
|
||||
|
||||
|
||||
/**
|
||||
* Compare a UTF-8 string to a UTF-16 string.
|
||||
*
|
||||
* Returns 0 if the strings are equal, -1 if aUTF8String is less
|
||||
* than aUTF16String, and 1 in the reverse case. In case of fatal
|
||||
* error (eg the strings are not valid UTF8 and UTF16 respectively),
|
||||
* this method will return PR_INT32_MIN.
|
||||
*/
|
||||
NS_COM PRInt32
|
||||
CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String,
|
||||
const nsASingleFragmentString& aUTF16String);
|
||||
|
||||
#endif // !defined(nsReadableUtils_h___)
|
||||
|
@ -35,7 +35,6 @@
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#ifndef nsUTF8Utils_h_
|
||||
#define nsUTF8Utils_h_
|
||||
|
||||
@ -60,6 +59,349 @@ class UTF8traits
|
||||
#define NS_ALWAYS_INLINE
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Extract the next UCS-4 character from the buffer and return it. The
|
||||
* pointer passed in is advanced to the start of the next character in the
|
||||
* buffer. If non-null, the parameters err and overlong are filled in to
|
||||
* indicate that the character was represented by an overlong sequence, or
|
||||
* that an error occurred.
|
||||
*/
|
||||
|
||||
class UTF8CharEnumerator
|
||||
{
|
||||
public:
|
||||
static PRUint32 NextChar(const char **buffer, const char *end,
|
||||
PRBool *err = nsnull, PRBool* overlong = nsnull)
|
||||
{
|
||||
NS_ASSERTION(buffer && *buffer, "null buffer!");
|
||||
|
||||
const char *p = *buffer;
|
||||
|
||||
if (p >= end)
|
||||
{
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
char c = *p++;
|
||||
|
||||
if ( UTF8traits::isASCII(c) )
|
||||
{
|
||||
if (err)
|
||||
*err = PR_FALSE;
|
||||
if (overlong)
|
||||
*overlong = PR_FALSE;
|
||||
*buffer = p;
|
||||
return c;
|
||||
}
|
||||
|
||||
PRUint32 ucs4;
|
||||
PRUint32 minUcs4;
|
||||
PRInt32 state = 0;
|
||||
|
||||
if (!CalcState(c, ucs4, minUcs4, state)) {
|
||||
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
while ( state-- )
|
||||
{
|
||||
if (p == end)
|
||||
{
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
c = *p++;
|
||||
|
||||
if (!AddByte(c, state, ucs4))
|
||||
{
|
||||
NS_ERROR("not a UTF8 string");
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (err)
|
||||
*err = PR_FALSE;
|
||||
if (overlong)
|
||||
*overlong = ucs4 < minUcs4;
|
||||
*buffer = p;
|
||||
return ucs4;
|
||||
}
|
||||
|
||||
static PRUint32 NextChar(nsACString::const_iterator& iter,
|
||||
const nsACString::const_iterator& end,
|
||||
PRBool *err = nsnull, PRBool *overlong = nsnull)
|
||||
{
|
||||
if ( iter == end )
|
||||
{
|
||||
NS_ERROR("No input to work with");
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
char c = *iter++;
|
||||
|
||||
if ( UTF8traits::isASCII(c) )
|
||||
{
|
||||
if (err)
|
||||
*err = PR_FALSE;
|
||||
if (overlong)
|
||||
*overlong = PR_FALSE;
|
||||
return c;
|
||||
}
|
||||
|
||||
PRUint32 ucs4;
|
||||
PRUint32 minUcs4;
|
||||
PRInt32 state = 0;
|
||||
|
||||
if (!CalcState(c, ucs4, minUcs4, state)) {
|
||||
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
while ( state-- )
|
||||
{
|
||||
if (iter == end)
|
||||
{
|
||||
NS_ERROR("Buffer ended in the middle of a multibyte sequence");
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
c = *iter++;
|
||||
|
||||
if (!AddByte(c, state, ucs4))
|
||||
{
|
||||
NS_ERROR("not a UTF8 string");
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (err)
|
||||
*err = PR_FALSE;
|
||||
if (overlong)
|
||||
*overlong = ucs4 < minUcs4;
|
||||
return ucs4;
|
||||
}
|
||||
|
||||
private:
|
||||
static PRBool CalcState(char c, PRUint32& ucs4, PRUint32& minUcs4,
|
||||
PRInt32& state)
|
||||
{
|
||||
if ( UTF8traits::is2byte(c) )
|
||||
{
|
||||
ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
|
||||
state = 1;
|
||||
minUcs4 = 0x00000080;
|
||||
}
|
||||
else if ( UTF8traits::is3byte(c) )
|
||||
{
|
||||
ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
|
||||
state = 2;
|
||||
minUcs4 = 0x00000800;
|
||||
}
|
||||
else if ( UTF8traits::is4byte(c) )
|
||||
{
|
||||
ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
|
||||
state = 3;
|
||||
minUcs4 = 0x00010000;
|
||||
}
|
||||
else if ( UTF8traits::is5byte(c) )
|
||||
{
|
||||
ucs4 = (PRUint32(c) << 24) & 0x03000000L;
|
||||
state = 4;
|
||||
minUcs4 = 0x00200000;
|
||||
}
|
||||
else if ( UTF8traits::is6byte(c) )
|
||||
{
|
||||
ucs4 = (PRUint32(c) << 30) & 0x40000000L;
|
||||
state = 5;
|
||||
minUcs4 = 0x04000000;
|
||||
}
|
||||
else
|
||||
{
|
||||
return PR_FALSE;
|
||||
}
|
||||
|
||||
return PR_TRUE;
|
||||
}
|
||||
|
||||
static PRBool AddByte(char c, PRInt32 state, PRUint32& ucs4)
|
||||
{
|
||||
if ( UTF8traits::isInSeq(c) )
|
||||
{
|
||||
PRInt32 shift = state * 6;
|
||||
ucs4 |= (PRUint32(c) & 0x3F) << shift;
|
||||
return PR_TRUE;
|
||||
}
|
||||
|
||||
return PR_FALSE;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Extract the next UCS-4 character from the buffer and return it. The
|
||||
* pointer passed in is advanced to the start of the next character in the
|
||||
* buffer. If non-null, the err parameter is filled in if an error occurs.
|
||||
*/
|
||||
|
||||
|
||||
class UTF16CharEnumerator
|
||||
{
|
||||
public:
|
||||
static PRUint32 NextChar(const PRUnichar **buffer, const PRUnichar *end,
|
||||
PRBool *err = nsnull)
|
||||
{
|
||||
NS_ASSERTION(buffer && *buffer, "null buffer!");
|
||||
|
||||
const PRUnichar *p = *buffer;
|
||||
|
||||
if (p >= end)
|
||||
{
|
||||
NS_ERROR("No input to work with");
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
PRUnichar c = *p++;
|
||||
|
||||
if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
|
||||
{
|
||||
if (err)
|
||||
*err = PR_FALSE;
|
||||
*buffer = p;
|
||||
return c;
|
||||
}
|
||||
else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
|
||||
{
|
||||
if (*buffer == end)
|
||||
{
|
||||
NS_ERROR("Unexpected end of buffer after high surrogate");
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// D800- DBFF - High Surrogate
|
||||
// N = (H- D800) *400 + 10000 + ...
|
||||
PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
|
||||
|
||||
c = *p++;
|
||||
|
||||
if (0xDC00 == (0xFC00 & c))
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
// N += ( L - DC00 )
|
||||
ucs4 |= (0x03FF & c);
|
||||
if (err)
|
||||
*err = PR_FALSE;
|
||||
*buffer = p;
|
||||
return ucs4;
|
||||
}
|
||||
else
|
||||
{
|
||||
NS_ERROR("got a High Surrogate but no low surrogate");
|
||||
// output nothing.
|
||||
}
|
||||
}
|
||||
else // U+DC00 - U+DFFF
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
NS_ERROR("got a low Surrogate but no high surrogate");
|
||||
// output nothing.
|
||||
}
|
||||
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static PRUint32 NextChar(nsAString::const_iterator& iter,
|
||||
const nsAString::const_iterator& end,
|
||||
PRBool *err = nsnull)
|
||||
{
|
||||
if (iter == end)
|
||||
{
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
PRUnichar c = *iter++;
|
||||
|
||||
if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
|
||||
{
|
||||
if (err)
|
||||
*err = PR_FALSE;
|
||||
return c;
|
||||
}
|
||||
else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
|
||||
{
|
||||
if (iter == end)
|
||||
{
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// D800- DBFF - High Surrogate
|
||||
// N = (H- D800) *400 + 10000 + ...
|
||||
PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
|
||||
|
||||
c = *iter++;
|
||||
|
||||
if (0xDC00 == (0xFC00 & c))
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
// N += ( L - DC00 )
|
||||
ucs4 |= (0x03FF & c);
|
||||
if (err)
|
||||
*err = PR_FALSE;
|
||||
return ucs4;
|
||||
}
|
||||
else
|
||||
{
|
||||
NS_ERROR("got a High Surrogate but no low surrogate");
|
||||
// output nothing.
|
||||
}
|
||||
}
|
||||
else // U+DC00 - U+DFFF
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
NS_ERROR("got a low Surrogate but no high surrogate");
|
||||
// output nothing.
|
||||
}
|
||||
|
||||
if (err)
|
||||
*err = PR_TRUE;
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
|
||||
* UTF-8 to UTF-16
|
||||
@ -87,75 +429,18 @@ class ConvertUTF8toUTF16
|
||||
buffer_type* out = mBuffer;
|
||||
for ( ; p != end /* && *p */; )
|
||||
{
|
||||
char c = *p++;
|
||||
PRBool overlong, err;
|
||||
PRUint32 ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err,
|
||||
&overlong);
|
||||
|
||||
if ( UTF8traits::isASCII(c) )
|
||||
if ( err )
|
||||
{
|
||||
*out++ = buffer_type(c);
|
||||
continue;
|
||||
}
|
||||
|
||||
PRUint32 ucs4;
|
||||
PRUint32 minUcs4;
|
||||
PRInt32 state = 0;
|
||||
|
||||
if ( UTF8traits::is2byte(c) )
|
||||
{
|
||||
ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
|
||||
state = 1;
|
||||
minUcs4 = 0x00000080;
|
||||
}
|
||||
else if ( UTF8traits::is3byte(c) )
|
||||
{
|
||||
ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
|
||||
state = 2;
|
||||
minUcs4 = 0x00000800;
|
||||
}
|
||||
else if ( UTF8traits::is4byte(c) )
|
||||
{
|
||||
ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
|
||||
state = 3;
|
||||
minUcs4 = 0x00010000;
|
||||
}
|
||||
else if ( UTF8traits::is5byte(c) )
|
||||
{
|
||||
ucs4 = (PRUint32(c) << 24) & 0x03000000L;
|
||||
state = 4;
|
||||
minUcs4 = 0x00200000;
|
||||
}
|
||||
else if ( UTF8traits::is6byte(c) )
|
||||
{
|
||||
ucs4 = (PRUint32(c) << 30) & 0x40000000L;
|
||||
state = 5;
|
||||
minUcs4 = 0x04000000;
|
||||
}
|
||||
else
|
||||
{
|
||||
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
|
||||
mErrorEncountered = PR_TRUE;
|
||||
mBuffer = out;
|
||||
return N;
|
||||
}
|
||||
|
||||
while ( state-- )
|
||||
{
|
||||
c = *p++;
|
||||
|
||||
if ( UTF8traits::isInSeq(c) )
|
||||
{
|
||||
PRInt32 shift = state * 6;
|
||||
ucs4 |= (PRUint32(c) & 0x3F) << shift;
|
||||
}
|
||||
else
|
||||
{
|
||||
NS_ERROR("not a UTF8 string");
|
||||
mErrorEncountered = PR_TRUE;
|
||||
mBuffer = out;
|
||||
return N;
|
||||
}
|
||||
}
|
||||
|
||||
if ( ucs4 < minUcs4 )
|
||||
if ( overlong )
|
||||
{
|
||||
// Overlong sequence
|
||||
*out++ = UCS2_REPLACEMENT_CHAR;
|
||||
|
@ -1081,16 +1081,88 @@ StringEndsWith( const nsACString& aSource, const nsACString& aSubstring,
|
||||
|
||||
static const PRUnichar empty_buffer[1] = { '\0' };
|
||||
|
||||
NS_COM const nsAFlatString& EmptyString()
|
||||
NS_COM
|
||||
const nsAFlatString&
|
||||
EmptyString()
|
||||
{
|
||||
static const nsDependentString sEmpty(empty_buffer);
|
||||
|
||||
return sEmpty;
|
||||
}
|
||||
|
||||
NS_COM const nsAFlatCString& EmptyCString()
|
||||
NS_COM
|
||||
const nsAFlatCString&
|
||||
EmptyCString()
|
||||
{
|
||||
static const nsDependentCString sEmpty((const char *)empty_buffer);
|
||||
|
||||
return sEmpty;
|
||||
}
|
||||
|
||||
NS_COM PRInt32
CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String,
                   const nsASingleFragmentString& aUTF16String)
{
  // Walk both strings in place, one character at a time, without
  // converting either of them.
  const char *utf8Cur, *utf8End;
  aUTF8String.BeginReading(utf8Cur);
  aUTF8String.EndReading(utf8End);

  const PRUnichar *utf16Cur, *utf16End;
  aUTF16String.BeginReading(utf16Cur);
  aUTF16String.EndReading(utf16End);

  while (utf8Cur != utf8End && utf16Cur != utf16End)
    {
      // Go through an unsigned byte so that a high-bit char is not
      // sign-extended when widened to 32 bits.
      PRUint32 ch8 = PRUint8(*utf8Cur);

      if (ch8 <= 0x7F)
        {
          // ASCII byte: compare it directly against the next UTF-16 unit.
          if (ch8 != *utf16Cur)
            {
              if (ch8 > *utf16Cur)
                return 1;
              return -1;
            }

          ++utf8Cur;
          ++utf16Cur;
          continue;
        }

      // Multi-byte UTF-8 sequence: decode a full character from each side.
      PRBool failed;

      ch8 = UTF8CharEnumerator::NextChar(&utf8Cur, utf8End, &failed);
      if (failed)
        return PR_INT32_MIN;

      PRUint32 ch16 = UTF16CharEnumerator::NextChar(&utf16Cur, utf16End,
                                                    &failed);
      if (failed)
        return PR_INT32_MIN;

      if (ch8 != ch16)
        {
          if (ch8 < ch16)
            return -1;
          return 1;
        }
    }

  // We got to the end of the UTF16 string, but not to the end of the
  // UTF8 string: the UTF8 string is the longer one.
  if (utf8Cur != utf8End)
    return 1;

  // We got to the end of the UTF8 string, but not to the end of the
  // UTF16 string: the UTF16 string is the longer one.
  if (utf16Cur != utf16End)
    return -1;

  // Both strings ended together, so they match.
  return 0;
}
|
||||
|
Loading…
Reference in New Issue
Block a user