Implement an in-place (no copy) CompareUTF8toUTF16, and use it to make the atom

hashtable lookups zero-copy.  Patch by jst, bug 314465 (with lots of the
discussion in bug 277479), r=bsmedberg,dbaron,brendan (on the PLDHashTable
keyhash value assumptions), sr=bzbarsky, moa=shaver.
This commit is contained in:
bzbarsky%mit.edu 2005-11-04 19:52:18 +00:00
parent e7a8486231
commit aee1056ad7
6 changed files with 753 additions and 132 deletions

View File

@ -43,7 +43,6 @@
#include "nsCRT.h"
#include "pldhash.h"
#include "prenv.h"
#include "nsVoidArray.h"
#define PL_ARENA_CONST_ALIGN_MASK 3
#include "plarena.h"
@ -94,65 +93,140 @@ private:
const nsStaticAtom* mStaticAtom;
};
// the atomtableentry can contain either an AtomImpl or a
// nsStaticAtomWrapper, indicated by the first bit of PtrBits
typedef unsigned long PtrBits;
// The |key| pointer in the various PLDHashTable callbacks we use is an
// AtomTableEntry*. These pointers can come from two places: either a
// (probably stack-allocated) string key being passed to PL_DHashTableOperate,
// or an actual entry in the atom table. PLDHashTable reserves the keyHash
// values 0 and 1 for internal use, which means that the *PLDHashTable code*
// will never pass an entry whose keyHash is 0 or 1 to our hooks. That means we
// can use those values to tell whether an AtomTableEntry is a string key
// created by a PLDHashTable code caller or an actual live AtomTableEntry used
// by our PLDHashTable.
//
// Evil? Yes, but kinda neat too :-)
//
// An AtomTableEntry is a UTF-8 string key if keyHash is 0, in that
// case mBits points to a UTF-8 encoded char *. If keyHash is 1 the
// AtomTableEntry is a UTF-16 encoded string key and mBits points to a
// UTF-16 encoded PRUnichar *.
//
// If keyHash is any other value (> 1), the AtomTableEntry is an
// actual live entry in the table, and then mBits & ~0x1 in the
// AtomTableEntry points to an AtomImpl or a nsStaticAtomWrapper,
// indicated by the first bit of PtrBits.
typedef PRUword PtrBits;
struct AtomTableEntry : public PLDHashEntryHdr {
// mAtom & 0x1 means (mAtom & ~0x1) points to an nsStaticAtomWrapper
// else it points to an nsAtomImpl
PtrBits mAtom;
// If keyHash > 1, mBits & 0x1 means (mBits & ~0x1) points to an
// nsStaticAtomWrapper else it points to an nsAtomImpl
PtrBits mBits;
inline AtomTableEntry(const char *aString)
: mBits(PtrBits(aString))
{
keyHash = 0;
}
inline AtomTableEntry(const PRUnichar *aString)
: mBits(PtrBits(aString))
{
keyHash = 1;
}
inline PRBool IsStaticAtom() const {
return (mAtom & 0x1) != 0;
NS_ASSERTION(keyHash > 1,
"IsStaticAtom() called on non-atom AtomTableEntry!");
return (mBits & 0x1) != 0;
}
inline PRBool IsUTF8String() const {
return keyHash == 0;
}
inline PRBool IsUTF16String() const {
return keyHash == 1;
}
inline void SetAtomImpl(AtomImpl* aAtom) {
NS_ASSERTION(keyHash > 1,
"SetAtomImpl() called on non-atom AtomTableEntry!");
NS_ASSERTION(aAtom, "Setting null atom");
mAtom = PtrBits(aAtom);
mBits = PtrBits(aAtom);
}
inline void SetStaticAtomWrapper(nsStaticAtomWrapper* aAtom) {
NS_ASSERTION(keyHash > 1,
"SetStaticAtomWrapper() called on non-atom AtomTableEntry!");
NS_ASSERTION(aAtom, "Setting null atom");
NS_ASSERTION((PtrBits(aAtom) & ~0x1) == PtrBits(aAtom),
"Pointers must align or this is broken");
mAtom = PtrBits(aAtom) | 0x1;
mBits = PtrBits(aAtom) | 0x1;
}
inline void ClearAtom() {
mAtom = nsnull;
mBits = nsnull;
}
inline PRBool HasValue() const {
return (mAtom & ~0x1) != 0;
NS_ASSERTION(keyHash > 1,
"HasValue() called on non-atom AtomTableEntry!");
return (mBits & ~0x1) != 0;
}
// these accessors assume that you already know the type
inline AtomImpl *GetAtomImpl() const {
NS_ASSERTION(keyHash > 1,
"GetAtomImpl() called on non-atom AtomTableEntry!");
NS_ASSERTION(!IsStaticAtom(), "This is a static atom, not an AtomImpl");
return (AtomImpl*) (mAtom & ~0x1);
return (AtomImpl*) (mBits & ~0x1);
}
inline nsStaticAtomWrapper *GetStaticAtomWrapper() const {
NS_ASSERTION(keyHash > 1,
"GetStaticAtomWrapper() called on non-atom AtomTableEntry!");
NS_ASSERTION(IsStaticAtom(), "This is an AtomImpl, not a static atom");
return (nsStaticAtomWrapper*) (mAtom & ~0x1);
return (nsStaticAtomWrapper*) (mBits & ~0x1);
}
inline const nsStaticAtom* GetStaticAtom() const {
NS_ASSERTION(keyHash > 1,
"GetStaticAtom() called on non-atom AtomTableEntry!");
return GetStaticAtomWrapper()->GetStaticAtom();
}
// type-agnostic accessors
// get the string buffer
inline const char* get() const {
inline const char* getAtomString() const {
NS_ASSERTION(keyHash > 1,
"getAtomString() called on non-atom AtomTableEntry!");
return IsStaticAtom() ? GetStaticAtom()->mString : GetAtomImpl()->mString;
}
// get the string buffer
inline const char* getUTF8String() const {
NS_ASSERTION(keyHash == 0,
"getUTF8String() called on non-UTF8 AtomTableEntry!");
return (char *)mBits;
}
// get the string buffer
inline const PRUnichar* getUTF16String() const {
NS_ASSERTION(keyHash == 1,
"getUTF16String() called on non-UTF16 AtomTableEntry!");
return (PRUnichar *)mBits;
}
// get an addreffed nsIAtom - not using already_AddRef'ed atom
// because the callers are not (and should not be) using nsCOMPtr
inline nsIAtom* GetAtom() const {
NS_ASSERTION(keyHash > 1,
"GetAtom() called on non-atom AtomTableEntry!");
nsIAtom* result;
if (IsStaticAtom())
@ -171,17 +245,44 @@ AtomTableGetKey(PLDHashTable *table, PLDHashEntryHdr *entry)
{
AtomTableEntry *he = NS_STATIC_CAST(AtomTableEntry*, entry);
NS_ASSERTION(he->HasValue(), "Empty atom. how did that happen?");
return he->get();
return he;
}
// PLDHashTable hash hook for the atom table.
//
// |key| is an AtomTableEntry acting as a string key: either a UTF-16 key
// (hashed as if it had been converted to UTF-8) or a UTF-8 key (hashed
// directly). Any other kind of entry trips the assertion.
PR_STATIC_CALLBACK(PLDHashNumber)
AtomTableGetHash(PLDHashTable *table, const void *key)
{
  const AtomTableEntry *stringKey =
    NS_STATIC_CAST(const AtomTableEntry*, key);

  if (!stringKey->IsUTF16String()) {
    // Not UTF-16, so the only other legal string key is UTF-8.
    NS_ASSERTION(stringKey->IsUTF8String(),
                 "AtomTableGetHash() called on non-string-key AtomTableEntry!");

    return nsCRT::HashCode(stringKey->getUTF8String());
  }

  return nsCRT::HashCodeAsUTF8(stringKey->getUTF16String());
}
PR_STATIC_CALLBACK(PRBool)
AtomTableMatchKey(PLDHashTable *table,
const PLDHashEntryHdr *entry,
AtomTableMatchKey(PLDHashTable *table, const PLDHashEntryHdr *entry,
const void *key)
{
const AtomTableEntry *he = NS_STATIC_CAST(const AtomTableEntry*, entry);
const char* keyStr = NS_STATIC_CAST(const char*, key);
return nsCRT::strcmp(keyStr, he->get()) == 0;
const AtomTableEntry *strKey = NS_STATIC_CAST(const AtomTableEntry*, key);
const char *atomString = he->getAtomString();
if (strKey->IsUTF16String()) {
return
CompareUTF8toUTF16(nsDependentCString(atomString),
nsDependentString(strKey->getUTF16String())) == 0;
}
if (strKey->IsUTF8String()) {
return strcmp(atomString, strKey->getUTF8String()) == 0;
}
return strcmp(atomString, strKey->getAtomString()) == 0;
}
PR_STATIC_CALLBACK(void)
@ -189,8 +290,6 @@ AtomTableClearEntry(PLDHashTable *table, PLDHashEntryHdr *entry)
{
AtomTableEntry *he = NS_STATIC_CAST(AtomTableEntry*, entry);
he->keyHash = 0;
if (!he->IsStaticAtom()) {
AtomImpl *atom = he->GetAtomImpl();
// Normal |AtomImpl| atoms are deleted when their refcount hits 0, and
@ -199,8 +298,11 @@ AtomTableClearEntry(PLDHashTable *table, PLDHashEntryHdr *entry)
// |PermanentAtomImpl| permanent atoms ignore their refcount and are
// deleted when they are removed from the table at table destruction.
// In other words, they are owned by the atom table.
if (atom->IsPermanent())
if (atom->IsPermanent()) {
he->keyHash = 0;
delete NS_STATIC_CAST(PermanentAtomImpl*, atom);
}
}
else {
he->GetStaticAtomWrapper()->~nsStaticAtomWrapper();
@ -213,7 +315,7 @@ static const PLDHashTableOps AtomTableOps = {
PL_DHashAllocTable,
PL_DHashFreeTable,
AtomTableGetKey,
PL_DHashStringKey,
AtomTableGetHash,
AtomTableMatchKey,
PL_DHashMoveEntryStub,
AtomTableClearEntry,
@ -260,7 +362,8 @@ void PromoteToPermanent(AtomImpl* aAtom)
aAtom = new (aAtom) PermanentAtomImpl();
}
void NS_PurgeAtomTable()
void
NS_PurgeAtomTable()
{
if (gAtomTable.ops) {
#ifdef DEBUG
@ -295,7 +398,8 @@ AtomImpl::~AtomImpl()
// don't want to remove them twice. See comment above in
// |AtomTableClearEntry|.
if (!IsPermanent()) {
PL_DHashTableOperate(&gAtomTable, mString, PL_DHASH_REMOVE);
AtomTableEntry key(mString);
PL_DHashTableOperate(&gAtomTable, &key, PL_DHASH_REMOVE);
if (gAtomTable.entryCount == 0) {
PL_DHashTableFinish(&gAtomTable);
NS_ASSERTION(gAtomTable.entryCount == 0,
@ -388,7 +492,8 @@ AtomImpl::EqualsUTF8(const nsACString& aString, PRBool* aResult)
NS_IMETHODIMP
AtomImpl::Equals(const nsAString& aString, PRBool* aResult)
{
*aResult = NS_ConvertUTF16toUTF8(aString).Equals(mString);
*aResult = CompareUTF8toUTF16(nsDependentCString(mString),
PromiseFlatString(aString)) == 0;
return NS_OK;
}
@ -445,21 +550,12 @@ nsStaticAtomWrapper::EqualsUTF8(const nsACString& aString, PRBool* aResult)
NS_IMETHODIMP
nsStaticAtomWrapper::Equals(const nsAString& aString, PRBool* aResult)
{
*aResult = NS_ConvertUCS2toUTF8(aString).Equals(mStaticAtom->mString);
*aResult = CompareUTF8toUTF16(nsDependentCString(mStaticAtom->mString),
PromiseFlatString(aString)) == 0;
return NS_OK;
}
//----------------------------------------------------------------------
NS_COM nsIAtom* NS_NewAtom(const char* isolatin1)
{
return NS_NewAtom(nsDependentCString(isolatin1));
}
NS_COM nsIAtom* NS_NewPermanentAtom(const char* isolatin1)
{
return NS_NewPermanentAtom(NS_ConvertASCIItoUCS2(isolatin1));
}
static nsStaticAtomWrapper*
WrapStaticAtom(const nsStaticAtom* aAtom)
{
@ -480,7 +576,8 @@ WrapStaticAtom(const nsStaticAtom* aAtom)
return wrapper;
}
static AtomTableEntry* GetAtomHashEntry(const char* aString)
static inline AtomTableEntry*
GetAtomHashEntry(const char* aString)
{
if (!gAtomTable.ops &&
!PL_DHashTableInit(&gAtomTable, &AtomTableOps, 0,
@ -488,10 +585,25 @@ static AtomTableEntry* GetAtomHashEntry(const char* aString)
gAtomTable.ops = nsnull;
return nsnull;
}
AtomTableEntry key(aString);
return NS_STATIC_CAST(AtomTableEntry*,
PL_DHashTableOperate(&gAtomTable,
aString,
PL_DHASH_ADD));
PL_DHashTableOperate(&gAtomTable, &key, PL_DHASH_ADD));
}
// Runs a PL_DHASH_ADD operation on the global atom table using a UTF-16
// string key, initializing the table first if it has no ops yet.
//
// Returns nsnull if the table could not be initialized; otherwise returns
// whatever PL_DHashTableOperate hands back, cast to an AtomTableEntry*.
// Callers in this file inspect HasValue() on the result to tell whether an
// atom is already stored in the entry.
static inline AtomTableEntry*
GetAtomHashEntry(const PRUnichar* aString)
{
  if (!gAtomTable.ops) {
    const PRBool initialized =
      PL_DHashTableInit(&gAtomTable, &AtomTableOps, 0,
                        sizeof(AtomTableEntry), 2048);

    if (!initialized) {
      // Keep the table marked as uninitialized so a later call retries.
      gAtomTable.ops = nsnull;
      return nsnull;
    }
  }

  // The stack-allocated key only needs to live for the duration of the call.
  AtomTableEntry utf16Key(aString);
  PLDHashEntryHdr *slot =
    PL_DHashTableOperate(&gAtomTable, &utf16Key, PL_DHASH_ADD);

  return NS_STATIC_CAST(AtomTableEntry*, slot);
}
NS_COM nsresult
@ -499,7 +611,7 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount)
{
// this does two things:
// 1) wraps each static atom in a wrapper, if necessary
// 2) initializes the address pointed to by each mAtom slot
// 2) initializes the address pointed to by each mBits slot
for (PRUint32 i=0; i<aAtomCount; i++) {
NS_ASSERTION(nsCRT::IsAscii(aAtoms[i].mString),
@ -509,7 +621,7 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount)
if (he->HasValue() && aAtoms[i].mAtom) {
// there already is an atom with this name in the table.. but we
// still have to update mAtom
// still have to update mBits
if (!he->IsStaticAtom() && !he->GetAtomImpl()->IsPermanent()) {
// since we wanted to create a static atom but there is
// already one there, we convert it to a non-refcounting
@ -536,23 +648,25 @@ NS_RegisterStaticAtoms(const nsStaticAtom* aAtoms, PRUint32 aAtomCount)
return NS_OK;
}
NS_COM nsIAtom* NS_NewAtom( const nsAString& aString )
NS_COM nsIAtom*
NS_NewAtom(const char* aUTF8String)
{
NS_ConvertUCS2toUTF8 utf8String(aString);
AtomTableEntry *he = GetAtomHashEntry(aUTF8String);
return NS_NewAtom(utf8String);
}
if (!he) {
return nsnull;
}
NS_COM
nsIAtom*
NS_NewAtom( const nsACString& aString )
{
AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aString).get());
NS_ASSERTION(!he->IsUTF8String() && !he->IsUTF16String(),
"Atom hash entry is string? Should be atom!");
if (he->HasValue())
return he->GetAtom();
AtomImpl* atom = new (aString) AtomImpl();
// MSVC.NET doesn't like passing a temporary nsDependentCString() to
// operator new, so declare one as a local instead.
nsDependentCString str(aUTF8String);
AtomImpl* atom = new (str) AtomImpl();
he->SetAtomImpl(atom);
if (!atom) {
PL_DHashTableRawRemove(&gAtomTable, he);
@ -563,15 +677,50 @@ NS_NewAtom( const nsACString& aString )
return atom;
}
NS_COM nsIAtom* NS_NewPermanentAtom( const nsAString& aString )
NS_COM nsIAtom*
NS_NewAtom(const nsACString& aUTF8String)
{
return NS_NewPermanentAtom(NS_ConvertUCS2toUTF8(aString));
return NS_NewAtom(PromiseFlatCString(aUTF8String).get());
}
NS_COM
nsIAtom* NS_NewPermanentAtom( const nsACString& aString )
NS_COM nsIAtom*
NS_NewAtom(const PRUnichar* aUTF16String)
{
AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aString).get());
AtomTableEntry *he = GetAtomHashEntry(aUTF16String);
if (he->HasValue())
return he->GetAtom();
// MSVC.NET doesn't like passing a temporary NS_ConvertUTF16toUTF8() to
// operator new, so declare one as a local instead.
NS_ConvertUTF16toUTF8 str(aUTF16String);
AtomImpl* atom = new (str) AtomImpl();
he->SetAtomImpl(atom);
if (!atom) {
PL_DHashTableRawRemove(&gAtomTable, he);
return nsnull;
}
NS_ADDREF(atom);
return atom;
}
// Convenience overload for abstract UTF-16 strings: flattens |aUTF16String|
// and forwards the resulting buffer pointer to another NS_NewAtom overload
// (presumably the |const PRUnichar*| one -- confirm against the header).
// The flattened temporary lives until the end of the return statement, so the
// pointer stays valid for the duration of the forwarded call.
NS_COM nsIAtom*
NS_NewAtom(const nsAString& aUTF16String)
{
return NS_NewAtom(PromiseFlatString(aUTF16String).get());
}
// Convenience overload for C strings: wraps |aUTF8String| in an
// nsDependentCString (no copy is requested here) and forwards to another
// NS_NewPermanentAtom overload, which does the actual table work.
NS_COM nsIAtom*
NS_NewPermanentAtom(const char* aUTF8String)
{
return NS_NewPermanentAtom(nsDependentCString(aUTF8String));
}
NS_COM nsIAtom*
NS_NewPermanentAtom(const nsACString& aUTF8String)
{
AtomTableEntry *he = GetAtomHashEntry(PromiseFlatCString(aUTF8String).get());
if (he->HasValue() && he->IsStaticAtom())
return he->GetStaticAtomWrapper();
@ -587,7 +736,7 @@ nsIAtom* NS_NewPermanentAtom( const nsACString& aString )
}
} else {
// otherwise, make a new atom
atom = new (aString) PermanentAtomImpl();
atom = new (aUTF8String) PermanentAtomImpl();
he->SetAtomImpl(atom);
if ( !atom ) {
PL_DHashTableRawRemove(&gAtomTable, he);
@ -599,17 +748,20 @@ nsIAtom* NS_NewPermanentAtom( const nsACString& aString )
return atom;
}
NS_COM nsIAtom* NS_NewAtom( const PRUnichar* str )
NS_COM nsIAtom*
NS_NewPermanentAtom(const nsAString& aUTF16String)
{
return NS_NewAtom(NS_ConvertUCS2toUTF8(str));
return NS_NewPermanentAtom(NS_ConvertUTF16toUTF8(aUTF16String));
}
NS_COM nsIAtom* NS_NewPermanentAtom( const PRUnichar* str )
NS_COM nsIAtom*
NS_NewPermanentAtom(const PRUnichar* aUTF16String)
{
return NS_NewPermanentAtom(nsDependentString(str));
return NS_NewPermanentAtom(NS_ConvertUTF16toUTF8(aUTF16String));
}
NS_COM nsrefcnt NS_GetNumberOfAtoms(void)
// Returns the current entryCount of the global atom hashtable.
NS_COM nsrefcnt
NS_GetNumberOfAtoms(void)
{
return gAtomTable.entryCount;
}

View File

@ -300,6 +300,100 @@ PRUint32 nsCRT::HashCode(const PRUnichar* str, PRUint32* resultingStrLen)
return h;
}
/**
 * Hash a null-terminated UTF-16 string as though it had first been converted
 * to UTF-8, without actually performing the conversion.
 *
 * Each code point is decoded from UTF-16 and its UTF-8 bytes are fed, in
 * left-to-right order, through the same rotate-and-xor step so that the
 * result matches what the narrow-string hash computes on the equivalent
 * UTF-8 string.
 *
 * @param str              null-terminated UTF-16 string to hash
 * @param resultingStrLen  if non-null, receives the length of |str| in
 *                         UTF-16 code units (terminator excluded)
 * @return the hash value
 *
 * Ill-formed input (an unpaired surrogate) contributes nothing to the hash;
 * debug builds additionally raise NS_ERROR for it.
 */
PRUint32 nsCRT::HashCodeAsUTF8(const PRUnichar* str, PRUint32* resultingStrLen)
{
  PRUint32 h = 0;
  const PRUnichar* s = str;

  {
    PRUint16 W1 = 0;      // pending high surrogate of a pair, or 0 if none
    PRUint32 U = 0;       // the current char as UCS-4
    int code_length = 0;  // the number of bytes in the UTF-8 sequence for the current char

    PRUint16 W;
    while ( (W = *s++) )
      {
          /*
           * On the fly, decoding from UTF-16 (and/or UCS-2) into UTF-8 as per
           * http://www.ietf.org/rfc/rfc2781.txt
           * http://www.ietf.org/rfc/rfc2279.txt
           */

        if ( !W1 )
          {
            if ( W < 0xD800 || 0xDFFF < W )
              {
                // A BMP character that stands on its own.
                U = W;
                if ( W <= 0x007F )
                  code_length = 1;
                else if ( W <= 0x07FF )
                  code_length = 2;
                else
                  code_length = 3;
              }
            else if ( /* 0xD800 <= W && */ W <= 0xDBFF )
              W1 = W;   // high surrogate: remember it and wait for its partner
#ifdef DEBUG
            else NS_ERROR("Got low surrogate but no previous high surrogate");
#endif
          }
        else
          {
              // as required by the standard, this code is careful to
              //  throw out illegal sequences

            if ( 0xDC00 <= W && W <= 0xDFFF )
              {
                // RFC 2781 section 2.2: each surrogate carries 10 payload
                // bits, and the combined 20-bit value is offset by 0x10000.
                // Both halves must be masked with 0x03FF; a wider mask on
                // the low surrogate would leak its 0xDC00 tag bits into the
                // result.
                U = 0x00010000 + PRUint32( (W1&0x03FF)<<10 | (W&0x03FF) );

                // U is now in [0x10000, 0x10FFFF], which always takes
                // exactly four bytes in UTF-8.
                code_length = 4;
              }
#ifdef DEBUG
            else NS_ERROR("High surrogate not followed by low surrogate");
#endif

            W1 = 0;
          }

        if ( code_length > 0 )
          {
            // Lead-byte tag and payload shift, indexed by sequence length.
            static const PRUint16 sBytePrefix[5]  = { 0x0000, 0x0000, 0x00C0, 0x00E0, 0x00F0 };
            static const PRUint16 sShift[5]       = { 0, 0, 6, 12, 18 };

              /*
               *  Unlike the algorithm in http://www.ietf.org/rfc/rfc2279.txt
               *  we must calculate the bytes in left to right order so that
               *  our hash result matches what the narrow version would calculate
               *  on an already UTF-8 string.
               */

              // hash the first (and often, only, byte)
            h = (h>>28) ^ (h<<4) ^ (sBytePrefix[code_length] | (U>>sShift[code_length]));

              // an unrolled loop for hashing any remaining bytes in this sequence
            switch ( code_length )
              {  // falling through in each case
                case 4:   h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>12) & 0x003F));
                case 3:   h = (h>>28) ^ (h<<4) ^ (0x80 | ((U>>6 ) & 0x003F));
                case 2:   h = (h>>28) ^ (h<<4) ^ (0x80 | ( U      & 0x003F));
                default:  code_length = 0;
                  break;
              }
          }
      }
  }

  if ( resultingStrLen )
    *resultingStrLen = (s-str)-1;   // s has stepped one past the terminator
  return h;
}
PRUint32 nsCRT::BufferHashCode(const PRUnichar* s, PRUint32 len)
{
PRUint32 h = 0;

View File

@ -228,6 +228,12 @@ public:
static PRUint32 HashCode(const PRUnichar* str,
PRUint32* resultingStrLen = nsnull);
// Computes a hashcode for a ucs2 string that returns the same thing
// as the HashCode method taking a |char*| would if the string were
// converted to UTF8. Returns the string length as an added bonus.
static PRUint32 HashCodeAsUTF8(const PRUnichar* str,
PRUint32* resultingStrLen = nsnull);
// Computes the hashcode for a buffer with a specified length.
static PRUint32 BufferHashCode(const PRUnichar* str, PRUint32 strLen);

View File

@ -364,4 +364,16 @@ NS_COM const nsAFlatString& EmptyString();
NS_COM const nsAFlatCString& EmptyCString();
/**
* Compare a UTF-8 string to a UTF-16 string.
*
* Returns 0 if the strings are equal, -1 if aUTF8String is less
* than aUTF16String, and 1 in the reverse case. In case of fatal
* error (e.g. the strings are not valid UTF-8 and UTF-16 respectively),
* this method will return PR_INT32_MIN.
*/
NS_COM PRInt32
CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String,
const nsASingleFragmentString& aUTF16String);
#endif // !defined(nsReadableUtils_h___)

View File

@ -35,7 +35,6 @@
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsUTF8Utils_h_
#define nsUTF8Utils_h_
@ -60,6 +59,349 @@ class UTF8traits
#define NS_ALWAYS_INLINE
#endif
/**
* Decodes UTF-8 one character at a time.
*
* Both NextChar overloads extract the next UCS-4 character from the input and
* return it. If non-null, |err| is set to PR_TRUE when no character could be
* decoded (the return value is then 0) and to PR_FALSE otherwise. If non-null,
* |overlong| is filled in on success only, and tells whether the character was
* encoded with more bytes than its value requires.
*/
class UTF8CharEnumerator
{
public:
// Pointer flavour: decodes the sequence starting at |*buffer|, never reading
// at or beyond |end|. |*buffer| is advanced to the start of the next
// character only when decoding succeeds; on error it is left unchanged.
static PRUint32 NextChar(const char **buffer, const char *end,
PRBool *err = nsnull, PRBool* overlong = nsnull)
{
NS_ASSERTION(buffer && *buffer, "null buffer!");
const char *p = *buffer;
// Nothing left to read: report an error rather than touching |*p|.
if (p >= end)
{
if (err)
*err = PR_TRUE;
return 0;
}
char c = *p++;
// Fast path: a single ASCII byte is its own code point.
if ( UTF8traits::isASCII(c) )
{
if (err)
*err = PR_FALSE;
if (overlong)
*overlong = PR_FALSE;
*buffer = p;
return c;
}
PRUint32 ucs4;
PRUint32 minUcs4;
PRInt32 state = 0;
// Classify the lead byte; |state| becomes the number of continuation bytes
// that must follow.
if (!CalcState(c, ucs4, minUcs4, state)) {
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
if (err)
*err = PR_TRUE;
return 0;
}
// Consume the continuation bytes. After the decrement, |state| is the number
// of continuation bytes still to come after the current one.
while ( state-- )
{
// Sequence truncated by the end of the buffer.
if (p == end)
{
if (err)
*err = PR_TRUE;
return 0;
}
c = *p++;
if (!AddByte(c, state, ucs4))
{
NS_ERROR("not a UTF8 string");
if (err)
*err = PR_TRUE;
return 0;
}
}
if (err)
*err = PR_FALSE;
// A value below the minimum for this sequence length was encoded overlong.
if (overlong)
*overlong = ucs4 < minUcs4;
*buffer = p;
return ucs4;
}
// Iterator flavour: same decoding as above, but |iter| is advanced as bytes
// are consumed, so on error it is left wherever decoding stopped.
static PRUint32 NextChar(nsACString::const_iterator& iter,
const nsACString::const_iterator& end,
PRBool *err = nsnull, PRBool *overlong = nsnull)
{
if ( iter == end )
{
NS_ERROR("No input to work with");
if (err)
*err = PR_TRUE;
return 0;
}
char c = *iter++;
// Fast path: a single ASCII byte is its own code point.
if ( UTF8traits::isASCII(c) )
{
if (err)
*err = PR_FALSE;
if (overlong)
*overlong = PR_FALSE;
return c;
}
PRUint32 ucs4;
PRUint32 minUcs4;
PRInt32 state = 0;
// Classify the lead byte; |state| becomes the number of continuation bytes
// that must follow.
if (!CalcState(c, ucs4, minUcs4, state)) {
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
if (err)
*err = PR_TRUE;
return 0;
}
while ( state-- )
{
if (iter == end)
{
NS_ERROR("Buffer ended in the middle of a multibyte sequence");
if (err)
*err = PR_TRUE;
return 0;
}
c = *iter++;
if (!AddByte(c, state, ucs4))
{
NS_ERROR("not a UTF8 string");
if (err)
*err = PR_TRUE;
return 0;
}
}
if (err)
*err = PR_FALSE;
// A value below the minimum for this sequence length was encoded overlong.
if (overlong)
*overlong = ucs4 < minUcs4;
return ucs4;
}
private:
// Interprets the lead byte |c| of a multibyte sequence. On success returns
// PR_TRUE and sets:
//   ucs4    - the payload bits of the lead byte, shifted into final position
//   state   - the number of continuation bytes expected (1..5)
//   minUcs4 - the smallest value that legitimately needs this sequence length
// Returns PR_FALSE, leaving the outputs untouched, if |c| is not a 2- to
// 6-byte lead byte.
static PRBool CalcState(char c, PRUint32& ucs4, PRUint32& minUcs4,
PRInt32& state)
{
if ( UTF8traits::is2byte(c) )
{
ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
state = 1;
minUcs4 = 0x00000080;
}
else if ( UTF8traits::is3byte(c) )
{
ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
state = 2;
minUcs4 = 0x00000800;
}
else if ( UTF8traits::is4byte(c) )
{
ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
state = 3;
minUcs4 = 0x00010000;
}
else if ( UTF8traits::is5byte(c) )
{
ucs4 = (PRUint32(c) << 24) & 0x03000000L;
state = 4;
minUcs4 = 0x00200000;
}
else if ( UTF8traits::is6byte(c) )
{
ucs4 = (PRUint32(c) << 30) & 0x40000000L;
state = 5;
minUcs4 = 0x04000000;
}
else
{
return PR_FALSE;
}
return PR_TRUE;
}
// Merges the low six bits of continuation byte |c| into |ucs4|. |state| is the
// number of continuation bytes that still follow this one, which fixes the
// bit position (6 bits per remaining byte). Returns PR_FALSE, leaving |ucs4|
// untouched, if |c| is not a continuation byte.
static PRBool AddByte(char c, PRInt32 state, PRUint32& ucs4)
{
if ( UTF8traits::isInSeq(c) )
{
PRInt32 shift = state * 6;
ucs4 |= (PRUint32(c) & 0x3F) << shift;
return PR_TRUE;
}
return PR_FALSE;
}
};
/**
 * Decodes UTF-16 one character at a time.
 *
 * Both NextChar overloads extract the next UCS-4 character from the input,
 * combining a high/low surrogate pair into a single supplementary-plane
 * value. If non-null, |err| is set to PR_TRUE when no character could be
 * decoded (empty input, unpaired surrogate, or a high surrogate at the very
 * end of the input) and to PR_FALSE otherwise. On error the return value
 * is 0.
 */
class UTF16CharEnumerator
{
public:
  /**
   * Pointer flavour. Decodes the character starting at |*buffer| without
   * reading at or beyond |end|. |*buffer| is advanced past the character
   * only on success; on error it is left unchanged.
   */
  static PRUint32 NextChar(const PRUnichar **buffer, const PRUnichar *end,
                           PRBool *err = nsnull)
  {
    NS_ASSERTION(buffer && *buffer, "null buffer!");

    const PRUnichar *p = *buffer;

    if (p >= end)
      {
        NS_ERROR("No input to work with");
        if (err)
          *err = PR_TRUE;

        return 0;
      }

    PRUnichar c = *p++;

    if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
      {
        if (err)
          *err = PR_FALSE;
        *buffer = p;
        return c;
      }
    else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
      {
        // The low surrogate has to come from |p|, the position *after* the
        // high surrogate, so |p| is what must be checked against |end|.
        // (|*buffer| still points at the high surrogate and was already
        // known to be inside the buffer, so testing it would never catch a
        // trailing high surrogate and the read below would run past |end|.)
        if (p == end)
          {
            NS_ERROR("Unexpected end of buffer after high surrogate");
            if (err)
              *err = PR_TRUE;

            return 0;
          }

        // D800- DBFF - High Surrogate
        // N = (H - D800) * 400 + 10000 + ...   (all values hexadecimal)
        PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);

        c = *p++;
        if (0xDC00 == (0xFC00 & c))
          {
            // DC00- DFFF - Low Surrogate
            // N += ( L - DC00 )
            ucs4 |= (0x03FF & c);
            if (err)
              *err = PR_FALSE;
            *buffer = p;
            return ucs4;
          }
        else
          {
            NS_ERROR("got a High Surrogate but no low surrogate");
            // output nothing.
          }
      }
    else // U+DC00 - U+DFFF
      {
        // DC00- DFFF - Low Surrogate
        NS_ERROR("got a low Surrogate but no high surrogate");
        // output nothing.
      }

    // Only the unpaired-surrogate cases fall through to here.
    if (err)
      *err = PR_TRUE;
    return 0;
  }

  /**
   * Iterator flavour. Same decoding as the pointer overload, but |iter| is
   * advanced as code units are consumed, so on error it is left wherever
   * decoding stopped.
   */
  static PRUint32 NextChar(nsAString::const_iterator& iter,
                           const nsAString::const_iterator& end,
                           PRBool *err = nsnull)
  {
    if (iter == end)
      {
        if (err)
          *err = PR_TRUE;

        return 0;
      }

    PRUnichar c = *iter++;

    if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
      {
        if (err)
          *err = PR_FALSE;
        return c;
      }
    else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
      {
        // A high surrogate at the very end of the input has no partner.
        if (iter == end)
          {
            if (err)
              *err = PR_TRUE;

            return 0;
          }

        // D800- DBFF - High Surrogate
        // N = (H - D800) * 400 + 10000 + ...   (all values hexadecimal)
        PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);

        c = *iter++;
        if (0xDC00 == (0xFC00 & c))
          {
            // DC00- DFFF - Low Surrogate
            // N += ( L - DC00 )
            ucs4 |= (0x03FF & c);
            if (err)
              *err = PR_FALSE;
            return ucs4;
          }
        else
          {
            NS_ERROR("got a High Surrogate but no low surrogate");
            // output nothing.
          }
      }
    else // U+DC00 - U+DFFF
      {
        // DC00- DFFF - Low Surrogate
        NS_ERROR("got a low Surrogate but no high surrogate");
        // output nothing.
      }

    // Only the unpaired-surrogate cases fall through to here.
    if (err)
      *err = PR_TRUE;
    return 0;
  }
};
/**
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
* UTF-8 to UTF-16
@ -87,75 +429,18 @@ class ConvertUTF8toUTF16
buffer_type* out = mBuffer;
for ( ; p != end /* && *p */; )
{
char c = *p++;
PRBool overlong, err;
PRUint32 ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err,
&overlong);
if ( UTF8traits::isASCII(c) )
if ( err )
{
*out++ = buffer_type(c);
continue;
}
PRUint32 ucs4;
PRUint32 minUcs4;
PRInt32 state = 0;
if ( UTF8traits::is2byte(c) )
{
ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
state = 1;
minUcs4 = 0x00000080;
}
else if ( UTF8traits::is3byte(c) )
{
ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
state = 2;
minUcs4 = 0x00000800;
}
else if ( UTF8traits::is4byte(c) )
{
ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
state = 3;
minUcs4 = 0x00010000;
}
else if ( UTF8traits::is5byte(c) )
{
ucs4 = (PRUint32(c) << 24) & 0x03000000L;
state = 4;
minUcs4 = 0x00200000;
}
else if ( UTF8traits::is6byte(c) )
{
ucs4 = (PRUint32(c) << 30) & 0x40000000L;
state = 5;
minUcs4 = 0x04000000;
}
else
{
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
mErrorEncountered = PR_TRUE;
mBuffer = out;
return N;
}
while ( state-- )
{
c = *p++;
if ( UTF8traits::isInSeq(c) )
{
PRInt32 shift = state * 6;
ucs4 |= (PRUint32(c) & 0x3F) << shift;
}
else
{
NS_ERROR("not a UTF8 string");
mErrorEncountered = PR_TRUE;
mBuffer = out;
return N;
}
}
if ( ucs4 < minUcs4 )
if ( overlong )
{
// Overlong sequence
*out++ = UCS2_REPLACEMENT_CHAR;

View File

@ -1081,16 +1081,88 @@ StringEndsWith( const nsACString& aSource, const nsACString& aSubstring,
static const PRUnichar empty_buffer[1] = { '\0' };
NS_COM const nsAFlatString& EmptyString()
NS_COM
const nsAFlatString&
EmptyString()
{
static const nsDependentString sEmpty(empty_buffer);
return sEmpty;
}
NS_COM const nsAFlatCString& EmptyCString()
NS_COM
const nsAFlatCString&
EmptyCString()
{
static const nsDependentCString sEmpty((const char *)empty_buffer);
return sEmpty;
}
// Compares a UTF-8 string against a UTF-16 string without converting or
// copying either one.
//
// Returns 0 when both strings hold the same character sequence, a negative
// value (-1) when the UTF-8 string orders first, and 1 when the UTF-16 string
// orders first. If either string turns out to be malformed while decoding a
// non-ASCII character, PR_INT32_MIN is returned.
NS_COM PRInt32
CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String,
                   const nsASingleFragmentString& aUTF16String)
{
  static const PRUint32 NOT_ASCII = PRUint32(~0x7F);

  const char *narrowCur, *narrowEnd;
  aUTF8String.BeginReading(narrowCur);
  aUTF8String.EndReading(narrowEnd);

  const PRUnichar *wideCur, *wideEnd;
  aUTF16String.BeginReading(wideCur);
  aUTF16String.EndReading(wideEnd);

  while (narrowCur != narrowEnd && wideCur != wideEnd)
    {
      // Go through PRUint8 first so that a byte with the high bit set is not
      // sign-extended when widened to 32 bits.
      PRUint32 narrowChar = (PRUint8)*narrowCur;

      if (!(narrowChar & NOT_ASCII))
        {
          // Plain ASCII byte: compare it directly with the next UTF-16 code
          // unit and step both cursors by one.
          PRUint32 wideUnit = *wideCur;

          if (narrowChar != wideUnit)
            return narrowChar > wideUnit ? 1 : -1;

          ++narrowCur;
          ++wideCur;
          continue;
        }

      // Multibyte sequence: decode one full character from each side. The
      // enumerators advance the cursors themselves on success.
      PRBool err;

      narrowChar = UTF8CharEnumerator::NextChar(&narrowCur, narrowEnd, &err);
      if (err)
        return PR_INT32_MIN;

      PRUint32 wideChar = UTF16CharEnumerator::NextChar(&wideCur, wideEnd,
                                                        &err);
      if (err)
        return PR_INT32_MIN;

      if (narrowChar != wideChar)
        return narrowChar < wideChar ? -1 : 1;
    }

  // One side is exhausted. Whichever string still has input left is the
  // longer one and therefore orders after the other.
  if (narrowCur != narrowEnd)
    return 1;

  if (wideCur != wideEnd)
    return -1;

  // Both ran out together: the two strings match.
  return 0;
}