Move the core of NS_ConvertUCS2toUTF8 into character sinks in nsUTF8Utils.h, and use them to make ToNewUTF8String faster. Fix bug in surrogate handling in the moved code. Make various tweaks to improve performance of conversion between UCS2 and UTF-8 (both ways). b=206682 r=jag sr=jst

This commit is contained in:
dbaron%dbaron.org 2003-06-11 04:27:13 +00:00
parent f88fdaee47
commit 18b8c334fb
12 changed files with 652 additions and 346 deletions

View File

@ -45,6 +45,7 @@
#include "nsString.h"
#include "nsReadableUtils.h"
#include "nsDebug.h"
#include "nsUTF8Utils.h"
#ifndef nsCharTraits_h___
#include "nsCharTraits.h"
@ -1086,111 +1087,86 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P
//----------------------------------------------------------------------
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString )
{
nsAString::const_iterator start; aString.BeginReading(start);
nsAString::const_iterator end; aString.EndReading(end);
while (start != end) {
nsReadableFragment<PRUnichar> frag(start.fragment());
Append(frag.mStart, frag.mEnd - frag.mStart);
start.advance(start.size_forward());
}
if (!aString)
// Leave us as an uninitialized nsCAutoString.
return;
Init(aString, nsCharTraits<PRUnichar>::length(aString));
}
void
NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength )
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
{
// Handle null string by just leaving us as a brand-new
// uninitialized nsCAutoString.
if (! aString)
if (!aString)
// Leave us as an uninitialized nsCAutoString.
return;
Init(aString, aLength);
}
// Calculate how many bytes we need
const PRUnichar* p;
PRInt32 count, utf8len;
for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++)
// Construct from a single-fragment string: obtain the pointer to its
// characters and its length, then let |Init| do the conversion.
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString )
{
  nsASingleFragmentString::const_char_iterator first;
  const PRUnichar* chars = aString.BeginReading(first);
  const PRUint32 length = aString.Length();
  Init(chars, length);
}
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
nsAString::const_iterator start, end;
CalculateUTF8Size calculator;
copy_string(aString.BeginReading(start), aString.EndReading(end),
calculator);
PRUint32 count = calculator.Size();
if (count)
{
if (! ((*p) & 0xFF80))
utf8len += 1; // 0000 0000 - 0000 007F
else if (! ((*p) & 0xF800))
utf8len += 2; // 0000 0080 - 0000 07FF
else
utf8len += 3; // 0000 0800 - 0000 FFFF
// Note: Surrogate pair needs 4 bytes, but in this calcuation
// we count it as 6 bytes. It will waste 2 bytes per surrogate pair
}
// Grow the buffer if we need to.
SetCapacity(count);
// Make sure our buffer's big enough, so we don't need to do
// multiple allocations.
if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer))
SetCapacity(mLength+utf8len+1);
// |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h),
// we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want
// All ready? Time to convert
char* out = mStr+mLength;
PRUint32 ucs4=0;
for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++)
{
if (0 == ucs4)
ConvertUCS2toUTF8 converter(mStr);
copy_string(aString.BeginReading(start), aString.EndReading(end),
converter).write_terminator();
mLength = converter.Size();
if (mLength != count)
{
if (! ((*p) & 0xFF80))
{
*out++ = (char)*p;
}
else if (! ((*p) & 0xF800))
{
*out++ = 0xC0 | (char)((*p) >> 6);
*out++ = 0x80 | (char)(0x003F & (*p));
}
else
{
if (0xD800 == (0xFC00 & (*p)))
{
// D800- DBFF - High Surrogate
// N = (H- D800) *400 + 10000 + ...
ucs4 = 0x10000 | ((0x03FF & (*p)) << 10);
}
else if (0xDC00 == (0xFC00 & (*p)))
{
// DC00- DFFF - Low Surrogate
// error here. We should hit High Surrogate first
// Do not output any thing in this case
}
else
{
*out++ = 0xE0 | (char)((*p) >> 12);
*out++ = 0x80 | (char)(0x003F & (*p >> 6));
*out++ = 0x80 | (char)(0x003F & (*p) );
}
}
}
else
{
if (0xDC00 == (0xFC00 & (*p)))
{
// DC00- DFFF - Low Surrogate
// N += ( L - DC00 )
ucs4 |= (0x03FF & (*p));
// 0001 0000-001F FFFF
*out++ = 0xF0 | (char)(ucs4 >> 18);
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
*out++ = 0x80 | (char)(0x003F & ucs4) ;
}
else
{
// Got a High Surrogate but no low surrogate
// output nothing.
}
ucs4 = 0;
NS_ERROR("Input invalid or incorrect length was calculated");
Truncate();
}
}
}
*out = '\0'; // null terminate
mLength += utf8len;
void NS_ConvertUCS2toUTF8::Init( const PRUnichar* aString, PRUint32 aLength )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
CalculateUTF8Size calculator;
calculator.write(aString, aLength);
PRUint32 count = calculator.Size();
if (count)
{
// Grow the buffer if we need to.
SetCapacity(count);
// All ready? Time to convert
ConvertUCS2toUTF8 converter(mStr);
converter.write(aString, aLength);
mLength = converter.Size();
mStr[mLength] = char_type(0);
if (mLength != count)
{
NS_ERROR("Input invalid or incorrect length was calculated");
Truncate();
}
}
}
NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString )

View File

@ -431,24 +431,13 @@ class NS_COM NS_ConvertUCS2toUTF8
*/
{
public:
friend NS_COM char* ToNewUTF8String( const nsAString& aSource );
public:
explicit
NS_ConvertUCS2toUTF8( const PRUnichar* aString )
{
Append( aString, ~PRUint32(0) /* MAXINT */);
}
NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
{
Append( aString, aLength );
}
explicit NS_ConvertUCS2toUTF8( const PRUnichar* aString );
NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength );
explicit NS_ConvertUCS2toUTF8( const nsAString& aString );
explicit NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString );
protected:
void Append( const PRUnichar* aString, PRUint32 aLength );
void Init( const PRUnichar* aString, PRUint32 aLength );
private:
// NOT TO BE IMPLEMENTED

View File

@ -1351,34 +1351,82 @@ NS_ConvertASCIItoUCS2::NS_ConvertASCIItoUCS2( const nsACString& aCString )
}
}
void
NS_ConvertUTF8toUCS2::Init( const nsACString& aCString )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsACString& aCString )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
nsACString::const_iterator start, end;
CalculateUTF8Length calculator;
copy_string(aCString.BeginReading(start), aCString.EndReading(end), calculator);
nsACString::const_iterator start, end;
CalculateUTF8Length calculator;
copy_string(aCString.BeginReading(start), aCString.EndReading(end),
calculator);
PRUint32 count = calculator.Length();
PRUint32 count = calculator.Length();
if (count) {
// Grow the buffer if we need to.
SetLength(count);
if (count)
{
// Grow the buffer if we need to.
SetCapacity(count);
// All ready? Time to convert
// All ready? Time to convert
ConvertUTF8toUCS2 converter(mUStr);
copy_string(aCString.BeginReading(start), aCString.EndReading(end), converter);
mLength = converter.Length();
if (mLength != count) {
NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
Truncate();
}
ConvertUTF8toUCS2 converter(mUStr);
copy_string(aCString.BeginReading(start), aCString.EndReading(end),
converter).write_terminator();
mLength = converter.Length();
if (mLength != count)
{
NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
Truncate();
}
}
}
}
// Construct from a single-fragment string: obtain the pointer to its
// characters and its length, then let |Init| do the conversion.
NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString )
{
  nsASingleFragmentCString::const_char_iterator first;
  const char* bytes = aCString.BeginReading(first);
  const PRUint32 length = aCString.Length();
  Init(bytes, length);
}
// Construct from a null-terminated UTF-8 C string.
//
// A null |aCString| is tolerated and produces a default-constructed
// string, matching the null handling of the |const PRUnichar*|
// constructors of |NS_ConvertUCS2toUTF8|.  Without the guard the null
// pointer would be handed to |nsCharTraits<char>::length|.
NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString )
{
  if (!aCString)
    // Leave us as a default-constructed nsAutoString.
    return;

  Init(aCString, nsCharTraits<char>::length(aCString));
}
// Construct from |aLength| bytes of UTF-8 starting at |aCString|.
//
// A null |aCString| is tolerated and produces a default-constructed
// string, matching the null handling of the |const PRUnichar*|
// constructors of |NS_ConvertUCS2toUTF8|.  Without the guard |Init| would
// read |aLength| bytes through the null pointer.
NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength )
{
  if (!aCString)
    // Leave us as a default-constructed nsAutoString.
    return;

  Init(aCString, aLength);
}
void
NS_ConvertUTF8toUCS2::Init( const char* aCString, PRUint32 aLength )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
CalculateUTF8Length calculator;
calculator.write(aCString, aLength);
PRUint32 count = calculator.Length();
if (count)
{
// Grow the buffer if we need to.
SetCapacity(count);
// All ready? Time to convert
ConvertUTF8toUCS2 converter(mUStr);
converter.write(aCString, aLength);
mLength = converter.Length();
mUStr[mLength] = char_type(0);
if (mLength != count)
{
NS_ERROR("Input invalid or incorrect length was calculated");
Truncate();
}
}
}
/**
* Default copy constructor

View File

@ -530,25 +530,13 @@ class NS_COM NS_ConvertUTF8toUCS2
: public nsAutoString
{
public:
explicit
NS_ConvertUTF8toUCS2( const nsACString& aCString )
{
Init( aCString );
}
explicit
NS_ConvertUTF8toUCS2( const char* aCString )
{
Init( nsDependentCString( aCString ) );
}
NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength )
{
Init( Substring( aCString, aCString + aLength ) );
}
explicit NS_ConvertUTF8toUCS2( const nsACString& aCString );
explicit NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString );
explicit NS_ConvertUTF8toUCS2( const char* aCString );
NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength );
protected:
void Init( const nsACString& aCString );
void Init( const char* aCString, PRUint32 aLength );
private:
NS_ConvertUTF8toUCS2( PRUnichar );

View File

@ -54,6 +54,12 @@ class UTF8traits
#define PLANE1_BASE 0x00010000
#define UCS2_REPLACEMENT_CHAR 0xfffd
// NS_ALWAYS_INLINE is placed on the |write| methods of the character sinks
// in this header.  When compiling with GCC it expands to the
// |always_inline| function attribute; with any other compiler it expands
// to nothing.
#ifdef __GNUC__
#define NS_ALWAYS_INLINE __attribute__((always_inline))
#else
#define NS_ALWAYS_INLINE
#endif
/**
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
* UTF-8 to UCS2 (really UTF-16).
@ -69,7 +75,7 @@ class ConvertUTF8toUCS2
size_t Length() const { return mBuffer - mStart; }
PRUint32 write( const value_type* start, PRUint32 N )
PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
{
if ( mErrorEncountered )
return N;
@ -78,13 +84,14 @@ class ConvertUTF8toUCS2
// be spread across fragments
const value_type* p = start;
const value_type* end = start + N;
buffer_type* out = mBuffer;
for ( ; p != end /* && *p */; )
{
char c = *p++;
if ( UTF8traits::isASCII(c) )
{
*mBuffer++ = buffer_type(c);
*out++ = buffer_type(c);
continue;
}
@ -126,6 +133,7 @@ class ConvertUTF8toUCS2
{
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
mErrorEncountered = PR_TRUE;
mBuffer = out;
return N;
}
@ -142,6 +150,7 @@ class ConvertUTF8toUCS2
{
NS_ERROR("not a UTF8 string");
mErrorEncountered = PR_TRUE;
mBuffer = out;
return N;
}
}
@ -149,39 +158,40 @@ class ConvertUTF8toUCS2
if ( ucs4 < minUcs4 )
{
// Overlong sequence
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
*out++ = UCS2_REPLACEMENT_CHAR;
}
else if ( ucs4 <= 0xD7FF )
{
*mBuffer++ = ucs4;
*out++ = ucs4;
}
else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
{
// Surrogates
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
*out++ = UCS2_REPLACEMENT_CHAR;
}
else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
{
// Prohibited characters
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
*out++ = UCS2_REPLACEMENT_CHAR;
}
else if ( ucs4 >= PLANE1_BASE )
{
if ( ucs4 >= 0x00110000 )
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
*out++ = UCS2_REPLACEMENT_CHAR;
else {
// surrogate, see unicode specification 3.7 for following math.
ucs4 -= PLANE1_BASE;
*mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
*mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
*out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
*out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
}
}
else
{
if ( ucs4 != 0xFEFF ) // ignore BOM
*mBuffer++ = ucs4;
*out++ = ucs4;
}
}
mBuffer = out;
return p - start;
}
@ -191,7 +201,7 @@ class ConvertUTF8toUCS2
}
private:
buffer_type* mStart;
buffer_type* const mStart;
buffer_type* mBuffer;
PRBool mErrorEncountered;
};
@ -209,7 +219,7 @@ class CalculateUTF8Length
size_t Length() const { return mLength; }
PRUint32 write( const value_type* start, PRUint32 N )
PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
{
// ignore any further requests
if ( mErrorEncountered )
@ -255,4 +265,152 @@ class CalculateUTF8Length
PRBool mErrorEncountered;
};
/**
 * A character sink (see |copy_string| in nsAlgorithm.h) that encodes
 * UCS2 (really UTF-16) code units as UTF-8 into a caller-supplied buffer.
 *
 * No bounds checking is done on the output: the buffer must already be
 * large enough for everything written, plus one more element if
 * |write_terminator| is used.  Callers size it beforehand, e.g. with
 * |CalculateUTF8Size|, which skips exactly the same malformed input.
 */
class ConvertUCS2toUTF8
  {
    public:
      typedef nsAString::char_type  value_type;
      typedef nsACString::char_type buffer_type;

      // Malformed input is treated more leniently here than in
      // |ConvertUTF8toUCS2|: bad surrogates are reported and skipped
      // rather than stopping the conversion.  That is kept for
      // backwards compatibility.

      ConvertUCS2toUTF8( buffer_type* aBuffer )
          : mStart(aBuffer), mBuffer(aBuffer) {}

      // Number of UTF-8 bytes emitted so far.
      size_t Size() const { return mBuffer - mStart; }

      // Encodes the |N| code units at |start|.  Always reports all |N|
      // units as consumed, even when some of them were skipped.
      PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
        {
          // Use a local output cursor inside the loop and store it back
          // into the member on every exit path.
          buffer_type* dest = mBuffer;
          const value_type* const limit = start + N;

          for ( const value_type* src = start; src < limit; ++src )
            {
              value_type unit = *src;

              if ( (unit & 0xFF80) == 0 )        // U+0000 - U+007F: 1 byte
                {
                  *dest++ = char(unit);
                  continue;
                }

              if ( (unit & 0xF800) == 0 )        // U+0080 - U+07FF: 2 bytes
                {
                  *dest++ = 0xC0 | char(unit >> 6);
                  *dest++ = 0x80 | char(unit & 0x003F);
                  continue;
                }

              if ( (unit & 0xF800) != 0xD800 )   // U+0800 - U+D7FF, U+E000 - U+FFFF: 3 bytes
                {
                  *dest++ = 0xE0 | char(unit >> 12);
                  *dest++ = 0x80 | char((unit >> 6) & 0x003F);
                  *dest++ = 0x80 | char(unit & 0x003F);
                  continue;
                }

              if ( (unit & 0xFC00) == 0xD800 )   // U+D800 - U+DBFF: high surrogate
                {
                  // N = (H - D800) * 400 + 10000 + (L - DC00)
                  PRUint32 ucs4 = 0x10000 + ((unit & 0x03FF) << 10);

                  ++src;
                  if ( src == limit )
                    {
                      // The low half would be in the next fragment; the
                      // high half is dropped.
                      NS_ERROR("Surrogate pair split between fragments");
                      mBuffer = dest;
                      return N;
                    }

                  unit = *src;
                  if ( (unit & 0xFC00) != 0xDC00 )
                    {
                      // Both the high surrogate and the code unit that
                      // followed it are skipped; output nothing.
                      NS_ERROR("got a High Surrogate but no low surrogate");
                      continue;
                    }

                  // U+DC00 - U+DFFF: low surrogate completes the pair.
                  ucs4 |= (unit & 0x03FF);

                  // 0001 0000 - 001F FFFF: 4 bytes
                  *dest++ = 0xF0 | char(ucs4 >> 18);
                  *dest++ = 0x80 | char((ucs4 >> 12) & 0x003F);
                  *dest++ = 0x80 | char((ucs4 >> 6) & 0x003F);
                  *dest++ = 0x80 | char(ucs4 & 0x003F);
                  continue;
                }

              // U+DC00 - U+DFFF: low surrogate with no high surrogate
              // before it; output nothing.
              NS_ERROR("got a low Surrogate but no high surrogate");
            }

          mBuffer = dest;
          return N;
        }

      // Stores a zero element at the current output position without
      // advancing it, so |Size| is unaffected.
      void write_terminator()
        {
          *mBuffer = buffer_type(0);
        }

    private:
      buffer_type* const mStart;   // first element of the output buffer
      buffer_type*       mBuffer;  // next element to be written
  };
/**
 * A character sink (see |copy_string| in nsAlgorithm.h) that adds up the
 * number of bytes a UCS2 (really UTF-16) string would occupy in UTF-8.
 *
 * Malformed surrogates contribute nothing to the total, mirroring what
 * |ConvertUCS2toUTF8| leaves out of its output.
 */
class CalculateUTF8Size
  {
    public:
      typedef nsAString::char_type value_type;

      CalculateUTF8Size()
          : mSize(0) { }

      // Total number of UTF-8 bytes counted so far.
      size_t Size() const { return mSize; }

      // Accounts for the |N| code units at |start|.  Always reports all
      // |N| units as consumed.
      PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
        {
          // Assume UCS2 surrogate pairs won't be spread across fragments.
          size_t total = mSize;
          const value_type* const limit = start + N;

          for ( const value_type* src = start; src < limit; ++src )
            {
              value_type unit = *src;

              if ( (unit & 0xFF80) == 0 )        // U+0000 - U+007F
                {
                  total += 1;
                  continue;
                }

              if ( (unit & 0xF800) == 0 )        // U+0080 - U+07FF
                {
                  total += 2;
                  continue;
                }

              if ( (unit & 0xF800) != 0xD800 )   // U+0800 - U+D7FF, U+E000 - U+FFFF
                {
                  total += 3;
                  continue;
                }

              if ( (unit & 0xFC00) == 0xD800 )   // U+D800 - U+DBFF: high surrogate
                {
                  ++src;
                  if ( src == limit )
                    {
                      // The low half would be in the next fragment; the
                      // high half is not counted.
                      NS_ERROR("Surrogate pair split between fragments");
                      mSize = total;
                      return N;
                    }

                  unit = *src;
                  if ( (unit & 0xFC00) == 0xDC00 )
                    total += 4;                  // complete pair
                  else
                    NS_ERROR("got a high Surrogate but no low surrogate");
                  continue;
                }

              // U+DC00 - U+DFFF: low surrogate with no high surrogate
              // before it; counts for nothing.
              NS_ERROR("got a low Surrogate but no high surrogate");
            }

          mSize = total;
          return N;
        }

    private:
      size_t mSize;   // running byte total
  };
#endif /* !defined(nsUTF8Utils_h_) */

View File

@ -245,24 +245,18 @@ NS_COM
char*
ToNewUTF8String( const nsAString& aSource )
{
// XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
// refactored so that we can use it here without a double-copy.
NS_ConvertUCS2toUTF8 temp(aSource);
nsAString::const_iterator start, end;
CalculateUTF8Size calculator;
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
calculator);
char* result;
if (temp.GetOwnsBuffer()) {
// We allocated. Trick the string into not freeing its buffer to
// avoid an extra allocation.
result = temp.mStr;
char *result = NS_STATIC_CAST(char*,
nsMemory::Alloc(calculator.Size() + 1));
temp.mStr=0;
temp.SetOwnsBuffer(PR_FALSE);
}
else {
// We didn't allocate a buffer, so we need to copy it out of the
// nsCAutoString's storage.
result = ToNewCString(temp);
}
ConvertUCS2toUTF8 converter(result);
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
converter).write_terminator();
NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch");
return result;
}

View File

@ -45,6 +45,7 @@
#include "nsString.h"
#include "nsReadableUtils.h"
#include "nsDebug.h"
#include "nsUTF8Utils.h"
#ifndef nsCharTraits_h___
#include "nsCharTraits.h"
@ -1086,111 +1087,86 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P
//----------------------------------------------------------------------
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString )
{
nsAString::const_iterator start; aString.BeginReading(start);
nsAString::const_iterator end; aString.EndReading(end);
while (start != end) {
nsReadableFragment<PRUnichar> frag(start.fragment());
Append(frag.mStart, frag.mEnd - frag.mStart);
start.advance(start.size_forward());
}
if (!aString)
// Leave us as an uninitialized nsCAutoString.
return;
Init(aString, nsCharTraits<PRUnichar>::length(aString));
}
void
NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength )
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
{
// Handle null string by just leaving us as a brand-new
// uninitialized nsCAutoString.
if (! aString)
if (!aString)
// Leave us as an uninitialized nsCAutoString.
return;
Init(aString, aLength);
}
// Calculate how many bytes we need
const PRUnichar* p;
PRInt32 count, utf8len;
for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++)
// Construct from a single-fragment string: obtain the pointer to its
// characters and its length, then let |Init| do the conversion.
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString )
{
  nsASingleFragmentString::const_char_iterator first;
  const PRUnichar* chars = aString.BeginReading(first);
  const PRUint32 length = aString.Length();
  Init(chars, length);
}
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
nsAString::const_iterator start, end;
CalculateUTF8Size calculator;
copy_string(aString.BeginReading(start), aString.EndReading(end),
calculator);
PRUint32 count = calculator.Size();
if (count)
{
if (! ((*p) & 0xFF80))
utf8len += 1; // 0000 0000 - 0000 007F
else if (! ((*p) & 0xF800))
utf8len += 2; // 0000 0080 - 0000 07FF
else
utf8len += 3; // 0000 0800 - 0000 FFFF
// Note: Surrogate pair needs 4 bytes, but in this calcuation
// we count it as 6 bytes. It will waste 2 bytes per surrogate pair
}
// Grow the buffer if we need to.
SetCapacity(count);
// Make sure our buffer's big enough, so we don't need to do
// multiple allocations.
if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer))
SetCapacity(mLength+utf8len+1);
// |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h),
// we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want
// All ready? Time to convert
char* out = mStr+mLength;
PRUint32 ucs4=0;
for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++)
{
if (0 == ucs4)
ConvertUCS2toUTF8 converter(mStr);
copy_string(aString.BeginReading(start), aString.EndReading(end),
converter).write_terminator();
mLength = converter.Size();
if (mLength != count)
{
if (! ((*p) & 0xFF80))
{
*out++ = (char)*p;
}
else if (! ((*p) & 0xF800))
{
*out++ = 0xC0 | (char)((*p) >> 6);
*out++ = 0x80 | (char)(0x003F & (*p));
}
else
{
if (0xD800 == (0xFC00 & (*p)))
{
// D800- DBFF - High Surrogate
// N = (H- D800) *400 + 10000 + ...
ucs4 = 0x10000 | ((0x03FF & (*p)) << 10);
}
else if (0xDC00 == (0xFC00 & (*p)))
{
// DC00- DFFF - Low Surrogate
// error here. We should hit High Surrogate first
// Do not output any thing in this case
}
else
{
*out++ = 0xE0 | (char)((*p) >> 12);
*out++ = 0x80 | (char)(0x003F & (*p >> 6));
*out++ = 0x80 | (char)(0x003F & (*p) );
}
}
}
else
{
if (0xDC00 == (0xFC00 & (*p)))
{
// DC00- DFFF - Low Surrogate
// N += ( L - DC00 )
ucs4 |= (0x03FF & (*p));
// 0001 0000-001F FFFF
*out++ = 0xF0 | (char)(ucs4 >> 18);
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
*out++ = 0x80 | (char)(0x003F & ucs4) ;
}
else
{
// Got a High Surrogate but no low surrogate
// output nothing.
}
ucs4 = 0;
NS_ERROR("Input invalid or incorrect length was calculated");
Truncate();
}
}
}
*out = '\0'; // null terminate
mLength += utf8len;
void NS_ConvertUCS2toUTF8::Init( const PRUnichar* aString, PRUint32 aLength )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
CalculateUTF8Size calculator;
calculator.write(aString, aLength);
PRUint32 count = calculator.Size();
if (count)
{
// Grow the buffer if we need to.
SetCapacity(count);
// All ready? Time to convert
ConvertUCS2toUTF8 converter(mStr);
converter.write(aString, aLength);
mLength = converter.Size();
mStr[mLength] = char_type(0);
if (mLength != count)
{
NS_ERROR("Input invalid or incorrect length was calculated");
Truncate();
}
}
}
NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString )

View File

@ -431,24 +431,13 @@ class NS_COM NS_ConvertUCS2toUTF8
*/
{
public:
friend NS_COM char* ToNewUTF8String( const nsAString& aSource );
public:
explicit
NS_ConvertUCS2toUTF8( const PRUnichar* aString )
{
Append( aString, ~PRUint32(0) /* MAXINT */);
}
NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
{
Append( aString, aLength );
}
explicit NS_ConvertUCS2toUTF8( const PRUnichar* aString );
NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength );
explicit NS_ConvertUCS2toUTF8( const nsAString& aString );
explicit NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString );
protected:
void Append( const PRUnichar* aString, PRUint32 aLength );
void Init( const PRUnichar* aString, PRUint32 aLength );
private:
// NOT TO BE IMPLEMENTED

View File

@ -1351,34 +1351,82 @@ NS_ConvertASCIItoUCS2::NS_ConvertASCIItoUCS2( const nsACString& aCString )
}
}
void
NS_ConvertUTF8toUCS2::Init( const nsACString& aCString )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsACString& aCString )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
nsACString::const_iterator start, end;
CalculateUTF8Length calculator;
copy_string(aCString.BeginReading(start), aCString.EndReading(end), calculator);
nsACString::const_iterator start, end;
CalculateUTF8Length calculator;
copy_string(aCString.BeginReading(start), aCString.EndReading(end),
calculator);
PRUint32 count = calculator.Length();
PRUint32 count = calculator.Length();
if (count) {
// Grow the buffer if we need to.
SetLength(count);
if (count)
{
// Grow the buffer if we need to.
SetCapacity(count);
// All ready? Time to convert
// All ready? Time to convert
ConvertUTF8toUCS2 converter(mUStr);
copy_string(aCString.BeginReading(start), aCString.EndReading(end), converter);
mLength = converter.Length();
if (mLength != count) {
NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
Truncate();
}
ConvertUTF8toUCS2 converter(mUStr);
copy_string(aCString.BeginReading(start), aCString.EndReading(end),
converter).write_terminator();
mLength = converter.Length();
if (mLength != count)
{
NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
Truncate();
}
}
}
}
// Construct from a single-fragment string: obtain the pointer to its
// characters and its length, then let |Init| do the conversion.
NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString )
{
  nsASingleFragmentCString::const_char_iterator first;
  const char* bytes = aCString.BeginReading(first);
  const PRUint32 length = aCString.Length();
  Init(bytes, length);
}
// Construct from a null-terminated UTF-8 C string.
//
// A null |aCString| is tolerated and produces a default-constructed
// string, matching the null handling of the |const PRUnichar*|
// constructors of |NS_ConvertUCS2toUTF8|.  Without the guard the null
// pointer would be handed to |nsCharTraits<char>::length|.
NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString )
{
  if (!aCString)
    // Leave us as a default-constructed nsAutoString.
    return;

  Init(aCString, nsCharTraits<char>::length(aCString));
}
// Construct from |aLength| bytes of UTF-8 starting at |aCString|.
//
// A null |aCString| is tolerated and produces a default-constructed
// string, matching the null handling of the |const PRUnichar*|
// constructors of |NS_ConvertUCS2toUTF8|.  Without the guard |Init| would
// read |aLength| bytes through the null pointer.
NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength )
{
  if (!aCString)
    // Leave us as a default-constructed nsAutoString.
    return;

  Init(aCString, aLength);
}
void
NS_ConvertUTF8toUCS2::Init( const char* aCString, PRUint32 aLength )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
CalculateUTF8Length calculator;
calculator.write(aCString, aLength);
PRUint32 count = calculator.Length();
if (count)
{
// Grow the buffer if we need to.
SetCapacity(count);
// All ready? Time to convert
ConvertUTF8toUCS2 converter(mUStr);
converter.write(aCString, aLength);
mLength = converter.Length();
mUStr[mLength] = char_type(0);
if (mLength != count)
{
NS_ERROR("Input invalid or incorrect length was calculated");
Truncate();
}
}
}
/**
* Default copy constructor

View File

@ -530,25 +530,13 @@ class NS_COM NS_ConvertUTF8toUCS2
: public nsAutoString
{
public:
explicit
NS_ConvertUTF8toUCS2( const nsACString& aCString )
{
Init( aCString );
}
explicit
NS_ConvertUTF8toUCS2( const char* aCString )
{
Init( nsDependentCString( aCString ) );
}
NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength )
{
Init( Substring( aCString, aCString + aLength ) );
}
explicit NS_ConvertUTF8toUCS2( const nsACString& aCString );
explicit NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString );
explicit NS_ConvertUTF8toUCS2( const char* aCString );
NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength );
protected:
void Init( const nsACString& aCString );
void Init( const char* aCString, PRUint32 aLength );
private:
NS_ConvertUTF8toUCS2( PRUnichar );

View File

@ -54,6 +54,12 @@ class UTF8traits
#define PLANE1_BASE 0x00010000
#define UCS2_REPLACEMENT_CHAR 0xfffd
// NS_ALWAYS_INLINE is placed on the |write| methods of the character sinks
// in this header.  When compiling with GCC it expands to the
// |always_inline| function attribute; with any other compiler it expands
// to nothing.
#ifdef __GNUC__
#define NS_ALWAYS_INLINE __attribute__((always_inline))
#else
#define NS_ALWAYS_INLINE
#endif
/**
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
* UTF-8 to UCS2 (really UTF-16).
@ -69,7 +75,7 @@ class ConvertUTF8toUCS2
size_t Length() const { return mBuffer - mStart; }
PRUint32 write( const value_type* start, PRUint32 N )
PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
{
if ( mErrorEncountered )
return N;
@ -78,13 +84,14 @@ class ConvertUTF8toUCS2
// be spread across fragments
const value_type* p = start;
const value_type* end = start + N;
buffer_type* out = mBuffer;
for ( ; p != end /* && *p */; )
{
char c = *p++;
if ( UTF8traits::isASCII(c) )
{
*mBuffer++ = buffer_type(c);
*out++ = buffer_type(c);
continue;
}
@ -126,6 +133,7 @@ class ConvertUTF8toUCS2
{
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
mErrorEncountered = PR_TRUE;
mBuffer = out;
return N;
}
@ -142,6 +150,7 @@ class ConvertUTF8toUCS2
{
NS_ERROR("not a UTF8 string");
mErrorEncountered = PR_TRUE;
mBuffer = out;
return N;
}
}
@ -149,39 +158,40 @@ class ConvertUTF8toUCS2
if ( ucs4 < minUcs4 )
{
// Overlong sequence
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
*out++ = UCS2_REPLACEMENT_CHAR;
}
else if ( ucs4 <= 0xD7FF )
{
*mBuffer++ = ucs4;
*out++ = ucs4;
}
else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
{
// Surrogates
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
*out++ = UCS2_REPLACEMENT_CHAR;
}
else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
{
// Prohibited characters
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
*out++ = UCS2_REPLACEMENT_CHAR;
}
else if ( ucs4 >= PLANE1_BASE )
{
if ( ucs4 >= 0x00110000 )
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
*out++ = UCS2_REPLACEMENT_CHAR;
else {
// surrogate, see unicode specification 3.7 for following math.
ucs4 -= PLANE1_BASE;
*mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
*mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
*out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
*out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
}
}
else
{
if ( ucs4 != 0xFEFF ) // ignore BOM
*mBuffer++ = ucs4;
*out++ = ucs4;
}
}
mBuffer = out;
return p - start;
}
@ -191,7 +201,7 @@ class ConvertUTF8toUCS2
}
private:
buffer_type* mStart;
buffer_type* const mStart;
buffer_type* mBuffer;
PRBool mErrorEncountered;
};
@ -209,7 +219,7 @@ class CalculateUTF8Length
size_t Length() const { return mLength; }
PRUint32 write( const value_type* start, PRUint32 N )
PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
{
// ignore any further requests
if ( mErrorEncountered )
@ -255,4 +265,152 @@ class CalculateUTF8Length
PRBool mErrorEncountered;
};
/**
 * A character sink (see |copy_string| in nsAlgorithm.h) that encodes
 * UCS2 (really UTF-16) code units as UTF-8 into a caller-supplied buffer.
 *
 * No bounds checking is done on the output: the buffer must already be
 * large enough for everything written, plus one more element if
 * |write_terminator| is used.  Callers size it beforehand, e.g. with
 * |CalculateUTF8Size|, which skips exactly the same malformed input.
 */
class ConvertUCS2toUTF8
  {
    public:
      typedef nsAString::char_type  value_type;
      typedef nsACString::char_type buffer_type;

      // Malformed input is treated more leniently here than in
      // |ConvertUTF8toUCS2|: bad surrogates are reported and skipped
      // rather than stopping the conversion.  That is kept for
      // backwards compatibility.

      ConvertUCS2toUTF8( buffer_type* aBuffer )
          : mStart(aBuffer), mBuffer(aBuffer) {}

      // Number of UTF-8 bytes emitted so far.
      size_t Size() const { return mBuffer - mStart; }

      // Encodes the |N| code units at |start|.  Always reports all |N|
      // units as consumed, even when some of them were skipped.
      PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
        {
          // Use a local output cursor inside the loop and store it back
          // into the member on every exit path.
          buffer_type* dest = mBuffer;
          const value_type* const limit = start + N;

          for ( const value_type* src = start; src < limit; ++src )
            {
              value_type unit = *src;

              if ( (unit & 0xFF80) == 0 )        // U+0000 - U+007F: 1 byte
                {
                  *dest++ = char(unit);
                  continue;
                }

              if ( (unit & 0xF800) == 0 )        // U+0080 - U+07FF: 2 bytes
                {
                  *dest++ = 0xC0 | char(unit >> 6);
                  *dest++ = 0x80 | char(unit & 0x003F);
                  continue;
                }

              if ( (unit & 0xF800) != 0xD800 )   // U+0800 - U+D7FF, U+E000 - U+FFFF: 3 bytes
                {
                  *dest++ = 0xE0 | char(unit >> 12);
                  *dest++ = 0x80 | char((unit >> 6) & 0x003F);
                  *dest++ = 0x80 | char(unit & 0x003F);
                  continue;
                }

              if ( (unit & 0xFC00) == 0xD800 )   // U+D800 - U+DBFF: high surrogate
                {
                  // N = (H - D800) * 400 + 10000 + (L - DC00)
                  PRUint32 ucs4 = 0x10000 + ((unit & 0x03FF) << 10);

                  ++src;
                  if ( src == limit )
                    {
                      // The low half would be in the next fragment; the
                      // high half is dropped.
                      NS_ERROR("Surrogate pair split between fragments");
                      mBuffer = dest;
                      return N;
                    }

                  unit = *src;
                  if ( (unit & 0xFC00) != 0xDC00 )
                    {
                      // Both the high surrogate and the code unit that
                      // followed it are skipped; output nothing.
                      NS_ERROR("got a High Surrogate but no low surrogate");
                      continue;
                    }

                  // U+DC00 - U+DFFF: low surrogate completes the pair.
                  ucs4 |= (unit & 0x03FF);

                  // 0001 0000 - 001F FFFF: 4 bytes
                  *dest++ = 0xF0 | char(ucs4 >> 18);
                  *dest++ = 0x80 | char((ucs4 >> 12) & 0x003F);
                  *dest++ = 0x80 | char((ucs4 >> 6) & 0x003F);
                  *dest++ = 0x80 | char(ucs4 & 0x003F);
                  continue;
                }

              // U+DC00 - U+DFFF: low surrogate with no high surrogate
              // before it; output nothing.
              NS_ERROR("got a low Surrogate but no high surrogate");
            }

          mBuffer = dest;
          return N;
        }

      // Stores a zero element at the current output position without
      // advancing it, so |Size| is unaffected.
      void write_terminator()
        {
          *mBuffer = buffer_type(0);
        }

    private:
      buffer_type* const mStart;   // first element of the output buffer
      buffer_type*       mBuffer;  // next element to be written
  };
/**
 * A character sink (see |copy_string| in nsAlgorithm.h) that adds up the
 * number of bytes a UCS2 (really UTF-16) string would occupy in UTF-8.
 *
 * Malformed surrogates contribute nothing to the total, mirroring what
 * |ConvertUCS2toUTF8| leaves out of its output.
 */
class CalculateUTF8Size
  {
    public:
      typedef nsAString::char_type value_type;

      CalculateUTF8Size()
          : mSize(0) { }

      // Total number of UTF-8 bytes counted so far.
      size_t Size() const { return mSize; }

      // Accounts for the |N| code units at |start|.  Always reports all
      // |N| units as consumed.
      PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
        {
          // Assume UCS2 surrogate pairs won't be spread across fragments.
          size_t total = mSize;
          const value_type* const limit = start + N;

          for ( const value_type* src = start; src < limit; ++src )
            {
              value_type unit = *src;

              if ( (unit & 0xFF80) == 0 )        // U+0000 - U+007F
                {
                  total += 1;
                  continue;
                }

              if ( (unit & 0xF800) == 0 )        // U+0080 - U+07FF
                {
                  total += 2;
                  continue;
                }

              if ( (unit & 0xF800) != 0xD800 )   // U+0800 - U+D7FF, U+E000 - U+FFFF
                {
                  total += 3;
                  continue;
                }

              if ( (unit & 0xFC00) == 0xD800 )   // U+D800 - U+DBFF: high surrogate
                {
                  ++src;
                  if ( src == limit )
                    {
                      // The low half would be in the next fragment; the
                      // high half is not counted.
                      NS_ERROR("Surrogate pair split between fragments");
                      mSize = total;
                      return N;
                    }

                  unit = *src;
                  if ( (unit & 0xFC00) == 0xDC00 )
                    total += 4;                  // complete pair
                  else
                    NS_ERROR("got a high Surrogate but no low surrogate");
                  continue;
                }

              // U+DC00 - U+DFFF: low surrogate with no high surrogate
              // before it; counts for nothing.
              NS_ERROR("got a low Surrogate but no high surrogate");
            }

          mSize = total;
          return N;
        }

    private:
      size_t mSize;   // running byte total
  };
#endif /* !defined(nsUTF8Utils_h_) */

View File

@ -245,24 +245,18 @@ NS_COM
char*
ToNewUTF8String( const nsAString& aSource )
{
// XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
// refactored so that we can use it here without a double-copy.
NS_ConvertUCS2toUTF8 temp(aSource);
nsAString::const_iterator start, end;
CalculateUTF8Size calculator;
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
calculator);
char* result;
if (temp.GetOwnsBuffer()) {
// We allocated. Trick the string into not freeing its buffer to
// avoid an extra allocation.
result = temp.mStr;
char *result = NS_STATIC_CAST(char*,
nsMemory::Alloc(calculator.Size() + 1));
temp.mStr=0;
temp.SetOwnsBuffer(PR_FALSE);
}
else {
// We didn't allocate a buffer, so we need to copy it out of the
// nsCAutoString's storage.
result = ToNewCString(temp);
}
ConvertUCS2toUTF8 converter(result);
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
converter).write_terminator();
NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch");
return result;
}