mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-01-26 14:46:02 +00:00
Move the core of NS_ConvertUCS2toUTF8 into character sinks in nsUTF8Utils.h, and use them to make ToNewUTF8String faster. Fix bug in surrogate handling in the moved code. Fix null-termination bug in UTF8ToNewUnicode. b=206682 r=jag sr=alecf a=brendan
This commit is contained in:
parent
f6ad24ed15
commit
06133b6d3c
@ -45,6 +45,7 @@
|
||||
#include "nsString.h"
|
||||
#include "nsReadableUtils.h"
|
||||
#include "nsDebug.h"
|
||||
#include "nsUTF8Utils.h"
|
||||
|
||||
#ifndef nsCharTraits_h___
|
||||
#include "nsCharTraits.h"
|
||||
@ -54,8 +55,10 @@
|
||||
#include "prdtoa.h"
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
static const char* kPossibleNull = "Error: possible unintended null in string";
|
||||
static const char* kNullPointerError = "Error: unexpected null ptr";
|
||||
#endif
|
||||
static const char* kWhitespace="\b\t\r\n ";
|
||||
|
||||
const nsBufferHandle<char>*
|
||||
@ -1084,111 +1087,47 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
|
||||
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString )
|
||||
{
|
||||
nsAString::const_iterator start; aString.BeginReading(start);
|
||||
nsAString::const_iterator end; aString.EndReading(end);
|
||||
|
||||
while (start != end) {
|
||||
nsReadableFragment<PRUnichar> frag(start.fragment());
|
||||
Append(frag.mStart, frag.mEnd - frag.mStart);
|
||||
start.advance(start.size_forward());
|
||||
}
|
||||
if (!aString)
|
||||
// Leave us as an uninitialized nsCAutoString.
|
||||
return;
|
||||
Init(nsDependentString(aString));
|
||||
}
|
||||
|
||||
void
|
||||
NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength )
|
||||
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
|
||||
{
|
||||
// Handle null string by just leaving us as a brand-new
|
||||
// uninitialized nsCAutoString.
|
||||
if (! aString)
|
||||
if (!aString)
|
||||
// Leave us as an uninitialized nsCAutoString.
|
||||
return;
|
||||
Init(Substring(aString, aString + aLength));
|
||||
}
|
||||
|
||||
// Calculate how many bytes we need
|
||||
const PRUnichar* p;
|
||||
PRInt32 count, utf8len;
|
||||
for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++)
|
||||
{
|
||||
if (! ((*p) & 0xFF80))
|
||||
utf8len += 1; // 0000 0000 - 0000 007F
|
||||
else if (! ((*p) & 0xF800))
|
||||
utf8len += 2; // 0000 0080 - 0000 07FF
|
||||
else
|
||||
utf8len += 3; // 0000 0800 - 0000 FFFF
|
||||
// Note: Surrogate pair needs 4 bytes, but in this calculation
|
||||
// we count it as 6 bytes. It will waste 2 bytes per surrogate pair
|
||||
void NS_ConvertUCS2toUTF8::Init( const nsAString& aString )
|
||||
{
|
||||
// Compute space required: do this once so we don't incur multiple
|
||||
// allocations. This "optimization" is probably of dubious value...
|
||||
|
||||
nsAString::const_iterator start, end;
|
||||
CalculateUTF8Size calculator;
|
||||
copy_string(aString.BeginReading(start), aString.EndReading(end), calculator);
|
||||
|
||||
PRUint32 count = calculator.Size();
|
||||
|
||||
if (count) {
|
||||
// Grow the buffer if we need to.
|
||||
SetLength(count);
|
||||
|
||||
// All ready? Time to convert
|
||||
|
||||
ConvertUCS2toUTF8 converter(mStr);
|
||||
copy_string(aString.BeginReading(start), aString.EndReading(end), converter);
|
||||
mLength = converter.Size();
|
||||
if (mLength != count) {
|
||||
NS_ERROR("Input invalid or incorrect length was calculated");
|
||||
Truncate();
|
||||
}
|
||||
|
||||
// Make sure our buffer's big enough, so we don't need to do
|
||||
// multiple allocations.
|
||||
if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer))
|
||||
SetCapacity(mLength+utf8len+1);
|
||||
// |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h),
|
||||
// we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want
|
||||
|
||||
char* out = mStr+mLength;
|
||||
PRUint32 ucs4=0;
|
||||
|
||||
for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++)
|
||||
{
|
||||
if (0 == ucs4)
|
||||
{
|
||||
if (! ((*p) & 0xFF80))
|
||||
{
|
||||
*out++ = (char)*p;
|
||||
}
|
||||
else if (! ((*p) & 0xF800))
|
||||
{
|
||||
*out++ = 0xC0 | (char)((*p) >> 6);
|
||||
*out++ = 0x80 | (char)(0x003F & (*p));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (0xD800 == (0xFC00 & (*p)))
|
||||
{
|
||||
// D800- DBFF - High Surrogate
|
||||
// N = (H- D800) *400 + 10000 + ...
|
||||
ucs4 = 0x10000 | ((0x03FF & (*p)) << 10);
|
||||
}
|
||||
else if (0xDC00 == (0xFC00 & (*p)))
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
// error here. We should hit High Surrogate first
|
||||
// Do not output any thing in this case
|
||||
}
|
||||
else
|
||||
{
|
||||
*out++ = 0xE0 | (char)((*p) >> 12);
|
||||
*out++ = 0x80 | (char)(0x003F & (*p >> 6));
|
||||
*out++ = 0x80 | (char)(0x003F & (*p) );
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (0xDC00 == (0xFC00 & (*p)))
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
// N += ( L - DC00 )
|
||||
ucs4 |= (0x03FF & (*p));
|
||||
|
||||
// 0001 0000-001F FFFF
|
||||
*out++ = 0xF0 | (char)(ucs4 >> 18);
|
||||
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
|
||||
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
|
||||
*out++ = 0x80 | (char)(0x003F & ucs4) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Got a High Surrogate but no low surrogate
|
||||
// output nothing.
|
||||
}
|
||||
ucs4 = 0;
|
||||
}
|
||||
}
|
||||
|
||||
*out = '\0'; // null terminate
|
||||
mLength += utf8len;
|
||||
}
|
||||
}
|
||||
|
||||
NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString )
|
||||
|
@ -431,24 +431,15 @@ class NS_COM NS_ConvertUCS2toUTF8
|
||||
*/
|
||||
{
|
||||
public:
|
||||
friend NS_COM char* ToNewUTF8String( const nsAString& aSource );
|
||||
|
||||
public:
|
||||
explicit
|
||||
NS_ConvertUCS2toUTF8( const PRUnichar* aString )
|
||||
explicit NS_ConvertUCS2toUTF8( const PRUnichar* aString );
|
||||
NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength );
|
||||
explicit NS_ConvertUCS2toUTF8( const nsAString& aString )
|
||||
{
|
||||
Append( aString, ~PRUint32(0) /* MAXINT */);
|
||||
Init(aString);
|
||||
}
|
||||
|
||||
NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
|
||||
{
|
||||
Append( aString, aLength );
|
||||
}
|
||||
|
||||
explicit NS_ConvertUCS2toUTF8( const nsAString& aString );
|
||||
|
||||
protected:
|
||||
void Append( const PRUnichar* aString, PRUint32 aLength );
|
||||
void Init( const nsAString& aString );
|
||||
|
||||
private:
|
||||
// NOT TO BE IMPLEMENTED
|
||||
|
@ -54,8 +54,10 @@
|
||||
#include "prdtoa.h"
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
static const char* kPossibleNull = "Error: possible unintended null in string";
|
||||
static const char* kNullPointerError = "Error: unexpected null ptr";
|
||||
#endif
|
||||
static const char* kWhitespace="\b\t\r\n ";
|
||||
|
||||
const nsBufferHandle<PRUnichar>*
|
||||
|
@ -54,6 +54,10 @@ class UTF8traits
|
||||
#define PLANE1_BASE 0x00010000
|
||||
#define UCS2_REPLACEMENT_CHAR 0xfffd
|
||||
|
||||
/**
|
||||
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
|
||||
* UTF-8 to UCS2 (really UTF-16).
|
||||
*/
|
||||
class ConvertUTF8toUCS2
|
||||
{
|
||||
public:
|
||||
@ -181,12 +185,21 @@ class ConvertUTF8toUCS2
|
||||
return p - start;
|
||||
}
|
||||
|
||||
void write_terminator()
|
||||
{
|
||||
*mBuffer = buffer_type(0);
|
||||
}
|
||||
|
||||
private:
|
||||
buffer_type* mStart;
|
||||
buffer_type* mBuffer;
|
||||
PRBool mErrorEncountered;
|
||||
};
|
||||
|
||||
/**
|
||||
* A character sink (see |copy_string| in nsAlgorithm.h) for computing
|
||||
* the length of a UTF-8 string.
|
||||
*/
|
||||
class CalculateUTF8Length
|
||||
{
|
||||
public:
|
||||
@ -242,4 +255,148 @@ class CalculateUTF8Length
|
||||
PRBool mErrorEncountered;
|
||||
};
|
||||
|
||||
/**
|
||||
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
|
||||
* UCS2 (really UTF-16) to UTF-8.
|
||||
*/
|
||||
class ConvertUCS2toUTF8
|
||||
{
|
||||
public:
|
||||
typedef nsAString::char_type value_type;
|
||||
typedef nsACString::char_type buffer_type;
|
||||
|
||||
// The error handling here is more lenient than that in
|
||||
// |ConvertUTF8toUCS2|, but it's that way for backwards
|
||||
// compatibility.
|
||||
|
||||
ConvertUCS2toUTF8( buffer_type* aBuffer )
|
||||
: mStart(aBuffer), mBuffer(aBuffer) {}
|
||||
|
||||
size_t Size() const { return mBuffer - mStart; }
|
||||
|
||||
PRUint32 write( const value_type* start, PRUint32 N )
|
||||
{
|
||||
for (const value_type *p = start, *end = start + N; p < end; ++p )
|
||||
{
|
||||
value_type c = *p;
|
||||
if (! (c & 0xFF80)) // U+0000 - U+007F
|
||||
{
|
||||
*mBuffer++ = (char)c;
|
||||
}
|
||||
else if (! (c & 0xF800)) // U+0100 - U+07FF
|
||||
{
|
||||
*mBuffer++ = 0xC0 | (char)(c >> 6);
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & c);
|
||||
}
|
||||
else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
|
||||
{
|
||||
// D800- DBFF - High Surrogate
|
||||
// N = (H- D800) *400 + 10000 + ...
|
||||
PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
|
||||
|
||||
++p;
|
||||
if (p == end)
|
||||
{
|
||||
NS_ERROR("Surrogate pair split between fragments");
|
||||
return N;
|
||||
}
|
||||
c = *p;
|
||||
|
||||
if (0xDC00 == (0xFC00 & c))
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
// N += ( L - DC00 )
|
||||
ucs4 |= (0x03FF & c);
|
||||
|
||||
// 0001 0000-001F FFFF
|
||||
*mBuffer++ = 0xF0 | (char)(ucs4 >> 18);
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & ucs4) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
NS_ERROR("got a High Surrogate but no low surrogate");
|
||||
// output nothing.
|
||||
}
|
||||
}
|
||||
else if (0xDC00 == (0xFC00 & c)) // U+DC00 - U+DFFF
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
NS_ERROR("got a low Surrogate but no high surrogate");
|
||||
// output nothing.
|
||||
}
|
||||
else // U+0800 - U+D7FF, U+E000 - U+FFFF
|
||||
{
|
||||
*mBuffer++ = 0xE0 | (char)(c >> 12);
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & (c >> 6));
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & c );
|
||||
}
|
||||
}
|
||||
|
||||
return N;
|
||||
}
|
||||
|
||||
void write_terminator()
|
||||
{
|
||||
*mBuffer = buffer_type(0);
|
||||
}
|
||||
|
||||
private:
|
||||
buffer_type* mStart;
|
||||
buffer_type* mBuffer;
|
||||
};
|
||||
|
||||
/**
|
||||
* A character sink (see |copy_string| in nsAlgorithm.h) for computing
|
||||
* the number of bytes a UCS2 (really UTF-16) would occupy in UTF-8.
|
||||
*/
|
||||
class CalculateUTF8Size
|
||||
{
|
||||
public:
|
||||
typedef nsAString::char_type value_type;
|
||||
|
||||
CalculateUTF8Size()
|
||||
: mSize(0) { }
|
||||
|
||||
size_t Size() const { return mSize; }
|
||||
|
||||
PRUint32 write( const value_type* start, PRUint32 N )
|
||||
{
|
||||
// Assume UCS2 surrogate pairs won't be spread across fragments.
|
||||
for (const value_type *p = start, *end = start + N; p < end; ++p )
|
||||
{
|
||||
value_type c = *p;
|
||||
if (! (c & 0xFF80)) // U+0000 - U+007F
|
||||
mSize += 1;
|
||||
else if (! (c & 0xF800)) // U+0100 - U+07FF
|
||||
mSize += 2;
|
||||
else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
|
||||
{
|
||||
++p;
|
||||
if (p == end)
|
||||
{
|
||||
NS_ERROR("Surrogate pair split between fragments");
|
||||
return N;
|
||||
}
|
||||
c = *p;
|
||||
|
||||
if (0xDC00 == (0xFC00 & c))
|
||||
mSize += 4;
|
||||
else
|
||||
NS_ERROR("got a high Surrogate but no low surrogate");
|
||||
}
|
||||
else if (0xDC00 == (0xFC00 & c)) // U+DC00 - U+DFFF
|
||||
NS_ERROR("got a low Surrogate but no high surrogate");
|
||||
else // U+0800 - U+D7FF, U+E000 - U+FFFF
|
||||
mSize += 3;
|
||||
}
|
||||
|
||||
return N;
|
||||
}
|
||||
|
||||
private:
|
||||
size_t mSize;
|
||||
};
|
||||
|
||||
#endif /* !defined(nsUTF8Utils_h_) */
|
||||
|
@ -209,24 +209,18 @@ NS_COM
|
||||
char*
|
||||
ToNewUTF8String( const nsAString& aSource )
|
||||
{
|
||||
// XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
|
||||
// refactored so that we can use it here without a double-copy.
|
||||
NS_ConvertUCS2toUTF8 temp(aSource);
|
||||
nsAString::const_iterator start, end;
|
||||
CalculateUTF8Size calculator;
|
||||
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
|
||||
calculator);
|
||||
|
||||
char* result;
|
||||
if (temp.GetOwnsBuffer()) {
|
||||
// We allocated. Trick the string into not freeing its buffer to
|
||||
// avoid an extra allocation.
|
||||
result = temp.mStr;
|
||||
char *result = NS_STATIC_CAST(char*,
|
||||
nsMemory::Alloc(calculator.Size() + 1));
|
||||
|
||||
temp.mStr=0;
|
||||
temp.SetOwnsBuffer(PR_FALSE);
|
||||
}
|
||||
else {
|
||||
// We didn't allocate a buffer, so we need to copy it out of the
|
||||
// nsCAutoString's storage.
|
||||
result = ToNewCString(temp);
|
||||
}
|
||||
ConvertUCS2toUTF8 converter(result);
|
||||
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
|
||||
converter).write_terminator();
|
||||
NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch");
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -285,7 +279,7 @@ UTF8ToNewUnicode( const nsACString& aSource )
|
||||
|
||||
ConvertUTF8toUCS2 converter(result);
|
||||
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
|
||||
converter);
|
||||
converter).write_terminator();
|
||||
NS_ASSERTION(calculator.Length() == converter.Length(), "length mismatch");
|
||||
|
||||
return result;
|
||||
|
@ -45,6 +45,7 @@
|
||||
#include "nsString.h"
|
||||
#include "nsReadableUtils.h"
|
||||
#include "nsDebug.h"
|
||||
#include "nsUTF8Utils.h"
|
||||
|
||||
#ifndef nsCharTraits_h___
|
||||
#include "nsCharTraits.h"
|
||||
@ -54,8 +55,10 @@
|
||||
#include "prdtoa.h"
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
static const char* kPossibleNull = "Error: possible unintended null in string";
|
||||
static const char* kNullPointerError = "Error: unexpected null ptr";
|
||||
#endif
|
||||
static const char* kWhitespace="\b\t\r\n ";
|
||||
|
||||
const nsBufferHandle<char>*
|
||||
@ -1084,111 +1087,47 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
|
||||
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString )
|
||||
{
|
||||
nsAString::const_iterator start; aString.BeginReading(start);
|
||||
nsAString::const_iterator end; aString.EndReading(end);
|
||||
|
||||
while (start != end) {
|
||||
nsReadableFragment<PRUnichar> frag(start.fragment());
|
||||
Append(frag.mStart, frag.mEnd - frag.mStart);
|
||||
start.advance(start.size_forward());
|
||||
}
|
||||
if (!aString)
|
||||
// Leave us as an uninitialized nsCAutoString.
|
||||
return;
|
||||
Init(nsDependentString(aString));
|
||||
}
|
||||
|
||||
void
|
||||
NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength )
|
||||
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
|
||||
{
|
||||
// Handle null string by just leaving us as a brand-new
|
||||
// uninitialized nsCAutoString.
|
||||
if (! aString)
|
||||
if (!aString)
|
||||
// Leave us as an uninitialized nsCAutoString.
|
||||
return;
|
||||
Init(Substring(aString, aString + aLength));
|
||||
}
|
||||
|
||||
// Calculate how many bytes we need
|
||||
const PRUnichar* p;
|
||||
PRInt32 count, utf8len;
|
||||
for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++)
|
||||
{
|
||||
if (! ((*p) & 0xFF80))
|
||||
utf8len += 1; // 0000 0000 - 0000 007F
|
||||
else if (! ((*p) & 0xF800))
|
||||
utf8len += 2; // 0000 0080 - 0000 07FF
|
||||
else
|
||||
utf8len += 3; // 0000 0800 - 0000 FFFF
|
||||
// Note: Surrogate pair needs 4 bytes, but in this calculation
|
||||
// we count it as 6 bytes. It will waste 2 bytes per surrogate pair
|
||||
void NS_ConvertUCS2toUTF8::Init( const nsAString& aString )
|
||||
{
|
||||
// Compute space required: do this once so we don't incur multiple
|
||||
// allocations. This "optimization" is probably of dubious value...
|
||||
|
||||
nsAString::const_iterator start, end;
|
||||
CalculateUTF8Size calculator;
|
||||
copy_string(aString.BeginReading(start), aString.EndReading(end), calculator);
|
||||
|
||||
PRUint32 count = calculator.Size();
|
||||
|
||||
if (count) {
|
||||
// Grow the buffer if we need to.
|
||||
SetLength(count);
|
||||
|
||||
// All ready? Time to convert
|
||||
|
||||
ConvertUCS2toUTF8 converter(mStr);
|
||||
copy_string(aString.BeginReading(start), aString.EndReading(end), converter);
|
||||
mLength = converter.Size();
|
||||
if (mLength != count) {
|
||||
NS_ERROR("Input invalid or incorrect length was calculated");
|
||||
Truncate();
|
||||
}
|
||||
|
||||
// Make sure our buffer's big enough, so we don't need to do
|
||||
// multiple allocations.
|
||||
if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer))
|
||||
SetCapacity(mLength+utf8len+1);
|
||||
// |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h),
|
||||
// we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want
|
||||
|
||||
char* out = mStr+mLength;
|
||||
PRUint32 ucs4=0;
|
||||
|
||||
for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++)
|
||||
{
|
||||
if (0 == ucs4)
|
||||
{
|
||||
if (! ((*p) & 0xFF80))
|
||||
{
|
||||
*out++ = (char)*p;
|
||||
}
|
||||
else if (! ((*p) & 0xF800))
|
||||
{
|
||||
*out++ = 0xC0 | (char)((*p) >> 6);
|
||||
*out++ = 0x80 | (char)(0x003F & (*p));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (0xD800 == (0xFC00 & (*p)))
|
||||
{
|
||||
// D800- DBFF - High Surrogate
|
||||
// N = (H- D800) *400 + 10000 + ...
|
||||
ucs4 = 0x10000 | ((0x03FF & (*p)) << 10);
|
||||
}
|
||||
else if (0xDC00 == (0xFC00 & (*p)))
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
// error here. We should hit High Surrogate first
|
||||
// Do not output any thing in this case
|
||||
}
|
||||
else
|
||||
{
|
||||
*out++ = 0xE0 | (char)((*p) >> 12);
|
||||
*out++ = 0x80 | (char)(0x003F & (*p >> 6));
|
||||
*out++ = 0x80 | (char)(0x003F & (*p) );
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (0xDC00 == (0xFC00 & (*p)))
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
// N += ( L - DC00 )
|
||||
ucs4 |= (0x03FF & (*p));
|
||||
|
||||
// 0001 0000-001F FFFF
|
||||
*out++ = 0xF0 | (char)(ucs4 >> 18);
|
||||
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
|
||||
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
|
||||
*out++ = 0x80 | (char)(0x003F & ucs4) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Got a High Surrogate but no low surrogate
|
||||
// output nothing.
|
||||
}
|
||||
ucs4 = 0;
|
||||
}
|
||||
}
|
||||
|
||||
*out = '\0'; // null terminate
|
||||
mLength += utf8len;
|
||||
}
|
||||
}
|
||||
|
||||
NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString )
|
||||
|
@ -431,24 +431,15 @@ class NS_COM NS_ConvertUCS2toUTF8
|
||||
*/
|
||||
{
|
||||
public:
|
||||
friend NS_COM char* ToNewUTF8String( const nsAString& aSource );
|
||||
|
||||
public:
|
||||
explicit
|
||||
NS_ConvertUCS2toUTF8( const PRUnichar* aString )
|
||||
explicit NS_ConvertUCS2toUTF8( const PRUnichar* aString );
|
||||
NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength );
|
||||
explicit NS_ConvertUCS2toUTF8( const nsAString& aString )
|
||||
{
|
||||
Append( aString, ~PRUint32(0) /* MAXINT */);
|
||||
Init(aString);
|
||||
}
|
||||
|
||||
NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
|
||||
{
|
||||
Append( aString, aLength );
|
||||
}
|
||||
|
||||
explicit NS_ConvertUCS2toUTF8( const nsAString& aString );
|
||||
|
||||
protected:
|
||||
void Append( const PRUnichar* aString, PRUint32 aLength );
|
||||
void Init( const nsAString& aString );
|
||||
|
||||
private:
|
||||
// NOT TO BE IMPLEMENTED
|
||||
|
@ -54,8 +54,10 @@
|
||||
#include "prdtoa.h"
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
static const char* kPossibleNull = "Error: possible unintended null in string";
|
||||
static const char* kNullPointerError = "Error: unexpected null ptr";
|
||||
#endif
|
||||
static const char* kWhitespace="\b\t\r\n ";
|
||||
|
||||
const nsBufferHandle<PRUnichar>*
|
||||
|
@ -54,6 +54,10 @@ class UTF8traits
|
||||
#define PLANE1_BASE 0x00010000
|
||||
#define UCS2_REPLACEMENT_CHAR 0xfffd
|
||||
|
||||
/**
|
||||
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
|
||||
* UTF-8 to UCS2 (really UTF-16).
|
||||
*/
|
||||
class ConvertUTF8toUCS2
|
||||
{
|
||||
public:
|
||||
@ -181,12 +185,21 @@ class ConvertUTF8toUCS2
|
||||
return p - start;
|
||||
}
|
||||
|
||||
void write_terminator()
|
||||
{
|
||||
*mBuffer = buffer_type(0);
|
||||
}
|
||||
|
||||
private:
|
||||
buffer_type* mStart;
|
||||
buffer_type* mBuffer;
|
||||
PRBool mErrorEncountered;
|
||||
};
|
||||
|
||||
/**
|
||||
* A character sink (see |copy_string| in nsAlgorithm.h) for computing
|
||||
* the length of a UTF-8 string.
|
||||
*/
|
||||
class CalculateUTF8Length
|
||||
{
|
||||
public:
|
||||
@ -242,4 +255,148 @@ class CalculateUTF8Length
|
||||
PRBool mErrorEncountered;
|
||||
};
|
||||
|
||||
/**
|
||||
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
|
||||
* UCS2 (really UTF-16) to UTF-8.
|
||||
*/
|
||||
class ConvertUCS2toUTF8
|
||||
{
|
||||
public:
|
||||
typedef nsAString::char_type value_type;
|
||||
typedef nsACString::char_type buffer_type;
|
||||
|
||||
// The error handling here is more lenient than that in
|
||||
// |ConvertUTF8toUCS2|, but it's that way for backwards
|
||||
// compatibility.
|
||||
|
||||
ConvertUCS2toUTF8( buffer_type* aBuffer )
|
||||
: mStart(aBuffer), mBuffer(aBuffer) {}
|
||||
|
||||
size_t Size() const { return mBuffer - mStart; }
|
||||
|
||||
PRUint32 write( const value_type* start, PRUint32 N )
|
||||
{
|
||||
for (const value_type *p = start, *end = start + N; p < end; ++p )
|
||||
{
|
||||
value_type c = *p;
|
||||
if (! (c & 0xFF80)) // U+0000 - U+007F
|
||||
{
|
||||
*mBuffer++ = (char)c;
|
||||
}
|
||||
else if (! (c & 0xF800)) // U+0100 - U+07FF
|
||||
{
|
||||
*mBuffer++ = 0xC0 | (char)(c >> 6);
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & c);
|
||||
}
|
||||
else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
|
||||
{
|
||||
// D800- DBFF - High Surrogate
|
||||
// N = (H- D800) *400 + 10000 + ...
|
||||
PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
|
||||
|
||||
++p;
|
||||
if (p == end)
|
||||
{
|
||||
NS_ERROR("Surrogate pair split between fragments");
|
||||
return N;
|
||||
}
|
||||
c = *p;
|
||||
|
||||
if (0xDC00 == (0xFC00 & c))
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
// N += ( L - DC00 )
|
||||
ucs4 |= (0x03FF & c);
|
||||
|
||||
// 0001 0000-001F FFFF
|
||||
*mBuffer++ = 0xF0 | (char)(ucs4 >> 18);
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & ucs4) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
NS_ERROR("got a High Surrogate but no low surrogate");
|
||||
// output nothing.
|
||||
}
|
||||
}
|
||||
else if (0xDC00 == (0xFC00 & c)) // U+DC00 - U+DFFF
|
||||
{
|
||||
// DC00- DFFF - Low Surrogate
|
||||
NS_ERROR("got a low Surrogate but no high surrogate");
|
||||
// output nothing.
|
||||
}
|
||||
else // U+0800 - U+D7FF, U+E000 - U+FFFF
|
||||
{
|
||||
*mBuffer++ = 0xE0 | (char)(c >> 12);
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & (c >> 6));
|
||||
*mBuffer++ = 0x80 | (char)(0x003F & c );
|
||||
}
|
||||
}
|
||||
|
||||
return N;
|
||||
}
|
||||
|
||||
void write_terminator()
|
||||
{
|
||||
*mBuffer = buffer_type(0);
|
||||
}
|
||||
|
||||
private:
|
||||
buffer_type* mStart;
|
||||
buffer_type* mBuffer;
|
||||
};
|
||||
|
||||
/**
|
||||
* A character sink (see |copy_string| in nsAlgorithm.h) for computing
|
||||
* the number of bytes a UCS2 (really UTF-16) would occupy in UTF-8.
|
||||
*/
|
||||
class CalculateUTF8Size
|
||||
{
|
||||
public:
|
||||
typedef nsAString::char_type value_type;
|
||||
|
||||
CalculateUTF8Size()
|
||||
: mSize(0) { }
|
||||
|
||||
size_t Size() const { return mSize; }
|
||||
|
||||
PRUint32 write( const value_type* start, PRUint32 N )
|
||||
{
|
||||
// Assume UCS2 surrogate pairs won't be spread across fragments.
|
||||
for (const value_type *p = start, *end = start + N; p < end; ++p )
|
||||
{
|
||||
value_type c = *p;
|
||||
if (! (c & 0xFF80)) // U+0000 - U+007F
|
||||
mSize += 1;
|
||||
else if (! (c & 0xF800)) // U+0100 - U+07FF
|
||||
mSize += 2;
|
||||
else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
|
||||
{
|
||||
++p;
|
||||
if (p == end)
|
||||
{
|
||||
NS_ERROR("Surrogate pair split between fragments");
|
||||
return N;
|
||||
}
|
||||
c = *p;
|
||||
|
||||
if (0xDC00 == (0xFC00 & c))
|
||||
mSize += 4;
|
||||
else
|
||||
NS_ERROR("got a high Surrogate but no low surrogate");
|
||||
}
|
||||
else if (0xDC00 == (0xFC00 & c)) // U+DC00 - U+DFFF
|
||||
NS_ERROR("got a low Surrogate but no high surrogate");
|
||||
else // U+0800 - U+D7FF, U+E000 - U+FFFF
|
||||
mSize += 3;
|
||||
}
|
||||
|
||||
return N;
|
||||
}
|
||||
|
||||
private:
|
||||
size_t mSize;
|
||||
};
|
||||
|
||||
#endif /* !defined(nsUTF8Utils_h_) */
|
||||
|
@ -209,24 +209,18 @@ NS_COM
|
||||
char*
|
||||
ToNewUTF8String( const nsAString& aSource )
|
||||
{
|
||||
// XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
|
||||
// refactored so that we can use it here without a double-copy.
|
||||
NS_ConvertUCS2toUTF8 temp(aSource);
|
||||
nsAString::const_iterator start, end;
|
||||
CalculateUTF8Size calculator;
|
||||
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
|
||||
calculator);
|
||||
|
||||
char* result;
|
||||
if (temp.GetOwnsBuffer()) {
|
||||
// We allocated. Trick the string into not freeing its buffer to
|
||||
// avoid an extra allocation.
|
||||
result = temp.mStr;
|
||||
char *result = NS_STATIC_CAST(char*,
|
||||
nsMemory::Alloc(calculator.Size() + 1));
|
||||
|
||||
temp.mStr=0;
|
||||
temp.SetOwnsBuffer(PR_FALSE);
|
||||
}
|
||||
else {
|
||||
// We didn't allocate a buffer, so we need to copy it out of the
|
||||
// nsCAutoString's storage.
|
||||
result = ToNewCString(temp);
|
||||
}
|
||||
ConvertUCS2toUTF8 converter(result);
|
||||
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
|
||||
converter).write_terminator();
|
||||
NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch");
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -285,7 +279,7 @@ UTF8ToNewUnicode( const nsACString& aSource )
|
||||
|
||||
ConvertUTF8toUCS2 converter(result);
|
||||
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
|
||||
converter);
|
||||
converter).write_terminator();
|
||||
NS_ASSERTION(calculator.Length() == converter.Length(), "length mismatch");
|
||||
|
||||
return result;
|
||||
|
Loading…
x
Reference in New Issue
Block a user