diff --git a/string/obsolete/nsString.cpp b/string/obsolete/nsString.cpp index 5bda2b7a827b..d12c7549d792 100644 --- a/string/obsolete/nsString.cpp +++ b/string/obsolete/nsString.cpp @@ -45,6 +45,7 @@ #include "nsString.h" #include "nsReadableUtils.h" #include "nsDebug.h" +#include "nsUTF8Utils.h" #ifndef nsCharTraits_h___ #include "nsCharTraits.h" @@ -1086,111 +1087,86 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P //---------------------------------------------------------------------- -NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString ) +NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString ) { - nsAString::const_iterator start; aString.BeginReading(start); - nsAString::const_iterator end; aString.EndReading(end); - - while (start != end) { - nsReadableFragment frag(start.fragment()); - Append(frag.mStart, frag.mEnd - frag.mStart); - start.advance(start.size_forward()); - } + if (!aString) + // Leave us as an uninitialized nsCAutoString. + return; + Init(aString, nsCharTraits::length(aString)); } -void -NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength ) +NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength ) { - // Handle null string by just leaving us as a brand-new - // uninitialized nsCAutoString. - if (! aString) + if (!aString) + // Leave us as an uninitialized nsCAutoString. 
return; + Init(aString, aLength); + } - // Calculate how many bytes we need - const PRUnichar* p; - PRInt32 count, utf8len; - for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++) +NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString ) + { + nsASingleFragmentString::const_char_iterator start; + Init(aString.BeginReading(start), aString.Length()); + } + +NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString ) + { + // Compute space required: do this once so we don't incur multiple + // allocations. This "optimization" is probably of dubious value... + + nsAString::const_iterator start, end; + CalculateUTF8Size calculator; + copy_string(aString.BeginReading(start), aString.EndReading(end), + calculator); + + PRUint32 count = calculator.Size(); + + if (count) { - if (! ((*p) & 0xFF80)) - utf8len += 1; // 0000 0000 - 0000 007F - else if (! ((*p) & 0xF800)) - utf8len += 2; // 0000 0080 - 0000 07FF - else - utf8len += 3; // 0000 0800 - 0000 FFFF - // Note: Surrogate pair needs 4 bytes, but in this calcuation - // we count it as 6 bytes. It will waste 2 bytes per surrogate pair - } + // Grow the buffer if we need to. + SetCapacity(count); - // Make sure our buffer's big enough, so we don't need to do - // multiple allocations. - if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer)) - SetCapacity(mLength+utf8len+1); - // |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h), - // we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want + // All ready? 
Time to convert - char* out = mStr+mLength; - PRUint32 ucs4=0; - - for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++) - { - if (0 == ucs4) + ConvertUCS2toUTF8 converter(mStr); + copy_string(aString.BeginReading(start), aString.EndReading(end), + converter).write_terminator(); + mLength = converter.Size(); + if (mLength != count) { - if (! ((*p) & 0xFF80)) - { - *out++ = (char)*p; - } - else if (! ((*p) & 0xF800)) - { - *out++ = 0xC0 | (char)((*p) >> 6); - *out++ = 0x80 | (char)(0x003F & (*p)); - } - else - { - if (0xD800 == (0xFC00 & (*p))) - { - // D800- DBFF - High Surrogate - // N = (H- D800) *400 + 10000 + ... - ucs4 = 0x10000 | ((0x03FF & (*p)) << 10); - } - else if (0xDC00 == (0xFC00 & (*p))) - { - // DC00- DFFF - Low Surrogate - // error here. We should hit High Surrogate first - // Do not output any thing in this case - } - else - { - *out++ = 0xE0 | (char)((*p) >> 12); - *out++ = 0x80 | (char)(0x003F & (*p >> 6)); - *out++ = 0x80 | (char)(0x003F & (*p) ); - } - } - } - else - { - if (0xDC00 == (0xFC00 & (*p))) - { - // DC00- DFFF - Low Surrogate - // N += ( L - DC00 ) - ucs4 |= (0x03FF & (*p)); - - // 0001 0000-001F FFFF - *out++ = 0xF0 | (char)(ucs4 >> 18); - *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12)); - *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6)); - *out++ = 0x80 | (char)(0x003F & ucs4) ; - } - else - { - // Got a High Surrogate but no low surrogate - // output nothing. - } - ucs4 = 0; + NS_ERROR("Input invalid or incorrect length was calculated"); + Truncate(); } } + } - *out = '\0'; // null terminate - mLength += utf8len; +void NS_ConvertUCS2toUTF8::Init( const PRUnichar* aString, PRUint32 aLength ) + { + // Compute space required: do this once so we don't incur multiple + // allocations. This "optimization" is probably of dubious value... + + CalculateUTF8Size calculator; + calculator.write(aString, aLength); + + PRUint32 count = calculator.Size(); + + if (count) + { + // Grow the buffer if we need to. 
+ SetCapacity(count); + + // All ready? Time to convert + + ConvertUCS2toUTF8 converter(mStr); + converter.write(aString, aLength); + mLength = converter.Size(); + mStr[mLength] = char_type(0); + if (mLength != count) + { + NS_ERROR("Input invalid or incorrect length was calculated"); + Truncate(); + } + } } NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString ) diff --git a/string/obsolete/nsString.h b/string/obsolete/nsString.h index 5ce0fe71a1e6..9d5636bd0582 100644 --- a/string/obsolete/nsString.h +++ b/string/obsolete/nsString.h @@ -431,24 +431,13 @@ class NS_COM NS_ConvertUCS2toUTF8 */ { public: - friend NS_COM char* ToNewUTF8String( const nsAString& aSource ); - - public: - explicit - NS_ConvertUCS2toUTF8( const PRUnichar* aString ) - { - Append( aString, ~PRUint32(0) /* MAXINT */); - } - - NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength ) - { - Append( aString, aLength ); - } - + explicit NS_ConvertUCS2toUTF8( const PRUnichar* aString ); + NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength ); explicit NS_ConvertUCS2toUTF8( const nsAString& aString ); + explicit NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString ); protected: - void Append( const PRUnichar* aString, PRUint32 aLength ); + void Init( const PRUnichar* aString, PRUint32 aLength ); private: // NOT TO BE IMPLEMENTED diff --git a/string/obsolete/nsString2.cpp b/string/obsolete/nsString2.cpp index ef304455a1e4..239c3e70c60d 100644 --- a/string/obsolete/nsString2.cpp +++ b/string/obsolete/nsString2.cpp @@ -1351,34 +1351,82 @@ NS_ConvertASCIItoUCS2::NS_ConvertASCIItoUCS2( const nsACString& aCString ) } } -void -NS_ConvertUTF8toUCS2::Init( const nsACString& aCString ) -{ - // Compute space required: do this once so we don't incur multiple - // allocations. This "optimization" is probably of dubious value... 
+NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsACString& aCString ) + { + // Compute space required: do this once so we don't incur multiple + // allocations. This "optimization" is probably of dubious value... - nsACString::const_iterator start, end; - CalculateUTF8Length calculator; - copy_string(aCString.BeginReading(start), aCString.EndReading(end), calculator); + nsACString::const_iterator start, end; + CalculateUTF8Length calculator; + copy_string(aCString.BeginReading(start), aCString.EndReading(end), + calculator); - PRUint32 count = calculator.Length(); + PRUint32 count = calculator.Length(); - if (count) { - // Grow the buffer if we need to. - SetLength(count); + if (count) + { + // Grow the buffer if we need to. + SetCapacity(count); - // All ready? Time to convert + // All ready? Time to convert - ConvertUTF8toUCS2 converter(mUStr); - copy_string(aCString.BeginReading(start), aCString.EndReading(end), converter); - mLength = converter.Length(); - if (mLength != count) { - NS_ERROR("Input wasn't UTF8 or incorrect length was calculated"); - Truncate(); - } + ConvertUTF8toUCS2 converter(mUStr); + copy_string(aCString.BeginReading(start), aCString.EndReading(end), + converter).write_terminator(); + mLength = converter.Length(); + if (mLength != count) + { + NS_ERROR("Input wasn't UTF8 or incorrect length was calculated"); + Truncate(); + } + } } -} +NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString ) + { + nsASingleFragmentCString::const_char_iterator start; + Init(aCString.BeginReading(start), aCString.Length()); + } + +NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString ) + { + Init(aCString, nsCharTraits::length(aCString)); + } + +NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength ) + { + Init(aCString, aLength); + } + +void +NS_ConvertUTF8toUCS2::Init( const char* aCString, PRUint32 aLength ) + { + // Compute space required: do this once so we don't incur multiple + // 
allocations. This "optimization" is probably of dubious value... + + CalculateUTF8Length calculator; + calculator.write(aCString, aLength); + + PRUint32 count = calculator.Length(); + + if (count) + { + // Grow the buffer if we need to. + SetCapacity(count); + + // All ready? Time to convert + + ConvertUTF8toUCS2 converter(mUStr); + converter.write(aCString, aLength); + mLength = converter.Length(); + mUStr[mLength] = char_type(0); + if (mLength != count) + { + NS_ERROR("Input invalid or incorrect length was calculated"); + Truncate(); + } + } + } /** * Default copy constructor diff --git a/string/obsolete/nsString2.h b/string/obsolete/nsString2.h index 65de493a9655..8d96c43800e4 100644 --- a/string/obsolete/nsString2.h +++ b/string/obsolete/nsString2.h @@ -530,25 +530,13 @@ class NS_COM NS_ConvertUTF8toUCS2 : public nsAutoString { public: - explicit - NS_ConvertUTF8toUCS2( const nsACString& aCString ) - { - Init( aCString ); - } - - explicit - NS_ConvertUTF8toUCS2( const char* aCString ) - { - Init( nsDependentCString( aCString ) ); - } - - NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength ) - { - Init( Substring( aCString, aCString + aLength ) ); - } + explicit NS_ConvertUTF8toUCS2( const nsACString& aCString ); + explicit NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString ); + explicit NS_ConvertUTF8toUCS2( const char* aCString ); + NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength ); protected: - void Init( const nsACString& aCString ); + void Init( const char* aCString, PRUint32 aLength ); private: NS_ConvertUTF8toUCS2( PRUnichar ); diff --git a/string/public/nsUTF8Utils.h b/string/public/nsUTF8Utils.h index e31478cb287d..435b45aeea93 100644 --- a/string/public/nsUTF8Utils.h +++ b/string/public/nsUTF8Utils.h @@ -54,6 +54,12 @@ class UTF8traits #define PLANE1_BASE 0x00010000 #define UCS2_REPLACEMENT_CHAR 0xfffd +#ifdef __GNUC__ +#define NS_ALWAYS_INLINE __attribute__((always_inline)) +#else +#define NS_ALWAYS_INLINE +#endif + 
/** * A character sink (see |copy_string| in nsAlgorithm.h) for converting * UTF-8 to UCS2 (really UTF-16). @@ -69,7 +75,7 @@ class ConvertUTF8toUCS2 size_t Length() const { return mBuffer - mStart; } - PRUint32 write( const value_type* start, PRUint32 N ) + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) { if ( mErrorEncountered ) return N; @@ -78,13 +84,14 @@ class ConvertUTF8toUCS2 // be spread across fragments const value_type* p = start; const value_type* end = start + N; + buffer_type* out = mBuffer; for ( ; p != end /* && *p */; ) { char c = *p++; if ( UTF8traits::isASCII(c) ) { - *mBuffer++ = buffer_type(c); + *out++ = buffer_type(c); continue; } @@ -126,6 +133,7 @@ class ConvertUTF8toUCS2 { NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); mErrorEncountered = PR_TRUE; + mBuffer = out; return N; } @@ -142,6 +150,7 @@ class ConvertUTF8toUCS2 { NS_ERROR("not a UTF8 string"); mErrorEncountered = PR_TRUE; + mBuffer = out; return N; } } @@ -149,39 +158,40 @@ class ConvertUTF8toUCS2 if ( ucs4 < minUcs4 ) { // Overlong sequence - *mBuffer++ = UCS2_REPLACEMENT_CHAR; + *out++ = UCS2_REPLACEMENT_CHAR; } else if ( ucs4 <= 0xD7FF ) { - *mBuffer++ = ucs4; + *out++ = ucs4; } else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF ) { // Surrogates - *mBuffer++ = UCS2_REPLACEMENT_CHAR; + *out++ = UCS2_REPLACEMENT_CHAR; } else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF ) { // Prohibited characters - *mBuffer++ = UCS2_REPLACEMENT_CHAR; + *out++ = UCS2_REPLACEMENT_CHAR; } else if ( ucs4 >= PLANE1_BASE ) { if ( ucs4 >= 0x00110000 ) - *mBuffer++ = UCS2_REPLACEMENT_CHAR; + *out++ = UCS2_REPLACEMENT_CHAR; else { // surrogate, see unicode specification 3.7 for following math. 
ucs4 -= PLANE1_BASE; - *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u; - *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u; + *out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u; + *out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u; } } else { if ( ucs4 != 0xFEFF ) // ignore BOM - *mBuffer++ = ucs4; + *out++ = ucs4; } } + mBuffer = out; return p - start; } @@ -191,7 +201,7 @@ class ConvertUTF8toUCS2 } private: - buffer_type* mStart; + buffer_type* const mStart; buffer_type* mBuffer; PRBool mErrorEncountered; }; @@ -209,7 +219,7 @@ class CalculateUTF8Length size_t Length() const { return mLength; } - PRUint32 write( const value_type* start, PRUint32 N ) + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) { // ignore any further requests if ( mErrorEncountered ) @@ -255,4 +265,152 @@ class CalculateUTF8Length PRBool mErrorEncountered; }; +/** + * A character sink (see |copy_string| in nsAlgorithm.h) for converting + * UCS2 (really UTF-16) to UTF-8. + */ +class ConvertUCS2toUTF8 + { + public: + typedef nsAString::char_type value_type; + typedef nsACString::char_type buffer_type; + + // The error handling here is more lenient than that in + // |ConvertUTF8toUCS2|, but it's that way for backwards + // compatibility. + + ConvertUCS2toUTF8( buffer_type* aBuffer ) + : mStart(aBuffer), mBuffer(aBuffer) {} + + size_t Size() const { return mBuffer - mStart; } + + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) + { + buffer_type *out = mBuffer; // gcc isn't smart enough to do this! + + for (const value_type *p = start, *end = start + N; p < end; ++p ) + { + value_type c = *p; + if (! (c & 0xFF80)) // U+0000 - U+007F + { + *out++ = (char)c; + } + else if (! 
(c & 0xF800)) // U+0080 - U+07FF + { + *out++ = 0xC0 | (char)(c >> 6); + *out++ = 0x80 | (char)(0x003F & c); + } + else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF + { + *out++ = 0xE0 | (char)(c >> 12); + *out++ = 0x80 | (char)(0x003F & (c >> 6)); + *out++ = 0x80 | (char)(0x003F & c ); + } + else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + { + // D800- DBFF - High Surrogate + // N = (H- D800) *400 + 10000 + ... + PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10); + + ++p; + if (p == end) + { + NS_ERROR("Surrogate pair split between fragments"); + mBuffer = out; + return N; + } + c = *p; + + if (0xDC00 == (0xFC00 & c)) + { + // DC00- DFFF - Low Surrogate + // N += ( L - DC00 ) + ucs4 |= (0x03FF & c); + + // 0001 0000-001F FFFF + *out++ = 0xF0 | (char)(ucs4 >> 18); + *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12)); + *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6)); + *out++ = 0x80 | (char)(0x003F & ucs4); + } + else + { + NS_ERROR("got a High Surrogate but no low surrogate"); + // output nothing. + } + } + else // U+DC00 - U+DFFF + { + // DC00- DFFF - Low Surrogate + NS_ERROR("got a low Surrogate but no high surrogate"); + // output nothing. + } + } + + mBuffer = out; + return N; + } + + void write_terminator() + { + *mBuffer = buffer_type(0); + } + + private: + buffer_type* const mStart; + buffer_type* mBuffer; + }; + +/** + * A character sink (see |copy_string| in nsAlgorithm.h) for computing + * the number of bytes a UCS2 (really UTF-16) string would occupy in UTF-8. + */ +class CalculateUTF8Size + { + public: + typedef nsAString::char_type value_type; + + CalculateUTF8Size() + : mSize(0) { } + + size_t Size() const { return mSize; } + + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) + { + // Assume UCS2 surrogate pairs won't be spread across fragments. + for (const value_type *p = start, *end = start + N; p < end; ++p ) + { + value_type c = *p; + if (! (c & 0xFF80)) // U+0000 - U+007F + mSize += 1; + else if (! 
(c & 0xF800)) // U+0080 - U+07FF + mSize += 2; + else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF + mSize += 3; + else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + { + ++p; + if (p == end) + { + NS_ERROR("Surrogate pair split between fragments"); + return N; + } + c = *p; + + if (0xDC00 == (0xFC00 & c)) + mSize += 4; + else + NS_ERROR("got a high Surrogate but no low surrogate"); + } + else // U+DC00 - U+DFFF + NS_ERROR("got a low Surrogate but no high surrogate"); + } + + return N; + } + + private: + size_t mSize; + }; + #endif /* !defined(nsUTF8Utils_h_) */ diff --git a/string/src/nsReadableUtils.cpp b/string/src/nsReadableUtils.cpp index 873e1bcfc50b..3ad5e821f5eb 100755 --- a/string/src/nsReadableUtils.cpp +++ b/string/src/nsReadableUtils.cpp @@ -245,24 +245,18 @@ NS_COM char* ToNewUTF8String( const nsAString& aSource ) { - // XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be - // refactored so that we can use it here without a double-copy. - NS_ConvertUCS2toUTF8 temp(aSource); + nsAString::const_iterator start, end; + CalculateUTF8Size calculator; + copy_string(aSource.BeginReading(start), aSource.EndReading(end), + calculator); - char* result; - if (temp.GetOwnsBuffer()) { - // We allocated. Trick the string into not freeing its buffer to - // avoid an extra allocation. - result = temp.mStr; + char *result = NS_STATIC_CAST(char*, + nsMemory::Alloc(calculator.Size() + 1)); - temp.mStr=0; - temp.SetOwnsBuffer(PR_FALSE); - } - else { - // We didn't allocate a buffer, so we need to copy it out of the - // nsCAutoString's storage. 
- result = ToNewCString(temp); - } + ConvertUCS2toUTF8 converter(result); + copy_string(aSource.BeginReading(start), aSource.EndReading(end), + converter).write_terminator(); + NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch"); return result; } diff --git a/xpcom/string/obsolete/nsString.cpp b/xpcom/string/obsolete/nsString.cpp index 5bda2b7a827b..d12c7549d792 100644 --- a/xpcom/string/obsolete/nsString.cpp +++ b/xpcom/string/obsolete/nsString.cpp @@ -45,6 +45,7 @@ #include "nsString.h" #include "nsReadableUtils.h" #include "nsDebug.h" +#include "nsUTF8Utils.h" #ifndef nsCharTraits_h___ #include "nsCharTraits.h" @@ -1086,111 +1087,86 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P //---------------------------------------------------------------------- -NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString ) +NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString ) { - nsAString::const_iterator start; aString.BeginReading(start); - nsAString::const_iterator end; aString.EndReading(end); - - while (start != end) { - nsReadableFragment frag(start.fragment()); - Append(frag.mStart, frag.mEnd - frag.mStart); - start.advance(start.size_forward()); - } + if (!aString) + // Leave us as an uninitialized nsCAutoString. + return; + Init(aString, nsCharTraits::length(aString)); } -void -NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength ) +NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength ) { - // Handle null string by just leaving us as a brand-new - // uninitialized nsCAutoString. - if (! aString) + if (!aString) + // Leave us as an uninitialized nsCAutoString. 
return; + Init(aString, aLength); + } - // Calculate how many bytes we need - const PRUnichar* p; - PRInt32 count, utf8len; - for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++) +NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString ) + { + nsASingleFragmentString::const_char_iterator start; + Init(aString.BeginReading(start), aString.Length()); + } + +NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString ) + { + // Compute space required: do this once so we don't incur multiple + // allocations. This "optimization" is probably of dubious value... + + nsAString::const_iterator start, end; + CalculateUTF8Size calculator; + copy_string(aString.BeginReading(start), aString.EndReading(end), + calculator); + + PRUint32 count = calculator.Size(); + + if (count) { - if (! ((*p) & 0xFF80)) - utf8len += 1; // 0000 0000 - 0000 007F - else if (! ((*p) & 0xF800)) - utf8len += 2; // 0000 0080 - 0000 07FF - else - utf8len += 3; // 0000 0800 - 0000 FFFF - // Note: Surrogate pair needs 4 bytes, but in this calcuation - // we count it as 6 bytes. It will waste 2 bytes per surrogate pair - } + // Grow the buffer if we need to. + SetCapacity(count); - // Make sure our buffer's big enough, so we don't need to do - // multiple allocations. - if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer)) - SetCapacity(mLength+utf8len+1); - // |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h), - // we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want + // All ready? 
Time to convert - char* out = mStr+mLength; - PRUint32 ucs4=0; - - for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++) - { - if (0 == ucs4) + ConvertUCS2toUTF8 converter(mStr); + copy_string(aString.BeginReading(start), aString.EndReading(end), + converter).write_terminator(); + mLength = converter.Size(); + if (mLength != count) { - if (! ((*p) & 0xFF80)) - { - *out++ = (char)*p; - } - else if (! ((*p) & 0xF800)) - { - *out++ = 0xC0 | (char)((*p) >> 6); - *out++ = 0x80 | (char)(0x003F & (*p)); - } - else - { - if (0xD800 == (0xFC00 & (*p))) - { - // D800- DBFF - High Surrogate - // N = (H- D800) *400 + 10000 + ... - ucs4 = 0x10000 | ((0x03FF & (*p)) << 10); - } - else if (0xDC00 == (0xFC00 & (*p))) - { - // DC00- DFFF - Low Surrogate - // error here. We should hit High Surrogate first - // Do not output any thing in this case - } - else - { - *out++ = 0xE0 | (char)((*p) >> 12); - *out++ = 0x80 | (char)(0x003F & (*p >> 6)); - *out++ = 0x80 | (char)(0x003F & (*p) ); - } - } - } - else - { - if (0xDC00 == (0xFC00 & (*p))) - { - // DC00- DFFF - Low Surrogate - // N += ( L - DC00 ) - ucs4 |= (0x03FF & (*p)); - - // 0001 0000-001F FFFF - *out++ = 0xF0 | (char)(ucs4 >> 18); - *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12)); - *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6)); - *out++ = 0x80 | (char)(0x003F & ucs4) ; - } - else - { - // Got a High Surrogate but no low surrogate - // output nothing. - } - ucs4 = 0; + NS_ERROR("Input invalid or incorrect length was calculated"); + Truncate(); } } + } - *out = '\0'; // null terminate - mLength += utf8len; +void NS_ConvertUCS2toUTF8::Init( const PRUnichar* aString, PRUint32 aLength ) + { + // Compute space required: do this once so we don't incur multiple + // allocations. This "optimization" is probably of dubious value... + + CalculateUTF8Size calculator; + calculator.write(aString, aLength); + + PRUint32 count = calculator.Size(); + + if (count) + { + // Grow the buffer if we need to. 
+ SetCapacity(count); + + // All ready? Time to convert + + ConvertUCS2toUTF8 converter(mStr); + converter.write(aString, aLength); + mLength = converter.Size(); + mStr[mLength] = char_type(0); + if (mLength != count) + { + NS_ERROR("Input invalid or incorrect length was calculated"); + Truncate(); + } + } } NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString ) diff --git a/xpcom/string/obsolete/nsString.h b/xpcom/string/obsolete/nsString.h index 5ce0fe71a1e6..9d5636bd0582 100644 --- a/xpcom/string/obsolete/nsString.h +++ b/xpcom/string/obsolete/nsString.h @@ -431,24 +431,13 @@ class NS_COM NS_ConvertUCS2toUTF8 */ { public: - friend NS_COM char* ToNewUTF8String( const nsAString& aSource ); - - public: - explicit - NS_ConvertUCS2toUTF8( const PRUnichar* aString ) - { - Append( aString, ~PRUint32(0) /* MAXINT */); - } - - NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength ) - { - Append( aString, aLength ); - } - + explicit NS_ConvertUCS2toUTF8( const PRUnichar* aString ); + NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength ); explicit NS_ConvertUCS2toUTF8( const nsAString& aString ); + explicit NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString ); protected: - void Append( const PRUnichar* aString, PRUint32 aLength ); + void Init( const PRUnichar* aString, PRUint32 aLength ); private: // NOT TO BE IMPLEMENTED diff --git a/xpcom/string/obsolete/nsString2.cpp b/xpcom/string/obsolete/nsString2.cpp index ef304455a1e4..239c3e70c60d 100644 --- a/xpcom/string/obsolete/nsString2.cpp +++ b/xpcom/string/obsolete/nsString2.cpp @@ -1351,34 +1351,82 @@ NS_ConvertASCIItoUCS2::NS_ConvertASCIItoUCS2( const nsACString& aCString ) } } -void -NS_ConvertUTF8toUCS2::Init( const nsACString& aCString ) -{ - // Compute space required: do this once so we don't incur multiple - // allocations. This "optimization" is probably of dubious value... 
+NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsACString& aCString ) + { + // Compute space required: do this once so we don't incur multiple + // allocations. This "optimization" is probably of dubious value... - nsACString::const_iterator start, end; - CalculateUTF8Length calculator; - copy_string(aCString.BeginReading(start), aCString.EndReading(end), calculator); + nsACString::const_iterator start, end; + CalculateUTF8Length calculator; + copy_string(aCString.BeginReading(start), aCString.EndReading(end), + calculator); - PRUint32 count = calculator.Length(); + PRUint32 count = calculator.Length(); - if (count) { - // Grow the buffer if we need to. - SetLength(count); + if (count) + { + // Grow the buffer if we need to. + SetCapacity(count); - // All ready? Time to convert + // All ready? Time to convert - ConvertUTF8toUCS2 converter(mUStr); - copy_string(aCString.BeginReading(start), aCString.EndReading(end), converter); - mLength = converter.Length(); - if (mLength != count) { - NS_ERROR("Input wasn't UTF8 or incorrect length was calculated"); - Truncate(); - } + ConvertUTF8toUCS2 converter(mUStr); + copy_string(aCString.BeginReading(start), aCString.EndReading(end), + converter).write_terminator(); + mLength = converter.Length(); + if (mLength != count) + { + NS_ERROR("Input wasn't UTF8 or incorrect length was calculated"); + Truncate(); + } + } } -} +NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString ) + { + nsASingleFragmentCString::const_char_iterator start; + Init(aCString.BeginReading(start), aCString.Length()); + } + +NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString ) + { + Init(aCString, nsCharTraits::length(aCString)); + } + +NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength ) + { + Init(aCString, aLength); + } + +void +NS_ConvertUTF8toUCS2::Init( const char* aCString, PRUint32 aLength ) + { + // Compute space required: do this once so we don't incur multiple + // 
allocations. This "optimization" is probably of dubious value... + + CalculateUTF8Length calculator; + calculator.write(aCString, aLength); + + PRUint32 count = calculator.Length(); + + if (count) + { + // Grow the buffer if we need to. + SetCapacity(count); + + // All ready? Time to convert + + ConvertUTF8toUCS2 converter(mUStr); + converter.write(aCString, aLength); + mLength = converter.Length(); + mUStr[mLength] = char_type(0); + if (mLength != count) + { + NS_ERROR("Input invalid or incorrect length was calculated"); + Truncate(); + } + } + } /** * Default copy constructor diff --git a/xpcom/string/obsolete/nsString2.h b/xpcom/string/obsolete/nsString2.h index 65de493a9655..8d96c43800e4 100644 --- a/xpcom/string/obsolete/nsString2.h +++ b/xpcom/string/obsolete/nsString2.h @@ -530,25 +530,13 @@ class NS_COM NS_ConvertUTF8toUCS2 : public nsAutoString { public: - explicit - NS_ConvertUTF8toUCS2( const nsACString& aCString ) - { - Init( aCString ); - } - - explicit - NS_ConvertUTF8toUCS2( const char* aCString ) - { - Init( nsDependentCString( aCString ) ); - } - - NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength ) - { - Init( Substring( aCString, aCString + aLength ) ); - } + explicit NS_ConvertUTF8toUCS2( const nsACString& aCString ); + explicit NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString ); + explicit NS_ConvertUTF8toUCS2( const char* aCString ); + NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength ); protected: - void Init( const nsACString& aCString ); + void Init( const char* aCString, PRUint32 aLength ); private: NS_ConvertUTF8toUCS2( PRUnichar ); diff --git a/xpcom/string/public/nsUTF8Utils.h b/xpcom/string/public/nsUTF8Utils.h index e31478cb287d..435b45aeea93 100644 --- a/xpcom/string/public/nsUTF8Utils.h +++ b/xpcom/string/public/nsUTF8Utils.h @@ -54,6 +54,12 @@ class UTF8traits #define PLANE1_BASE 0x00010000 #define UCS2_REPLACEMENT_CHAR 0xfffd +#ifdef __GNUC__ +#define NS_ALWAYS_INLINE 
__attribute__((always_inline)) +#else +#define NS_ALWAYS_INLINE +#endif + /** * A character sink (see |copy_string| in nsAlgorithm.h) for converting * UTF-8 to UCS2 (really UTF-16). @@ -69,7 +75,7 @@ class ConvertUTF8toUCS2 size_t Length() const { return mBuffer - mStart; } - PRUint32 write( const value_type* start, PRUint32 N ) + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) { if ( mErrorEncountered ) return N; @@ -78,13 +84,14 @@ class ConvertUTF8toUCS2 // be spread across fragments const value_type* p = start; const value_type* end = start + N; + buffer_type* out = mBuffer; for ( ; p != end /* && *p */; ) { char c = *p++; if ( UTF8traits::isASCII(c) ) { - *mBuffer++ = buffer_type(c); + *out++ = buffer_type(c); continue; } @@ -126,6 +133,7 @@ class ConvertUTF8toUCS2 { NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); mErrorEncountered = PR_TRUE; + mBuffer = out; return N; } @@ -142,6 +150,7 @@ class ConvertUTF8toUCS2 { NS_ERROR("not a UTF8 string"); mErrorEncountered = PR_TRUE; + mBuffer = out; return N; } } @@ -149,39 +158,40 @@ class ConvertUTF8toUCS2 if ( ucs4 < minUcs4 ) { // Overlong sequence - *mBuffer++ = UCS2_REPLACEMENT_CHAR; + *out++ = UCS2_REPLACEMENT_CHAR; } else if ( ucs4 <= 0xD7FF ) { - *mBuffer++ = ucs4; + *out++ = ucs4; } else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF ) { // Surrogates - *mBuffer++ = UCS2_REPLACEMENT_CHAR; + *out++ = UCS2_REPLACEMENT_CHAR; } else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF ) { // Prohibited characters - *mBuffer++ = UCS2_REPLACEMENT_CHAR; + *out++ = UCS2_REPLACEMENT_CHAR; } else if ( ucs4 >= PLANE1_BASE ) { if ( ucs4 >= 0x00110000 ) - *mBuffer++ = UCS2_REPLACEMENT_CHAR; + *out++ = UCS2_REPLACEMENT_CHAR; else { // surrogate, see unicode specification 3.7 for following math. 
ucs4 -= PLANE1_BASE; - *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u; - *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u; + *out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u; + *out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u; } } else { if ( ucs4 != 0xFEFF ) // ignore BOM - *mBuffer++ = ucs4; + *out++ = ucs4; } } + mBuffer = out; return p - start; } @@ -191,7 +201,7 @@ class ConvertUTF8toUCS2 } private: - buffer_type* mStart; + buffer_type* const mStart; buffer_type* mBuffer; PRBool mErrorEncountered; }; @@ -209,7 +219,7 @@ class CalculateUTF8Length size_t Length() const { return mLength; } - PRUint32 write( const value_type* start, PRUint32 N ) + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) { // ignore any further requests if ( mErrorEncountered ) @@ -255,4 +265,152 @@ class CalculateUTF8Length PRBool mErrorEncountered; }; +/** + * A character sink (see |copy_string| in nsAlgorithm.h) for converting + * UCS2 (really UTF-16) to UTF-8. + */ +class ConvertUCS2toUTF8 + { + public: + typedef nsAString::char_type value_type; + typedef nsACString::char_type buffer_type; + + // The error handling here is more lenient than that in + // |ConvertUTF8toUCS2|, but it's that way for backwards + // compatibility: malformed surrogate sequences are flagged with NS_ERROR and emit no bytes. + + ConvertUCS2toUTF8( buffer_type* aBuffer ) + : mStart(aBuffer), mBuffer(aBuffer) {} + + size_t Size() const { return mBuffer - mStart; } + + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) + { + buffer_type *out = mBuffer; // local write cursor, stored back to mBuffer before returning; gcc isn't smart enough to do this! + + for (const value_type *p = start, *end = start + N; p < end; ++p ) + { + value_type c = *p; + if (! (c & 0xFF80)) // U+0000 - U+007F + { + *out++ = (char)c; + } + else if (! 
(c & 0xF800)) // U+0080 - U+07FF + { + *out++ = 0xC0 | (char)(c >> 6); + *out++ = 0x80 | (char)(0x003F & c); + } + else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF + { + *out++ = 0xE0 | (char)(c >> 12); + *out++ = 0x80 | (char)(0x003F & (c >> 6)); + *out++ = 0x80 | (char)(0x003F & c ); + } + else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + { + // D800- DBFF - High Surrogate + // N = (H- D800) *400 + 10000 + ... + PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10); + + ++p; + if (p == end) + { + NS_ERROR("Surrogate pair split between fragments"); + mBuffer = out; + return N; + } + c = *p; + + if (0xDC00 == (0xFC00 & c)) + { + // DC00- DFFF - Low Surrogate + // N += ( L - DC00 ) + ucs4 |= (0x03FF & c); + + // 0001 0000-001F FFFF + *out++ = 0xF0 | (char)(ucs4 >> 18); + *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12)); + *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6)); + *out++ = 0x80 | (char)(0x003F & ucs4); + } + else + { + NS_ERROR("got a High Surrogate but no low surrogate"); + // output nothing; the code unit just read is dropped as well. + } + } + else // U+DC00 - U+DFFF + { + // DC00- DFFF - Low Surrogate + NS_ERROR("got a low Surrogate but no high surrogate"); + // output nothing. + } + } + + mBuffer = out; + return N; + } + + void write_terminator() + { + *mBuffer = buffer_type(0); + } + + private: + buffer_type* const mStart; + buffer_type* mBuffer; + }; + +/** + * A character sink (see |copy_string| in nsAlgorithm.h) for computing + * the number of bytes a UCS2 (really UTF-16) string would occupy in UTF-8. + */ +class CalculateUTF8Size + { + public: + typedef nsAString::char_type value_type; + + CalculateUTF8Size() + : mSize(0) { } + + size_t Size() const { return mSize; } + + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) + { + // Assume UCS2 surrogate pairs won't be spread across fragments. + for (const value_type *p = start, *end = start + N; p < end; ++p ) + { + value_type c = *p; + if (! (c & 0xFF80)) // U+0000 - U+007F + mSize += 1; + else if (! 
(c & 0xF800)) // U+0080 - U+07FF + mSize += 2; + else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF + mSize += 3; + else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + { + ++p; + if (p == end) + { + NS_ERROR("Surrogate pair split between fragments"); + return N; + } + c = *p; + + if (0xDC00 == (0xFC00 & c)) + mSize += 4; + else + NS_ERROR("got a high Surrogate but no low surrogate"); + } + else // U+DC00 - U+DFFF + NS_ERROR("got a low Surrogate but no high surrogate"); + } + + return N; + } + + private: + size_t mSize; + }; + #endif /* !defined(nsUTF8Utils_h_) */ diff --git a/xpcom/string/src/nsReadableUtils.cpp b/xpcom/string/src/nsReadableUtils.cpp index 873e1bcfc50b..3ad5e821f5eb 100755 --- a/xpcom/string/src/nsReadableUtils.cpp +++ b/xpcom/string/src/nsReadableUtils.cpp @@ -245,24 +245,18 @@ NS_COM char* ToNewUTF8String( const nsAString& aSource ) { - // XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be - // refactored so that we can use it here without a double-copy. - NS_ConvertUCS2toUTF8 temp(aSource); + nsAString::const_iterator start, end; + CalculateUTF8Size calculator; + copy_string(aSource.BeginReading(start), aSource.EndReading(end), + calculator); - char* result; - if (temp.GetOwnsBuffer()) { - // We allocated. Trick the string into not freeing its buffer to - // avoid an extra allocation. - result = temp.mStr; + char *result = NS_STATIC_CAST(char*, + nsMemory::Alloc(calculator.Size() + 1)); /* +1 leaves room for the byte written by write_terminator(); NOTE(review): result is written through below without a null check - confirm allocation failure is handled elsewhere */ - temp.mStr=0; - temp.SetOwnsBuffer(PR_FALSE); - } - else { - // We didn't allocate a buffer, so we need to copy it out of the - // nsCAutoString's storage. - result = ToNewCString(temp); - } + ConvertUCS2toUTF8 converter(result); + copy_string(aSource.BeginReading(start), aSource.EndReading(end), + converter).write_terminator(); + NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch"); return result; }