Move the core of NS_ConvertUCS2toUTF8 into character sinks in nsUTF8Utils.h, and use them to make ToNewUTF8String faster. Fix bug in surrogate handling in the moved code. Make various tweaks to improve performance of conversion between UCS2 and UTF-8 (both ways). b=206682 r=jag sr=jst

2024-11-26 06:11:37 +00:00 · 2003-06-11 04:27:13 +00:00 · 2003-06-11 04:27:13 +00:00 · 18b8c334fb
commit 18b8c334fb
parent f88fdaee47
12 changed files with 652 additions and 346 deletions
--- a/string/obsolete/nsString.cpp
+++ b/string/obsolete/nsString.cpp
@ -45,6 +45,7 @@
 #include "nsString.h"
 #include "nsReadableUtils.h"
 #include "nsDebug.h"
+#include "nsUTF8Utils.h"

 #ifndef nsCharTraits_h___
 #include "nsCharTraits.h"
@ -1086,111 +1087,86 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P

 //----------------------------------------------------------------------

-NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString )
  {
-    nsAString::const_iterator start; aString.BeginReading(start);
-    nsAString::const_iterator end;   aString.EndReading(end);
-    
-    while (start != end) {
-      nsReadableFragment<PRUnichar> frag(start.fragment());
-      Append(frag.mStart, frag.mEnd - frag.mStart);
-      start.advance(start.size_forward());
-    }
+    if (!aString)
+      // Leave us as an uninitialized nsCAutoString.
+      return;
+    Init(aString, nsCharTraits<PRUnichar>::length(aString));
  }

-void
-NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength )
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
  {
-    // Handle null string by just leaving us as a brand-new
-    // uninitialized nsCAutoString.
-    if (! aString)
+    if (!aString)
+      // Leave us as an uninitialized nsCAutoString.
      return;
+    Init(aString, aLength);
+  }

-    // Calculate how many bytes we need
-    const PRUnichar* p;
-    PRInt32 count, utf8len;
-    for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++)
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString )
+  {
+    nsASingleFragmentString::const_char_iterator start;
+    Init(aString.BeginReading(start), aString.Length()); // whole string handed to Init in one call
+  }
+
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
+  {
+    // Compute space required: do this once so we don't incur multiple
+    // allocations. This "optimization" is probably of dubious value...
+
+    nsAString::const_iterator start, end;
+    CalculateUTF8Size calculator;
+    copy_string(aString.BeginReading(start), aString.EndReading(end),
+                calculator);
+
+    PRUint32 count = calculator.Size();
+
+    if (count)
      {
-        if (! ((*p) & 0xFF80))
-          utf8len += 1; // 0000 0000 - 0000 007F
-        else if (! ((*p) & 0xF800))
-          utf8len += 2; // 0000 0080 - 0000 07FF
-        else 
-          utf8len += 3; // 0000 0800 - 0000 FFFF
-        // Note: Surrogate pair needs 4 bytes, but in this calcuation
-        // we count it as 6 bytes. It will waste 2 bytes per surrogate pair
-      }
+        // Grow the buffer if we need to.
+        SetCapacity(count);

-    // Make sure our buffer's big enough, so we don't need to do
-    // multiple allocations.
-    if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer))
-      SetCapacity(mLength+utf8len+1);
-    // |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h),
-    //  we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want
+        // All ready? Time to convert

-    char* out = mStr+mLength;
-    PRUint32 ucs4=0;
-
-    for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++)
-      {
-        if (0 == ucs4)
+        ConvertUCS2toUTF8 converter(mStr);
+        copy_string(aString.BeginReading(start), aString.EndReading(end),
+                    converter).write_terminator();
+        mLength = converter.Size();
+        if (mLength != count)
          {
-            if (! ((*p) & 0xFF80))
-              {
-                *out++ = (char)*p;
-              } 
-            else if (! ((*p) & 0xF800))
-              {
-                *out++ = 0xC0 | (char)((*p) >> 6);
-                *out++ = 0x80 | (char)(0x003F & (*p));
-              }
-            else
-              {
-                if (0xD800 == (0xFC00 & (*p))) 
-                  {
-                    // D800- DBFF - High Surrogate 
-                    // N = (H- D800) *400 + 10000 + ...
-                    ucs4 = 0x10000 | ((0x03FF & (*p)) << 10);
-                  }
-                else if (0xDC00 == (0xFC00 & (*p)))
-                  { 
-                    // DC00- DFFF - Low Surrogate 
-                    // error here. We should hit High Surrogate first
-                    // Do not output any thing in this case
-                  }
-                else
-                  {
-                    *out++ = 0xE0 | (char)((*p) >> 12);
-                    *out++ = 0x80 | (char)(0x003F & (*p >> 6));
-                    *out++ = 0x80 | (char)(0x003F & (*p) );
-                  }
-              }
-          }
-        else
-          {
-            if (0xDC00 == (0xFC00 & (*p)))
-              { 
-                // DC00- DFFF - Low Surrogate 
-                // N += ( L - DC00 )  
-                ucs4 |= (0x03FF & (*p));
-
-                // 0001 0000-001F FFFF
-                *out++ = 0xF0 | (char)(ucs4 >> 18);
-                *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
-                *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
-                *out++ = 0x80 | (char)(0x003F & ucs4) ;
-              }
-            else
-              {
-                // Got a High Surrogate but no low surrogate
-                // output nothing.
-              }
-            ucs4 = 0;
+            NS_ERROR("Input invalid or incorrect length was calculated");
+            Truncate();
          }
      }
+  }

-    *out = '\0'; // null terminate
-    mLength += utf8len;
+void NS_ConvertUCS2toUTF8::Init( const PRUnichar* aString, PRUint32 aLength )
+  {
+    // Two passes over the input: first measure the UTF-8 size so the buffer is
+    // sized once, then convert. This "optimization" is probably of dubious value...
+
+    CalculateUTF8Size calculator;
+    calculator.write(aString, aLength);
+
+    PRUint32 count = calculator.Size();
+
+    if (count) // when no output bytes are needed the string is left untouched
+      {
+        // Grow the buffer if we need to. NOTE(review): the terminator below lands at mStr[count]; presumably SetCapacity(count) leaves room for it -- confirm.
+        SetCapacity(count);
+
+        // All ready? Time to convert
+
+        ConvertUCS2toUTF8 converter(mStr);
+        converter.write(aString, aLength);
+        mLength = converter.Size();
+        mStr[mLength] = char_type(0); // null-terminate the converted bytes
+        if (mLength != count) // the measuring and converting passes disagreed
+          {
+            NS_ERROR("Input invalid or incorrect length was calculated");
+            Truncate();
+          }
+      }
  }

 NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString )
--- a/string/obsolete/nsString.h
+++ b/string/obsolete/nsString.h
@ -431,24 +431,13 @@ class NS_COM NS_ConvertUCS2toUTF8
    */
  {
    public:
-      friend NS_COM char* ToNewUTF8String( const nsAString& aSource );
-
-    public:
-      explicit
-      NS_ConvertUCS2toUTF8( const PRUnichar* aString )
-        {
-          Append( aString, ~PRUint32(0) /* MAXINT */);
-        }
-
-      NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
-        {
-          Append( aString, aLength );
-        }
-
+      explicit NS_ConvertUCS2toUTF8( const PRUnichar* aString );
+      NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength );
      explicit NS_ConvertUCS2toUTF8( const nsAString& aString );
+      explicit NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString );

    protected:
-      void Append( const PRUnichar* aString, PRUint32 aLength );
+      void Init( const PRUnichar* aString, PRUint32 aLength );

    private:
        // NOT TO BE IMPLEMENTED
--- a/string/obsolete/nsString2.cpp
+++ b/string/obsolete/nsString2.cpp
@ -1351,34 +1351,82 @@ NS_ConvertASCIItoUCS2::NS_ConvertASCIItoUCS2( const nsACString& aCString )
      }
  }

-void
-NS_ConvertUTF8toUCS2::Init( const nsACString& aCString )
-{
-  // Compute space required: do this once so we don't incur multiple
-  // allocations. This "optimization" is probably of dubious value...
+NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsACString& aCString )
+  {
+    // Compute space required: do this once so we don't incur multiple
+    // allocations. This "optimization" is probably of dubious value...

-  nsACString::const_iterator start, end;
-  CalculateUTF8Length calculator;
-  copy_string(aCString.BeginReading(start), aCString.EndReading(end), calculator);
+    nsACString::const_iterator start, end;
+    CalculateUTF8Length calculator;
+    copy_string(aCString.BeginReading(start), aCString.EndReading(end),
+                calculator);

-  PRUint32 count = calculator.Length();
+    PRUint32 count = calculator.Length();

-  if (count) {
-    // Grow the buffer if we need to.
-    SetLength(count);
+    if (count)
+      {
+        // Grow the buffer if we need to.
+        SetCapacity(count);

-    // All ready? Time to convert
+        // All ready? Time to convert

-    ConvertUTF8toUCS2 converter(mUStr);
-    copy_string(aCString.BeginReading(start), aCString.EndReading(end), converter);
-    mLength = converter.Length();
-    if (mLength != count) {
-      NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
-      Truncate();
-    }
+        ConvertUTF8toUCS2 converter(mUStr);
+        copy_string(aCString.BeginReading(start), aCString.EndReading(end),
+                    converter).write_terminator();
+        mLength = converter.Length();
+        if (mLength != count)
+          {
+            NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
+            Truncate();
+          }
+      }
  }

-}
+NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString )
+  {
+    nsASingleFragmentCString::const_char_iterator start;
+    Init(aCString.BeginReading(start), aCString.Length()); // whole string handed to Init in one call
+  }
+
+NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString )
+  {
+    Init(aCString, nsCharTraits<char>::length(aCString)); // NOTE(review): no null check here, unlike NS_ConvertUCS2toUTF8( const PRUnichar* ) -- confirm callers never pass null
+  }
+
+NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength )
+  {
+    Init(aCString, aLength); // aLength is the number of chars at aCString that Init will read
+  }
+
+void
+NS_ConvertUTF8toUCS2::Init( const char* aCString, PRUint32 aLength )
+  {
+    // Two passes over the input: first measure the output length so the buffer
+    // is sized once, then convert. This "optimization" is probably of dubious value...
+
+    CalculateUTF8Length calculator;
+    calculator.write(aCString, aLength);
+
+    PRUint32 count = calculator.Length();
+
+    if (count) // when the calculated length is zero the string is left untouched
+      {
+        // Grow the buffer if we need to. NOTE(review): the terminator below lands at mUStr[count]; presumably SetCapacity(count) leaves room for it -- confirm.
+        SetCapacity(count);
+
+        // All ready? Time to convert
+
+        ConvertUTF8toUCS2 converter(mUStr);
+        converter.write(aCString, aLength);
+        mLength = converter.Length();
+        mUStr[mLength] = char_type(0); // null-terminate the converted text
+        if (mLength != count) // the measuring and converting passes disagreed
+          {
+            NS_ERROR("Input invalid or incorrect length was calculated");
+            Truncate();
+          }
+      }
+  }

 /**
 * Default copy constructor
--- a/string/obsolete/nsString2.h
+++ b/string/obsolete/nsString2.h
@ -530,25 +530,13 @@ class NS_COM NS_ConvertUTF8toUCS2
      : public nsAutoString
  {
    public:
-      explicit
-      NS_ConvertUTF8toUCS2( const nsACString& aCString )
-        {
-          Init( aCString );
-        }
-
-      explicit
-      NS_ConvertUTF8toUCS2( const char* aCString )
-        {
-          Init( nsDependentCString( aCString ) );
-        }
-
-      NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength )
-        {
-          Init( Substring( aCString, aCString + aLength ) );
-        }
+      explicit NS_ConvertUTF8toUCS2( const nsACString& aCString );
+      explicit NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString );
+      explicit NS_ConvertUTF8toUCS2( const char* aCString );
+      NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength );

    protected:
-      void Init( const nsACString& aCString );
+      void Init( const char* aCString, PRUint32 aLength );

    private:
      NS_ConvertUTF8toUCS2( PRUnichar );
--- a/string/public/nsUTF8Utils.h
+++ b/string/public/nsUTF8Utils.h
@ -54,6 +54,12 @@ class UTF8traits
 #define PLANE1_BASE           0x00010000  
 #define UCS2_REPLACEMENT_CHAR 0xfffd     

+#ifdef __GNUC__
+#define NS_ALWAYS_INLINE __attribute__((always_inline))
+#else
+#define NS_ALWAYS_INLINE
+#endif
+
 /**
 * A character sink (see |copy_string| in nsAlgorithm.h) for converting
 * UTF-8 to UCS2 (really UTF-16).
@ -69,7 +75,7 @@ class ConvertUTF8toUCS2

    size_t Length() const { return mBuffer - mStart; }

-    PRUint32 write( const value_type* start, PRUint32 N )
+    PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
      {
        if ( mErrorEncountered )
          return N;
@ -78,13 +84,14 @@ class ConvertUTF8toUCS2
        // be spread across fragments
        const value_type* p = start;
        const value_type* end = start + N;
+        buffer_type* out = mBuffer;
        for ( ; p != end /* && *p */; )
          {
            char c = *p++;

            if ( UTF8traits::isASCII(c) )
              {
-                *mBuffer++ = buffer_type(c);
+                *out++ = buffer_type(c);
                continue;
              }

@ -126,6 +133,7 @@ class ConvertUTF8toUCS2
              {
                NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
                mErrorEncountered = PR_TRUE;
+                mBuffer = out;
                return N;
              }

@ -142,6 +150,7 @@ class ConvertUTF8toUCS2
                  {
                    NS_ERROR("not a UTF8 string");
                    mErrorEncountered = PR_TRUE;
+                    mBuffer = out;
                    return N;
                  }
              }
@ -149,39 +158,40 @@ class ConvertUTF8toUCS2
            if ( ucs4 < minUcs4 )
              {
                // Overlong sequence
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+                *out++ = UCS2_REPLACEMENT_CHAR;
              }
            else if ( ucs4 <= 0xD7FF )
              {
-                *mBuffer++ = ucs4;
+                *out++ = ucs4;
              }
            else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
              {
                // Surrogates
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+                *out++ = UCS2_REPLACEMENT_CHAR;
              }
            else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
              {
                // Prohibited characters
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+                *out++ = UCS2_REPLACEMENT_CHAR;
              }
            else if ( ucs4 >= PLANE1_BASE )
              {
                if ( ucs4 >= 0x00110000 )
-                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+                  *out++ = UCS2_REPLACEMENT_CHAR;
                else {
                  // surrogate, see unicode specification 3.7 for following math.
                  ucs4 -= PLANE1_BASE;
-                  *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
-                  *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
+                  *out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
+                  *out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
                }
              }
            else
              {
                if ( ucs4 != 0xFEFF ) // ignore BOM
-                    *mBuffer++ = ucs4;
+                    *out++ = ucs4;
              }
          }
+        mBuffer = out;
        return p - start;
      }

@ -191,7 +201,7 @@ class ConvertUTF8toUCS2
      }

    private:
-      buffer_type* mStart;
+      buffer_type* const mStart;
      buffer_type* mBuffer;
      PRBool mErrorEncountered;
  };
@ -209,7 +219,7 @@ class CalculateUTF8Length

    size_t Length() const { return mLength; }

-    PRUint32 write( const value_type* start, PRUint32 N )
+    PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
      {
          // ignore any further requests
        if ( mErrorEncountered )
@ -255,4 +265,152 @@ class CalculateUTF8Length
      PRBool mErrorEncountered;
  };

+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for converting
+ * UCS2 (really UTF-16) to UTF-8. Output is written unchecked: the buffer must already be large enough.
+ */
+class ConvertUCS2toUTF8
+  {
+    public:
+      typedef nsAString::char_type  value_type;
+      typedef nsACString::char_type buffer_type;
+
+    // The error handling here is more lenient than that in
+    // |ConvertUTF8toUCS2| (bad surrogates are skipped after an NS_ERROR and
+    // nothing is substituted), but it's that way for backwards compatibility.
+
+    ConvertUCS2toUTF8( buffer_type* aBuffer )
+        : mStart(aBuffer), mBuffer(aBuffer) {}
+
+    size_t Size() const { return mBuffer - mStart; } // number of buffer_type units written so far
+
+    PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
+      {
+        buffer_type *out = mBuffer; // work on a local copy of mBuffer; gcc isn't smart enough to do this itself!
+
+        for (const value_type *p = start, *end = start + N; p < end; ++p )
+          {
+            value_type c = *p;
+            if (! (c & 0xFF80)) // U+0000 - U+007F
+              {
+                *out++ = (char)c;
+              }
+            else if (! (c & 0xF800)) // U+0080 - U+07FF
+              {
+                *out++ = 0xC0 | (char)(c >> 6);
+                *out++ = 0x80 | (char)(0x003F & c);
+              }
+            else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
+              {
+                *out++ = 0xE0 | (char)(c >> 12);
+                *out++ = 0x80 | (char)(0x003F & (c >> 6));
+                *out++ = 0x80 | (char)(0x003F & c );
+              }
+            else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+              {
+                // D800 - DBFF - High Surrogate
+                // ucs4 = (H - 0xD800) * 0x400 + 0x10000 + ...
+                PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
+
+                ++p;
+                if (p == end)
+                  {
+                    NS_ERROR("Surrogate pair split between fragments");
+                    mBuffer = out;
+                    return N; // the dangling high surrogate produces no output
+                  }
+                c = *p;
+
+                if (0xDC00 == (0xFC00 & c))
+                  {
+                    // DC00 - DFFF - Low Surrogate
+                    // ucs4 += (L - 0xDC00)
+                    ucs4 |= (0x03FF & c);
+
+                    // four-byte form (0001 0000-001F FFFF)
+                    *out++ = 0xF0 | (char)(ucs4 >> 18);
+                    *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
+                    *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
+                    *out++ = 0x80 | (char)(0x003F & ucs4);
+                  }
+                else
+                  {
+                    NS_ERROR("got a High Surrogate but no low surrogate");
+                    // output nothing; the unit just read into c is consumed and dropped as well.
+                  }
+              }
+            else // U+DC00 - U+DFFF
+              {
+                // DC00 - DFFF - Low Surrogate
+                NS_ERROR("got a low Surrogate but no high surrogate");
+                // output nothing.
+              }
+          }
+
+        mBuffer = out;
+        return N;
+      }
+
+    void write_terminator()
+      {
+        *mBuffer = buffer_type(0); // mBuffer is not advanced, so Size() excludes the terminator
+      }
+
+    private:
+      buffer_type* const mStart; // beginning of the output buffer
+      buffer_type* mBuffer; // next position to write
+  };
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for computing
+ * the number of bytes a UCS2 (really UTF-16) string would occupy in UTF-8.
+ */
+class CalculateUTF8Size
+  {
+    public:
+      typedef nsAString::char_type value_type;
+
+    CalculateUTF8Size()
+      : mSize(0) { }
+
+    size_t Size() const { return mSize; } // running total, terminator not included
+
+    PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
+      {
+        // Assume UCS2 surrogate pairs won't be spread across fragments.
+        for (const value_type *p = start, *end = start + N; p < end; ++p )
+          {
+            value_type c = *p;
+            if (! (c & 0xFF80)) // U+0000 - U+007F
+              mSize += 1;
+            else if (! (c & 0xF800)) // U+0080 - U+07FF
+              mSize += 2;
+            else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
+              mSize += 3;
+            else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+              {
+                ++p;
+                if (p == end)
+                  {
+                    NS_ERROR("Surrogate pair split between fragments");
+                    return N; // the dangling high surrogate adds nothing to mSize
+                  }
+                c = *p;
+
+                if (0xDC00 == (0xFC00 & c))
+                  mSize += 4;
+                else
+                  NS_ERROR("got a high Surrogate but no low surrogate"); // neither unit is counted
+              }
+            else // U+DC00 - U+DFFF
+              NS_ERROR("got a low Surrogate but no high surrogate"); // not counted
+          }
+
+        return N;
+      }
+
+    private:
+      size_t mSize; // bytes counted so far
+  };
+
 #endif /* !defined(nsUTF8Utils_h_) */
--- a/string/src/nsReadableUtils.cpp
+++ b/string/src/nsReadableUtils.cpp
@ -245,24 +245,18 @@ NS_COM
 char*
 ToNewUTF8String( const nsAString& aSource )
  {
-    // XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
-    // refactored so that we can use it here without a double-copy.
-    NS_ConvertUCS2toUTF8 temp(aSource);
+    nsAString::const_iterator start, end;
+    CalculateUTF8Size calculator;
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                calculator);

-    char* result;
-    if (temp.GetOwnsBuffer()) {
-      // We allocated. Trick the string into not freeing its buffer to
-      // avoid an extra allocation.
-      result = temp.mStr;
+    char *result = NS_STATIC_CAST(char*,
+        nsMemory::Alloc(calculator.Size() + 1));

-      temp.mStr=0;
-      temp.SetOwnsBuffer(PR_FALSE);
-    }
-    else {
-      // We didn't allocate a buffer, so we need to copy it out of the
-      // nsCAutoString's storage.
-      result = ToNewCString(temp);
-    }
+    ConvertUCS2toUTF8 converter(result);
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                converter).write_terminator();
+    NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch");

    return result;
  }
--- a/xpcom/string/obsolete/nsString.cpp
+++ b/xpcom/string/obsolete/nsString.cpp
@ -45,6 +45,7 @@
 #include "nsString.h"
 #include "nsReadableUtils.h"
 #include "nsDebug.h"
+#include "nsUTF8Utils.h"

 #ifndef nsCharTraits_h___
 #include "nsCharTraits.h"
@ -1086,111 +1087,86 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P

 //----------------------------------------------------------------------

-NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString )
  {
-    nsAString::const_iterator start; aString.BeginReading(start);
-    nsAString::const_iterator end;   aString.EndReading(end);
-    
-    while (start != end) {
-      nsReadableFragment<PRUnichar> frag(start.fragment());
-      Append(frag.mStart, frag.mEnd - frag.mStart);
-      start.advance(start.size_forward());
-    }
+    if (!aString)
+      // Leave us as an uninitialized nsCAutoString.
+      return;
+    Init(aString, nsCharTraits<PRUnichar>::length(aString));
  }

-void
-NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength )
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
  {
-    // Handle null string by just leaving us as a brand-new
-    // uninitialized nsCAutoString.
-    if (! aString)
+    if (!aString)
+      // Leave us as an uninitialized nsCAutoString.
      return;
+    Init(aString, aLength);
+  }

-    // Calculate how many bytes we need
-    const PRUnichar* p;
-    PRInt32 count, utf8len;
-    for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++)
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString )
+  {
+    nsASingleFragmentString::const_char_iterator start;
+    Init(aString.BeginReading(start), aString.Length()); // whole string handed to Init in one call
+  }
+
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
+  {
+    // Compute space required: do this once so we don't incur multiple
+    // allocations. This "optimization" is probably of dubious value...
+
+    nsAString::const_iterator start, end;
+    CalculateUTF8Size calculator;
+    copy_string(aString.BeginReading(start), aString.EndReading(end),
+                calculator);
+
+    PRUint32 count = calculator.Size();
+
+    if (count)
      {
-        if (! ((*p) & 0xFF80))
-          utf8len += 1; // 0000 0000 - 0000 007F
-        else if (! ((*p) & 0xF800))
-          utf8len += 2; // 0000 0080 - 0000 07FF
-        else 
-          utf8len += 3; // 0000 0800 - 0000 FFFF
-        // Note: Surrogate pair needs 4 bytes, but in this calcuation
-        // we count it as 6 bytes. It will waste 2 bytes per surrogate pair
-      }
+        // Grow the buffer if we need to.
+        SetCapacity(count);

-    // Make sure our buffer's big enough, so we don't need to do
-    // multiple allocations.
-    if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer))
-      SetCapacity(mLength+utf8len+1);
-    // |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h),
-    //  we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want
+        // All ready? Time to convert

-    char* out = mStr+mLength;
-    PRUint32 ucs4=0;
-
-    for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++)
-      {
-        if (0 == ucs4)
+        ConvertUCS2toUTF8 converter(mStr);
+        copy_string(aString.BeginReading(start), aString.EndReading(end),
+                    converter).write_terminator();
+        mLength = converter.Size();
+        if (mLength != count)
          {
-            if (! ((*p) & 0xFF80))
-              {
-                *out++ = (char)*p;
-              } 
-            else if (! ((*p) & 0xF800))
-              {
-                *out++ = 0xC0 | (char)((*p) >> 6);
-                *out++ = 0x80 | (char)(0x003F & (*p));
-              }
-            else
-              {
-                if (0xD800 == (0xFC00 & (*p))) 
-                  {
-                    // D800- DBFF - High Surrogate 
-                    // N = (H- D800) *400 + 10000 + ...
-                    ucs4 = 0x10000 | ((0x03FF & (*p)) << 10);
-                  }
-                else if (0xDC00 == (0xFC00 & (*p)))
-                  { 
-                    // DC00- DFFF - Low Surrogate 
-                    // error here. We should hit High Surrogate first
-                    // Do not output any thing in this case
-                  }
-                else
-                  {
-                    *out++ = 0xE0 | (char)((*p) >> 12);
-                    *out++ = 0x80 | (char)(0x003F & (*p >> 6));
-                    *out++ = 0x80 | (char)(0x003F & (*p) );
-                  }
-              }
-          }
-        else
-          {
-            if (0xDC00 == (0xFC00 & (*p)))
-              { 
-                // DC00- DFFF - Low Surrogate 
-                // N += ( L - DC00 )  
-                ucs4 |= (0x03FF & (*p));
-
-                // 0001 0000-001F FFFF
-                *out++ = 0xF0 | (char)(ucs4 >> 18);
-                *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
-                *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
-                *out++ = 0x80 | (char)(0x003F & ucs4) ;
-              }
-            else
-              {
-                // Got a High Surrogate but no low surrogate
-                // output nothing.
-              }
-            ucs4 = 0;
+            NS_ERROR("Input invalid or incorrect length was calculated");
+            Truncate();
          }
      }
+  }

-    *out = '\0'; // null terminate
-    mLength += utf8len;
+void NS_ConvertUCS2toUTF8::Init( const PRUnichar* aString, PRUint32 aLength )
+  {
+    // Two passes over the input: first measure the UTF-8 size so the buffer is
+    // sized once, then convert. This "optimization" is probably of dubious value...
+
+    CalculateUTF8Size calculator;
+    calculator.write(aString, aLength);
+
+    PRUint32 count = calculator.Size();
+
+    if (count) // when no output bytes are needed the string is left untouched
+      {
+        // Grow the buffer if we need to. NOTE(review): the terminator below lands at mStr[count]; presumably SetCapacity(count) leaves room for it -- confirm.
+        SetCapacity(count);
+
+        // All ready? Time to convert
+
+        ConvertUCS2toUTF8 converter(mStr);
+        converter.write(aString, aLength);
+        mLength = converter.Size();
+        mStr[mLength] = char_type(0); // null-terminate the converted bytes
+        if (mLength != count) // the measuring and converting passes disagreed
+          {
+            NS_ERROR("Input invalid or incorrect length was calculated");
+            Truncate();
+          }
+      }
  }

 NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString )
--- a/xpcom/string/obsolete/nsString.h
+++ b/xpcom/string/obsolete/nsString.h
@ -431,24 +431,13 @@ class NS_COM NS_ConvertUCS2toUTF8
    */
  {
    public:
-      friend NS_COM char* ToNewUTF8String( const nsAString& aSource );
-
-    public:
-      explicit
-      NS_ConvertUCS2toUTF8( const PRUnichar* aString )
-        {
-          Append( aString, ~PRUint32(0) /* MAXINT */);
-        }
-
-      NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
-        {
-          Append( aString, aLength );
-        }
-
+      explicit NS_ConvertUCS2toUTF8( const PRUnichar* aString );
+      NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength );
      explicit NS_ConvertUCS2toUTF8( const nsAString& aString );
+      explicit NS_ConvertUCS2toUTF8( const nsASingleFragmentString& aString );

    protected:
-      void Append( const PRUnichar* aString, PRUint32 aLength );
+      void Init( const PRUnichar* aString, PRUint32 aLength );

    private:
        // NOT TO BE IMPLEMENTED
--- a/xpcom/string/obsolete/nsString2.cpp
+++ b/xpcom/string/obsolete/nsString2.cpp
@ -1351,34 +1351,82 @@ NS_ConvertASCIItoUCS2::NS_ConvertASCIItoUCS2( const nsACString& aCString )
      }
  }

-void
-NS_ConvertUTF8toUCS2::Init( const nsACString& aCString )
-{
-  // Compute space required: do this once so we don't incur multiple
-  // allocations. This "optimization" is probably of dubious value...
+NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsACString& aCString )
+  {
+    // First pass over the input: compute the space required so the buffer
+    // is sized once. This "optimization" is probably of dubious value...

-  nsACString::const_iterator start, end;
-  CalculateUTF8Length calculator;
-  copy_string(aCString.BeginReading(start), aCString.EndReading(end), calculator);
+    nsACString::const_iterator start, end;
+    CalculateUTF8Length calculator;
+    copy_string(aCString.BeginReading(start), aCString.EndReading(end),
+                calculator);

-  PRUint32 count = calculator.Length();
+    PRUint32 count = calculator.Length(); // length the converter is expected to produce

-  if (count) {
-    // Grow the buffer if we need to.
-    SetLength(count);
+    if (count) // when nothing would be written the string is left untouched
+      {
+        // Grow the buffer if we need to.
+        SetCapacity(count);

-    // All ready? Time to convert
+        // Second pass: convert directly into our own buffer.

-    ConvertUTF8toUCS2 converter(mUStr);
-    copy_string(aCString.BeginReading(start), aCString.EndReading(end), converter);
-    mLength = converter.Length();
-    if (mLength != count) {
-      NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
-      Truncate();
-    }
+        ConvertUTF8toUCS2 converter(mUStr);
+        copy_string(aCString.BeginReading(start), aCString.EndReading(end),
+                    converter).write_terminator(); // terminate through the value copy_string returns
+        mLength = converter.Length(); // units actually written
+        if (mLength != count) // the two passes disagreed
+          {
+            NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
+            Truncate();
+          }
+      }
  }

-}
+NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString )
+  { // the single-fragment string is handed to Init as a start position plus a length
+    nsASingleFragmentCString::const_char_iterator start;
+    Init(aCString.BeginReading(start), aCString.Length());
+  }
+
+NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString )
+  { // aCString's length is measured with nsCharTraits<char>::length before converting
+    Init(aCString, nsCharTraits<char>::length(aCString)); // NOTE(review): no null guard here, unlike the const PRUnichar* constructors of NS_ConvertUCS2toUTF8 -- confirm callers never pass null
+  }
+
+NS_ConvertUTF8toUCS2::NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength )
+  { // pointer and length are forwarded to Init unchanged
+    Init(aCString, aLength);
+  }
+
+void
+NS_ConvertUTF8toUCS2::Init( const char* aCString, PRUint32 aLength )
+  {
+    // Store the UTF-16 form of the aLength bytes at aCString in this string.
+    // The space required is computed up front so the buffer is sized once;
+    // this "optimization" is probably of dubious value...
+    CalculateUTF8Length calculator;
+    calculator.write(aCString, aLength);
+
+    PRUint32 count = calculator.Length(); // length the converter is expected to produce
+
+    if (count) // when nothing would be written the string is left untouched
+      {
+        // Grow the buffer if we need to (presumably this also leaves room for
+        // the terminator stored below -- confirm against SetCapacity).
+        SetCapacity(count);
+
+        // Second pass: convert directly into our own buffer.
+
+        ConvertUTF8toUCS2 converter(mUStr);
+        converter.write(aCString, aLength);
+        mLength = converter.Length(); // units actually written
+        mUStr[mLength] = char_type(0); // terminate by hand instead of calling write_terminator()
+        if (mLength != count) // the two passes disagreed
+          {
+            NS_ERROR("Input invalid or incorrect length was calculated");
+            Truncate();
+          }
+      }
+  }

 /**
 * Default copy constructor
--- a/xpcom/string/obsolete/nsString2.h
+++ b/xpcom/string/obsolete/nsString2.h
@ -530,25 +530,13 @@ class NS_COM NS_ConvertUTF8toUCS2
      : public nsAutoString
  {
    public:
-      explicit
-      NS_ConvertUTF8toUCS2( const nsACString& aCString )
-        {
-          Init( aCString );
-        }
-
-      explicit
-      NS_ConvertUTF8toUCS2( const char* aCString )
-        {
-          Init( nsDependentCString( aCString ) );
-        }
-
-      NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength )
-        {
-          Init( Substring( aCString, aCString + aLength ) );
-        }
+      explicit NS_ConvertUTF8toUCS2( const nsACString& aCString ); // walks the string with copy_string instead of going through Init
+      explicit NS_ConvertUTF8toUCS2( const nsASingleFragmentCString& aCString ); // forwards start position and Length() to Init
+      explicit NS_ConvertUTF8toUCS2( const char* aCString ); // null-terminated input; length taken with nsCharTraits<char>::length
+      NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength ); // aLength counts bytes of UTF-8 input

    protected:
-      void Init( const nsACString& aCString );
+      void Init( const char* aCString, PRUint32 aLength ); // shared worker: sizes the buffer, then converts aLength bytes

    private:
      NS_ConvertUTF8toUCS2( PRUnichar );
--- a/xpcom/string/public/nsUTF8Utils.h
+++ b/xpcom/string/public/nsUTF8Utils.h
@ -54,6 +54,12 @@ class UTF8traits
 #define PLANE1_BASE           0x00010000  
 #define UCS2_REPLACEMENT_CHAR 0xfffd     

+#ifdef __GNUC__ /* NS_ALWAYS_INLINE decorates the character sinks' write() methods in this header */
+#define NS_ALWAYS_INLINE __attribute__((always_inline))
+#else /* any other compiler: the macro expands to nothing */
+#define NS_ALWAYS_INLINE
+#endif /* __GNUC__ */
+
 /**
 * A character sink (see |copy_string| in nsAlgorithm.h) for converting
 * UTF-8 to UCS2 (really UTF-16).
@ -69,7 +75,7 @@ class ConvertUTF8toUCS2

    size_t Length() const { return mBuffer - mStart; }

-    PRUint32 write( const value_type* start, PRUint32 N )
+    PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
      {
        if ( mErrorEncountered )
          return N;
@ -78,13 +84,14 @@ class ConvertUTF8toUCS2
        // be spread across fragments
        const value_type* p = start;
        const value_type* end = start + N;
+        buffer_type* out = mBuffer;
        for ( ; p != end /* && *p */; )
          {
            char c = *p++;

            if ( UTF8traits::isASCII(c) )
              {
-                *mBuffer++ = buffer_type(c);
+                *out++ = buffer_type(c);
                continue;
              }

@ -126,6 +133,7 @@ class ConvertUTF8toUCS2
              {
                NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
                mErrorEncountered = PR_TRUE;
+                mBuffer = out;
                return N;
              }

@ -142,6 +150,7 @@ class ConvertUTF8toUCS2
                  {
                    NS_ERROR("not a UTF8 string");
                    mErrorEncountered = PR_TRUE;
+                    mBuffer = out;
                    return N;
                  }
              }
@ -149,39 +158,40 @@ class ConvertUTF8toUCS2
            if ( ucs4 < minUcs4 )
              {
                // Overlong sequence
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+                *out++ = UCS2_REPLACEMENT_CHAR;
              }
            else if ( ucs4 <= 0xD7FF )
              {
-                *mBuffer++ = ucs4;
+                *out++ = ucs4;
              }
            else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
              {
                // Surrogates
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+                *out++ = UCS2_REPLACEMENT_CHAR;
              }
            else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
              {
                // Prohibited characters
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+                *out++ = UCS2_REPLACEMENT_CHAR;
              }
            else if ( ucs4 >= PLANE1_BASE )
              {
                if ( ucs4 >= 0x00110000 )
-                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+                  *out++ = UCS2_REPLACEMENT_CHAR;
                else {
                  // surrogate, see unicode specification 3.7 for following math.
                  ucs4 -= PLANE1_BASE;
-                  *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
-                  *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
+                  *out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
+                  *out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
                }
              }
            else
              {
                if ( ucs4 != 0xFEFF ) // ignore BOM
-                    *mBuffer++ = ucs4;
+                    *out++ = ucs4;
              }
          }
+        mBuffer = out;
        return p - start;
      }

@ -191,7 +201,7 @@ class ConvertUTF8toUCS2
      }

    private:
-      buffer_type* mStart;
+      buffer_type* const mStart;
      buffer_type* mBuffer;
      PRBool mErrorEncountered;
  };
@ -209,7 +219,7 @@ class CalculateUTF8Length

    size_t Length() const { return mLength; }

-    PRUint32 write( const value_type* start, PRUint32 N )
+    PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
      {
          // ignore any further requests
        if ( mErrorEncountered )
@ -255,4 +265,152 @@ class CalculateUTF8Length
      PRBool mErrorEncountered;
  };

+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for converting
+ * UCS2 (really UTF-16) to UTF-8.
+ *
+ * The sink writes through the pointer it is constructed with and performs
+ * no bounds checking of its own; the callers added alongside it size the
+ * destination with |CalculateUTF8Size| first.  Unpaired surrogates produce
+ * no output, and |write| never null-terminates -- that is what
+ * |write_terminator| is for.
+ */
+class ConvertUCS2toUTF8
+  {
+    public:
+      typedef nsAString::char_type  value_type;
+      typedef nsACString::char_type buffer_type;
+
+    // The error handling here is more lenient than that in
+    // |ConvertUTF8toUCS2|, but it's that way for backwards
+    // compatibility.
+
+    ConvertUCS2toUTF8( buffer_type* aBuffer ) // output starts at aBuffer
+        : mStart(aBuffer), mBuffer(aBuffer) {}
+
+    size_t Size() const { return mBuffer - mStart; } // bytes written so far
+
+    PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
+      {
+        buffer_type *out = mBuffer; // local copy of the cursor; gcc isn't smart enough to do this!
+
+        for (const value_type *p = start, *end = start + N; p < end; ++p )
+          {
+            value_type c = *p;
+            if (! (c & 0xFF80)) // U+0000 - U+007F
+              {
+                *out++ = (char)c;
+              }
+            else if (! (c & 0xF800)) // U+0080 - U+07FF
+              {
+                *out++ = 0xC0 | (char)(c >> 6);
+                *out++ = 0x80 | (char)(0x003F & c);
+              }
+            else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
+              {
+                *out++ = 0xE0 | (char)(c >> 12);
+                *out++ = 0x80 | (char)(0x003F & (c >> 6));
+                *out++ = 0x80 | (char)(0x003F & c );
+              }
+            else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+              {
+                // D800- DBFF - High Surrogate
+                // N = (H- D800) *400 + 10000 + ...   (all values hexadecimal)
+                PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
+
+                ++p; // step onto the unit that should be the low surrogate
+                if (p == end)
+                  {
+                    NS_ERROR("Surrogate pair split between fragments");
+                    mBuffer = out; // keep what was converted; the lone high surrogate is dropped
+                    return N;
+                  }
+                c = *p;
+
+                if (0xDC00 == (0xFC00 & c))
+                  {
+                    // DC00- DFFF - Low Surrogate
+                    // N += ( L - DC00 )
+                    ucs4 |= (0x03FF & c);
+
+                    // 0001 0000-001F FFFF
+                    *out++ = 0xF0 | (char)(ucs4 >> 18);
+                    *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
+                    *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
+                    *out++ = 0x80 | (char)(0x003F & ucs4);
+                  }
+                else
+                  {
+                    NS_ERROR("got a High Surrogate but no low surrogate");
+                    // output nothing; the unit that followed the high surrogate has been consumed as well.
+                  }
+              }
+            else // U+DC00 - U+DFFF
+              {
+                // DC00- DFFF - Low Surrogate
+                NS_ERROR("got a low Surrogate but no high surrogate");
+                // output nothing.
+              }
+          }
+
+        mBuffer = out; // publish the advanced cursor
+        return N; // the whole fragment is always reported as consumed
+      }
+
+    void write_terminator()
+      {
+        *mBuffer = buffer_type(0); // the cursor is not advanced, so Size() excludes the terminator
+      }
+
+    private:
+      buffer_type* const mStart;
+      buffer_type* mBuffer;
+  };
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for computing
+ * the number of bytes a UCS2 (really UTF-16) would occupy in UTF-8.
+ *
+ * The branches mirror those of |ConvertUCS2toUTF8::write| so that the
+ * total matches what that sink writes; units for which the converter
+ * outputs nothing (unpaired surrogates) add nothing here either.  The
+ * total does not include a null terminator.
+ */
+class CalculateUTF8Size
+  {
+    public:
+      typedef nsAString::char_type value_type;
+
+    CalculateUTF8Size()
+      : mSize(0) { }
+
+    size_t Size() const { return mSize; } // running total over every write() so far
+
+    PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
+      {
+        // Assume UCS2 surrogate pairs won't be spread across fragments.
+        for (const value_type *p = start, *end = start + N; p < end; ++p )
+          {
+            value_type c = *p;
+            if (! (c & 0xFF80)) // U+0000 - U+007F
+              mSize += 1;
+            else if (! (c & 0xF800)) // U+0080 - U+07FF
+              mSize += 2;
+            else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
+              mSize += 3;
+            else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+              {
+                ++p; // step onto the unit that should be the low surrogate
+                if (p == end)
+                  {
+                    NS_ERROR("Surrogate pair split between fragments");
+                    return N; // the lone high surrogate is not counted
+                  }
+                c = *p;
+
+                if (0xDC00 == (0xFC00 & c))
+                  mSize += 4; // a valid pair becomes one four-byte sequence
+                else
+                  NS_ERROR("got a high Surrogate but no low surrogate");
+              }
+            else // U+DC00 - U+DFFF
+              NS_ERROR("got a low Surrogate but no high surrogate");
+          }
+
+        return N; // the whole fragment is always reported as consumed
+      }
+
+    private:
+      size_t mSize;
+  };
+
 #endif /* !defined(nsUTF8Utils_h_) */
--- a/xpcom/string/src/nsReadableUtils.cpp
+++ b/xpcom/string/src/nsReadableUtils.cpp
@ -245,24 +245,24 @@ NS_COM
 char*
 ToNewUTF8String( const nsAString& aSource )
  {
-    // XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
-    // refactored so that we can use it here without a double-copy.
-    NS_ConvertUCS2toUTF8 temp(aSource);
+    // Returns a null-terminated UTF-8 copy of aSource in a buffer obtained
+    // from nsMemory::Alloc, or null if that allocation fails.  First pass:
+    // measure how many UTF-8 bytes the conversion will need.
+    nsAString::const_iterator start, end;
+    CalculateUTF8Size calculator;
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                calculator);

-    char* result;
-    if (temp.GetOwnsBuffer()) {
-      // We allocated. Trick the string into not freeing its buffer to
-      // avoid an extra allocation.
-      result = temp.mStr;
+    char *result = NS_STATIC_CAST(char*,
+        nsMemory::Alloc(calculator.Size() + 1));
+    if (!result)
+      // Out of memory: bail out rather than converting through a null pointer.
+      return 0;

-      temp.mStr=0;
-      temp.SetOwnsBuffer(PR_FALSE);
-    }
-    else {
-      // We didn't allocate a buffer, so we need to copy it out of the
-      // nsCAutoString's storage.
-      result = ToNewCString(temp);
-    }
+    // Second pass: convert straight into the new buffer, then terminate it
+    // (the extra byte allocated above is for the terminator).
+    ConvertUCS2toUTF8 converter(result);
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                converter).write_terminator();
+    NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch");

    return result;
  }