Move the core of NS_ConvertUCS2toUTF8 into character sinks in nsUTF8Utils.h, and use them to make ToNewUTF8String faster. Fix bug in surrogate handling in the moved code. Fix null-termination bug in UTF8ToNewUnicode. b=206682 r=jag sr=alecf a=brendan

2025-01-26 14:46:02 +00:00 · 2003-05-22 21:25:43 +00:00 · 2003-05-22 21:25:43 +00:00 · 06133b6d3c
commit 06133b6d3c
parent f6ad24ed15
10 changed files with 424 additions and 258 deletions
--- a/string/obsolete/nsString.cpp
+++ b/string/obsolete/nsString.cpp
@ -45,6 +45,7 @@
 #include "nsString.h"
 #include "nsReadableUtils.h"
 #include "nsDebug.h"
+#include "nsUTF8Utils.h"

 #ifndef nsCharTraits_h___
 #include "nsCharTraits.h"
@ -54,8 +55,10 @@
 #include "prdtoa.h"
 #endif

+#ifdef DEBUG
 static const char* kPossibleNull = "Error: possible unintended null in string";
 static const char* kNullPointerError = "Error: unexpected null ptr";
+#endif
 static const char* kWhitespace="\b\t\r\n ";

 const nsBufferHandle<char>*
@ -1084,111 +1087,47 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P

 //----------------------------------------------------------------------

-NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString )
  {
-    nsAString::const_iterator start; aString.BeginReading(start);
-    nsAString::const_iterator end;   aString.EndReading(end);
-    
-    while (start != end) {
-      nsReadableFragment<PRUnichar> frag(start.fragment());
-      Append(frag.mStart, frag.mEnd - frag.mStart);
-      start.advance(start.size_forward());
-    }
+    if (!aString)
+      // Leave us as an uninitialized nsCAutoString.
+      return;
+    Init(nsDependentString(aString));
  }

-void
-NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength )
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
  {
-    // Handle null string by just leaving us as a brand-new
-    // uninitialized nsCAutoString.
-    if (! aString)
+    if (!aString)
+      // Leave us as an uninitialized nsCAutoString.
      return;
+    Init(Substring(aString, aString + aLength));
+  }

-    // Calculate how many bytes we need
-    const PRUnichar* p;
-    PRInt32 count, utf8len;
-    for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++)
-      {
-        if (! ((*p) & 0xFF80))
-          utf8len += 1; // 0000 0000 - 0000 007F
-        else if (! ((*p) & 0xF800))
-          utf8len += 2; // 0000 0080 - 0000 07FF
-        else 
-          utf8len += 3; // 0000 0800 - 0000 FFFF
-        // Note: Surrogate pair needs 4 bytes, but in this calcuation
-        // we count it as 6 bytes. It will waste 2 bytes per surrogate pair
+void NS_ConvertUCS2toUTF8::Init( const nsAString& aString )
+  {
+    // Compute space required: do this once so we don't incur multiple
+    // allocations. This "optimization" is probably of dubious value...
+
+    nsAString::const_iterator start, end;
+    CalculateUTF8Size calculator;
+    copy_string(aString.BeginReading(start), aString.EndReading(end), calculator);
+
+    PRUint32 count = calculator.Size();
+
+    if (count) {
+      // Grow the buffer if we need to.
+      SetLength(count);
+
+      // All ready? Time to convert
+
+      ConvertUCS2toUTF8 converter(mStr);
+      copy_string(aString.BeginReading(start), aString.EndReading(end), converter);
+      mLength = converter.Size();
+      if (mLength != count) {
+        NS_ERROR("Input invalid or incorrect length was calculated");
+        Truncate();
      }
-
-    // Make sure our buffer's big enough, so we don't need to do
-    // multiple allocations.
-    if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer))
-      SetCapacity(mLength+utf8len+1);
-    // |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h),
-    //  we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want
-
-    char* out = mStr+mLength;
-    PRUint32 ucs4=0;
-
-    for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++)
-      {
-        if (0 == ucs4)
-          {
-            if (! ((*p) & 0xFF80))
-              {
-                *out++ = (char)*p;
-              } 
-            else if (! ((*p) & 0xF800))
-              {
-                *out++ = 0xC0 | (char)((*p) >> 6);
-                *out++ = 0x80 | (char)(0x003F & (*p));
-              }
-            else
-              {
-                if (0xD800 == (0xFC00 & (*p))) 
-                  {
-                    // D800- DBFF - High Surrogate 
-                    // N = (H- D800) *400 + 10000 + ...
-                    ucs4 = 0x10000 | ((0x03FF & (*p)) << 10);
-                  }
-                else if (0xDC00 == (0xFC00 & (*p)))
-                  { 
-                    // DC00- DFFF - Low Surrogate 
-                    // error here. We should hit High Surrogate first
-                    // Do not output any thing in this case
-                  }
-                else
-                  {
-                    *out++ = 0xE0 | (char)((*p) >> 12);
-                    *out++ = 0x80 | (char)(0x003F & (*p >> 6));
-                    *out++ = 0x80 | (char)(0x003F & (*p) );
-                  }
-              }
-          }
-        else
-          {
-            if (0xDC00 == (0xFC00 & (*p)))
-              { 
-                // DC00- DFFF - Low Surrogate 
-                // N += ( L - DC00 )  
-                ucs4 |= (0x03FF & (*p));
-
-                // 0001 0000-001F FFFF
-                *out++ = 0xF0 | (char)(ucs4 >> 18);
-                *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
-                *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
-                *out++ = 0x80 | (char)(0x003F & ucs4) ;
-              }
-            else
-              {
-                // Got a High Surrogate but no low surrogate
-                // output nothing.
-              }
-            ucs4 = 0;
-          }
-      }
-
-    *out = '\0'; // null terminate
-    mLength += utf8len;
+    }
  }

 NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString )
--- a/string/obsolete/nsString.h
+++ b/string/obsolete/nsString.h
@ -431,24 +431,15 @@ class NS_COM NS_ConvertUCS2toUTF8
    */
  {
    public:
-      friend NS_COM char* ToNewUTF8String( const nsAString& aSource );
-
-    public:
-      explicit
-      NS_ConvertUCS2toUTF8( const PRUnichar* aString )
+      explicit NS_ConvertUCS2toUTF8( const PRUnichar* aString );
+      NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength );
+      explicit NS_ConvertUCS2toUTF8( const nsAString& aString )
        {
-          Append( aString, ~PRUint32(0) /* MAXINT */);
+          Init(aString);
        }

-      NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
-        {
-          Append( aString, aLength );
-        }
-
-      explicit NS_ConvertUCS2toUTF8( const nsAString& aString );
-
    protected:
-      void Append( const PRUnichar* aString, PRUint32 aLength );
+      void Init( const nsAString& aString );

    private:
        // NOT TO BE IMPLEMENTED
--- a/string/obsolete/nsString2.cpp
+++ b/string/obsolete/nsString2.cpp
@ -54,8 +54,10 @@
 #include "prdtoa.h"
 #endif

+#ifdef DEBUG
 static const char* kPossibleNull = "Error: possible unintended null in string";
 static const char* kNullPointerError = "Error: unexpected null ptr";
+#endif
 static const char* kWhitespace="\b\t\r\n ";

 const nsBufferHandle<PRUnichar>*
--- a/string/public/nsUTF8Utils.h
+++ b/string/public/nsUTF8Utils.h
@ -54,6 +54,10 @@ class UTF8traits
 #define PLANE1_BASE           0x00010000  
 #define UCS2_REPLACEMENT_CHAR 0xfffd     

+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for converting
+ * UTF-8 to UCS2 (really UTF-16).
+ */
 class ConvertUTF8toUCS2
  {
    public:
@ -181,12 +185,21 @@ class ConvertUTF8toUCS2
        return p - start;
      }

+    void write_terminator()
+      {
+        *mBuffer = buffer_type(0);
+      }
+
    private:
      buffer_type* mStart;
      buffer_type* mBuffer;
      PRBool mErrorEncountered;
  };

+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for computing
+ * the length of a UTF-8 string.
+ */
 class CalculateUTF8Length
  {
    public:
@ -242,4 +255,148 @@ class CalculateUTF8Length
      PRBool mErrorEncountered;
  };

+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for converting
+ * UCS2 (really UTF-16) to UTF-8.
+ */
+class ConvertUCS2toUTF8
+  {
+    public:
+      typedef nsAString::char_type  value_type;
+      typedef nsACString::char_type buffer_type;
+
+    // The error handling here is more lenient than that in
+    // |ConvertUTF8toUCS2|, but it's that way for backwards
+    // compatibility.
+
+    ConvertUCS2toUTF8( buffer_type* aBuffer )
+        : mStart(aBuffer), mBuffer(aBuffer) {}
+
+    size_t Size() const { return mBuffer - mStart; }
+
+    PRUint32 write( const value_type* start, PRUint32 N )
+      {
+        for (const value_type *p = start, *end = start + N; p < end; ++p )
+          {
+            value_type c = *p;
+            if (! (c & 0xFF80)) // U+0000 - U+007F
+              {
+                *mBuffer++ = (char)c;
+              } 
+            else if (! (c & 0xF800)) // U+0080 - U+07FF
+              {
+                *mBuffer++ = 0xC0 | (char)(c >> 6);
+                *mBuffer++ = 0x80 | (char)(0x003F & c);
+              }
+            else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+              {
+                // D800 - DBFF - High Surrogate
+                // N = (H - 0xD800) * 0x400 + 0x10000 + ...
+                PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
+
+                ++p;
+                if (p == end)
+                  {
+                    NS_ERROR("Surrogate pair split between fragments");
+                    return N;
+                  }
+                c = *p;
+
+                if (0xDC00 == (0xFC00 & c))
+                  { 
+                    // DC00 - DFFF - Low Surrogate
+                    // N += (L - 0xDC00)
+                    ucs4 |= (0x03FF & c);
+
+                    // U+10000 - U+10FFFF: four-byte UTF-8 sequence
+                    *mBuffer++ = 0xF0 | (char)(ucs4 >> 18);
+                    *mBuffer++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
+                    *mBuffer++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
+                    *mBuffer++ = 0x80 | (char)(0x003F & ucs4) ;
+                  }
+                else
+                  {
+                    NS_ERROR("got a High Surrogate but no low surrogate");
+                    // output nothing; the unit just read is dropped as well.
+                  }
+              }
+            else if (0xDC00 == (0xFC00 & c)) // U+DC00 - U+DFFF
+              { 
+                // DC00 - DFFF - Low Surrogate
+                NS_ERROR("got a low Surrogate but no high surrogate");
+                // output nothing.
+              }
+            else // U+0800 - U+D7FF, U+E000 - U+FFFF
+              {
+                *mBuffer++ = 0xE0 | (char)(c >> 12);
+                *mBuffer++ = 0x80 | (char)(0x003F & (c >> 6));
+                *mBuffer++ = 0x80 | (char)(0x003F & c );
+              }
+          }
+
+        return N;
+      }
+
+    void write_terminator()
+      {
+        *mBuffer = buffer_type(0);
+      }
+
+    private:
+      buffer_type* mStart;
+      buffer_type* mBuffer;
+  };
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for computing
+ * the number of bytes a UCS2 (really UTF-16) string would occupy in UTF-8.
+ */
+class CalculateUTF8Size
+  {
+    public:
+      typedef nsAString::char_type value_type;
+
+    CalculateUTF8Size()
+      : mSize(0) { }
+
+    size_t Size() const { return mSize; }
+
+    PRUint32 write( const value_type* start, PRUint32 N )
+      {
+        // Assume surrogate pairs won't be spread across fragments; a high surrogate ending a fragment is not counted.
+        for (const value_type *p = start, *end = start + N; p < end; ++p )
+          {
+            value_type c = *p;
+            if (! (c & 0xFF80)) // U+0000 - U+007F
+              mSize += 1;
+            else if (! (c & 0xF800)) // U+0080 - U+07FF
+              mSize += 2;
+            else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+              {
+                ++p;
+                if (p == end)
+                  {
+                    NS_ERROR("Surrogate pair split between fragments");
+                    return N;
+                  }
+                c = *p;
+
+                if (0xDC00 == (0xFC00 & c))
+                  mSize += 4;
+                else
+                  NS_ERROR("got a high Surrogate but no low surrogate");
+              }
+            else if (0xDC00 == (0xFC00 & c)) // U+DC00 - U+DFFF
+              NS_ERROR("got a low Surrogate but no high surrogate");
+            else // U+0800 - U+D7FF, U+E000 - U+FFFF
+              mSize += 3;
+          }
+
+        return N;
+      }
+
+    private:
+      size_t mSize;
+  };
+
 #endif /* !defined(nsUTF8Utils_h_) */
--- a/string/src/nsReadableUtils.cpp
+++ b/string/src/nsReadableUtils.cpp
@ -209,24 +209,18 @@ NS_COM
 char*
 ToNewUTF8String( const nsAString& aSource )
  {
-    // XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
-    // refactored so that we can use it here without a double-copy.
-    NS_ConvertUCS2toUTF8 temp(aSource);
+    nsAString::const_iterator start, end;
+    CalculateUTF8Size calculator;
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                calculator);

-    char* result;
-    if (temp.GetOwnsBuffer()) {
-      // We allocated. Trick the string into not freeing its buffer to
-      // avoid an extra allocation.
-      result = temp.mStr;
+    char *result = NS_STATIC_CAST(char*,
+        nsMemory::Alloc(calculator.Size() + 1));

-      temp.mStr=0;
-      temp.SetOwnsBuffer(PR_FALSE);
-    }
-    else {
-      // We didn't allocate a buffer, so we need to copy it out of the
-      // nsCAutoString's storage.
-      result = ToNewCString(temp);
-    }
+    ConvertUCS2toUTF8 converter(result);
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                converter).write_terminator();
+    NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch");

    return result;
  }
@ -285,7 +279,7 @@ UTF8ToNewUnicode( const nsACString& aSource )

    ConvertUTF8toUCS2 converter(result);
    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
-                converter);
+                converter).write_terminator();
    NS_ASSERTION(calculator.Length() == converter.Length(), "length mismatch");

    return result;
--- a/xpcom/string/obsolete/nsString.cpp
+++ b/xpcom/string/obsolete/nsString.cpp
@ -45,6 +45,7 @@
 #include "nsString.h"
 #include "nsReadableUtils.h"
 #include "nsDebug.h"
+#include "nsUTF8Utils.h"

 #ifndef nsCharTraits_h___
 #include "nsCharTraits.h"
@ -54,8 +55,10 @@
 #include "prdtoa.h"
 #endif

+#ifdef DEBUG
 static const char* kPossibleNull = "Error: possible unintended null in string";
 static const char* kNullPointerError = "Error: unexpected null ptr";
+#endif
 static const char* kWhitespace="\b\t\r\n ";

 const nsBufferHandle<char>*
@ -1084,111 +1087,47 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P

 //----------------------------------------------------------------------

-NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString )
  {
-    nsAString::const_iterator start; aString.BeginReading(start);
-    nsAString::const_iterator end;   aString.EndReading(end);
-    
-    while (start != end) {
-      nsReadableFragment<PRUnichar> frag(start.fragment());
-      Append(frag.mStart, frag.mEnd - frag.mStart);
-      start.advance(start.size_forward());
-    }
+    if (!aString)
+      // Leave us as an uninitialized nsCAutoString.
+      return;
+    Init(nsDependentString(aString));
  }

-void
-NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength )
+NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
  {
-    // Handle null string by just leaving us as a brand-new
-    // uninitialized nsCAutoString.
-    if (! aString)
+    if (!aString)
+      // Leave us as an uninitialized nsCAutoString.
      return;
+    Init(Substring(aString, aString + aLength));
+  }

-    // Calculate how many bytes we need
-    const PRUnichar* p;
-    PRInt32 count, utf8len;
-    for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++)
-      {
-        if (! ((*p) & 0xFF80))
-          utf8len += 1; // 0000 0000 - 0000 007F
-        else if (! ((*p) & 0xF800))
-          utf8len += 2; // 0000 0080 - 0000 07FF
-        else 
-          utf8len += 3; // 0000 0800 - 0000 FFFF
-        // Note: Surrogate pair needs 4 bytes, but in this calcuation
-        // we count it as 6 bytes. It will waste 2 bytes per surrogate pair
+void NS_ConvertUCS2toUTF8::Init( const nsAString& aString )
+  {
+    // Compute space required: do this once so we don't incur multiple
+    // allocations. This "optimization" is probably of dubious value...
+
+    nsAString::const_iterator start, end;
+    CalculateUTF8Size calculator;
+    copy_string(aString.BeginReading(start), aString.EndReading(end), calculator);
+
+    PRUint32 count = calculator.Size();
+
+    if (count) {
+      // Grow the buffer if we need to.
+      SetLength(count);
+
+      // All ready? Time to convert
+
+      ConvertUCS2toUTF8 converter(mStr);
+      copy_string(aString.BeginReading(start), aString.EndReading(end), converter);
+      mLength = converter.Size();
+      if (mLength != count) {
+        NS_ERROR("Input invalid or incorrect length was calculated");
+        Truncate();
      }
-
-    // Make sure our buffer's big enough, so we don't need to do
-    // multiple allocations.
-    if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer))
-      SetCapacity(mLength+utf8len+1);
-    // |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h),
-    //  we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want
-
-    char* out = mStr+mLength;
-    PRUint32 ucs4=0;
-
-    for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++)
-      {
-        if (0 == ucs4)
-          {
-            if (! ((*p) & 0xFF80))
-              {
-                *out++ = (char)*p;
-              } 
-            else if (! ((*p) & 0xF800))
-              {
-                *out++ = 0xC0 | (char)((*p) >> 6);
-                *out++ = 0x80 | (char)(0x003F & (*p));
-              }
-            else
-              {
-                if (0xD800 == (0xFC00 & (*p))) 
-                  {
-                    // D800- DBFF - High Surrogate 
-                    // N = (H- D800) *400 + 10000 + ...
-                    ucs4 = 0x10000 | ((0x03FF & (*p)) << 10);
-                  }
-                else if (0xDC00 == (0xFC00 & (*p)))
-                  { 
-                    // DC00- DFFF - Low Surrogate 
-                    // error here. We should hit High Surrogate first
-                    // Do not output any thing in this case
-                  }
-                else
-                  {
-                    *out++ = 0xE0 | (char)((*p) >> 12);
-                    *out++ = 0x80 | (char)(0x003F & (*p >> 6));
-                    *out++ = 0x80 | (char)(0x003F & (*p) );
-                  }
-              }
-          }
-        else
-          {
-            if (0xDC00 == (0xFC00 & (*p)))
-              { 
-                // DC00- DFFF - Low Surrogate 
-                // N += ( L - DC00 )  
-                ucs4 |= (0x03FF & (*p));
-
-                // 0001 0000-001F FFFF
-                *out++ = 0xF0 | (char)(ucs4 >> 18);
-                *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
-                *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
-                *out++ = 0x80 | (char)(0x003F & ucs4) ;
-              }
-            else
-              {
-                // Got a High Surrogate but no low surrogate
-                // output nothing.
-              }
-            ucs4 = 0;
-          }
-      }
-
-    *out = '\0'; // null terminate
-    mLength += utf8len;
+    }
  }

 NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString )
--- a/xpcom/string/obsolete/nsString.h
+++ b/xpcom/string/obsolete/nsString.h
@ -431,24 +431,15 @@ class NS_COM NS_ConvertUCS2toUTF8
    */
  {
    public:
-      friend NS_COM char* ToNewUTF8String( const nsAString& aSource );
-
-    public:
-      explicit
-      NS_ConvertUCS2toUTF8( const PRUnichar* aString )
+      explicit NS_ConvertUCS2toUTF8( const PRUnichar* aString );
+      NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength );
+      explicit NS_ConvertUCS2toUTF8( const nsAString& aString )
        {
-          Append( aString, ~PRUint32(0) /* MAXINT */);
+          Init(aString);
        }

-      NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
-        {
-          Append( aString, aLength );
-        }
-
-      explicit NS_ConvertUCS2toUTF8( const nsAString& aString );
-
    protected:
-      void Append( const PRUnichar* aString, PRUint32 aLength );
+      void Init( const nsAString& aString );

    private:
        // NOT TO BE IMPLEMENTED
--- a/xpcom/string/obsolete/nsString2.cpp
+++ b/xpcom/string/obsolete/nsString2.cpp
@ -54,8 +54,10 @@
 #include "prdtoa.h"
 #endif

+#ifdef DEBUG
 static const char* kPossibleNull = "Error: possible unintended null in string";
 static const char* kNullPointerError = "Error: unexpected null ptr";
+#endif
 static const char* kWhitespace="\b\t\r\n ";

 const nsBufferHandle<PRUnichar>*
--- a/xpcom/string/public/nsUTF8Utils.h
+++ b/xpcom/string/public/nsUTF8Utils.h
@ -54,6 +54,10 @@ class UTF8traits
 #define PLANE1_BASE           0x00010000  
 #define UCS2_REPLACEMENT_CHAR 0xfffd     

+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for converting
+ * UTF-8 to UCS2 (really UTF-16).
+ */
 class ConvertUTF8toUCS2
  {
    public:
@ -181,12 +185,21 @@ class ConvertUTF8toUCS2
        return p - start;
      }

+    void write_terminator()
+      {
+        *mBuffer = buffer_type(0);
+      }
+
    private:
      buffer_type* mStart;
      buffer_type* mBuffer;
      PRBool mErrorEncountered;
  };

+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for computing
+ * the length of a UTF-8 string.
+ */
 class CalculateUTF8Length
  {
    public:
@ -242,4 +255,148 @@ class CalculateUTF8Length
      PRBool mErrorEncountered;
  };

+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for converting
+ * UCS2 (really UTF-16) to UTF-8.
+ */
+class ConvertUCS2toUTF8
+  {
+    public:
+      typedef nsAString::char_type  value_type;
+      typedef nsACString::char_type buffer_type;
+
+    // The error handling here is more lenient than that in
+    // |ConvertUTF8toUCS2|, but it's that way for backwards
+    // compatibility.
+
+    ConvertUCS2toUTF8( buffer_type* aBuffer )
+        : mStart(aBuffer), mBuffer(aBuffer) {}
+
+    size_t Size() const { return mBuffer - mStart; }
+
+    PRUint32 write( const value_type* start, PRUint32 N )
+      {
+        for (const value_type *p = start, *end = start + N; p < end; ++p )
+          {
+            value_type c = *p;
+            if (! (c & 0xFF80)) // U+0000 - U+007F
+              {
+                *mBuffer++ = (char)c;
+              } 
+            else if (! (c & 0xF800)) // U+0080 - U+07FF
+              {
+                *mBuffer++ = 0xC0 | (char)(c >> 6);
+                *mBuffer++ = 0x80 | (char)(0x003F & c);
+              }
+            else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+              {
+                // D800 - DBFF - High Surrogate
+                // N = (H - 0xD800) * 0x400 + 0x10000 + ...
+                PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
+
+                ++p;
+                if (p == end)
+                  {
+                    NS_ERROR("Surrogate pair split between fragments");
+                    return N;
+                  }
+                c = *p;
+
+                if (0xDC00 == (0xFC00 & c))
+                  { 
+                    // DC00 - DFFF - Low Surrogate
+                    // N += (L - 0xDC00)
+                    ucs4 |= (0x03FF & c);
+
+                    // U+10000 - U+10FFFF: four-byte UTF-8 sequence
+                    *mBuffer++ = 0xF0 | (char)(ucs4 >> 18);
+                    *mBuffer++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
+                    *mBuffer++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
+                    *mBuffer++ = 0x80 | (char)(0x003F & ucs4) ;
+                  }
+                else
+                  {
+                    NS_ERROR("got a High Surrogate but no low surrogate");
+                    // output nothing; the unit just read is dropped as well.
+                  }
+              }
+            else if (0xDC00 == (0xFC00 & c)) // U+DC00 - U+DFFF
+              { 
+                // DC00 - DFFF - Low Surrogate
+                NS_ERROR("got a low Surrogate but no high surrogate");
+                // output nothing.
+              }
+            else // U+0800 - U+D7FF, U+E000 - U+FFFF
+              {
+                *mBuffer++ = 0xE0 | (char)(c >> 12);
+                *mBuffer++ = 0x80 | (char)(0x003F & (c >> 6));
+                *mBuffer++ = 0x80 | (char)(0x003F & c );
+              }
+          }
+
+        return N;
+      }
+
+    void write_terminator()
+      {
+        *mBuffer = buffer_type(0);
+      }
+
+    private:
+      buffer_type* mStart;
+      buffer_type* mBuffer;
+  };
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for computing
+ * the number of bytes a UCS2 (really UTF-16) string would occupy in UTF-8.
+ */
+class CalculateUTF8Size
+  {
+    public:
+      typedef nsAString::char_type value_type;
+
+    CalculateUTF8Size()
+      : mSize(0) { }
+
+    size_t Size() const { return mSize; }
+
+    PRUint32 write( const value_type* start, PRUint32 N )
+      {
+        // Assume surrogate pairs won't be spread across fragments; a high surrogate ending a fragment is not counted.
+        for (const value_type *p = start, *end = start + N; p < end; ++p )
+          {
+            value_type c = *p;
+            if (! (c & 0xFF80)) // U+0000 - U+007F
+              mSize += 1;
+            else if (! (c & 0xF800)) // U+0080 - U+07FF
+              mSize += 2;
+            else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+              {
+                ++p;
+                if (p == end)
+                  {
+                    NS_ERROR("Surrogate pair split between fragments");
+                    return N;
+                  }
+                c = *p;
+
+                if (0xDC00 == (0xFC00 & c))
+                  mSize += 4;
+                else
+                  NS_ERROR("got a high Surrogate but no low surrogate");
+              }
+            else if (0xDC00 == (0xFC00 & c)) // U+DC00 - U+DFFF
+              NS_ERROR("got a low Surrogate but no high surrogate");
+            else // U+0800 - U+D7FF, U+E000 - U+FFFF
+              mSize += 3;
+          }
+
+        return N;
+      }
+
+    private:
+      size_t mSize;
+  };
+
 #endif /* !defined(nsUTF8Utils_h_) */
--- a/xpcom/string/src/nsReadableUtils.cpp
+++ b/xpcom/string/src/nsReadableUtils.cpp
@ -209,24 +209,18 @@ NS_COM
 char*
 ToNewUTF8String( const nsAString& aSource )
  {
-    // XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
-    // refactored so that we can use it here without a double-copy.
-    NS_ConvertUCS2toUTF8 temp(aSource);
+    nsAString::const_iterator start, end;
+    CalculateUTF8Size calculator;
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                calculator);

-    char* result;
-    if (temp.GetOwnsBuffer()) {
-      // We allocated. Trick the string into not freeing its buffer to
-      // avoid an extra allocation.
-      result = temp.mStr;
+    char *result = NS_STATIC_CAST(char*,
+        nsMemory::Alloc(calculator.Size() + 1));

-      temp.mStr=0;
-      temp.SetOwnsBuffer(PR_FALSE);
-    }
-    else {
-      // We didn't allocate a buffer, so we need to copy it out of the
-      // nsCAutoString's storage.
-      result = ToNewCString(temp);
-    }
+    ConvertUCS2toUTF8 converter(result);
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                converter).write_terminator();
+    NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch");

    return result;
  }
@ -285,7 +279,7 @@ UTF8ToNewUnicode( const nsACString& aSource )

    ConvertUTF8toUCS2 converter(result);
    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
-                converter);
+                converter).write_terminator();
    NS_ASSERTION(calculator.Length() == converter.Length(), "length mismatch");

    return result;