Bug 421576 - Unpaired surrogate handled wrongly (Acid3 #68). acid3++ r=dbaron

2025-03-01 05:48:26 +00:00 · 2008-06-02 21:29:00 -04:00 · 2008-06-02 21:29:00 -04:00 · becc8f7cf3
commit becc8f7cf3
parent b1b8f07670
5 changed files with 298 additions and 96 deletions
--- a/layout/xul/base/src/nsMenuBarFrame.cpp
+++ b/layout/xul/base/src/nsMenuBarFrame.cpp
@ -244,10 +244,9 @@ nsMenuBarFrame::FindMenuWithShortcut(nsIDOMKeyEvent* aKeyEvent)
      current->GetAttr(kNameSpaceID_None, nsGkAtoms::accesskey, shortcutKey);
      if (!shortcutKey.IsEmpty()) {
        ToLowerCase(shortcutKey);
-        nsAutoString::const_iterator start, end;
-        shortcutKey.BeginReading(start);
-        shortcutKey.EndReading(end);
-        PRUint32 ch = UTF16CharEnumerator::NextChar(start, end);
+        const PRUnichar* start = shortcutKey.BeginReading();
+        const PRUnichar* end = shortcutKey.EndReading();
+        PRUint32 ch = UTF16CharEnumerator::NextChar(&start, end);
        PRUint32 index = accessKeys.IndexOf(ch);
        if (index != accessKeys.NoIndex &&
            (foundIndex == kNotFound || index < foundIndex)) {
--- a/xpcom/string/public/nsUTF8Utils.h
+++ b/xpcom/string/public/nsUTF8Utils.h
@ -335,13 +335,18 @@ public:
          {
            // Found a high surrogate followed by something other than
            // a low surrogate. Flag this as an error and return the
-            // Unicode replacement character 0xFFFD.
-
+            // Unicode replacement character 0xFFFD.  Note that the
+            // pointer to the next character points to the second 16-bit
+            // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
+            // only the first code unit of an illegal sequence must be
+            // treated as an illegally terminated code unit sequence
+            // (also Chapter 3 D91, "isolated [not paired and ill-formed]
+            // UTF-16 code units in the range D800..DFFF are ill-formed").
            NS_WARNING("got a High Surrogate but no low surrogate");

            if (err)
              *err = PR_TRUE;
-            *buffer = p;
+            *buffer = p - 1;
            return 0xFFFD;
          }
      }
@ -364,91 +369,6 @@ public:
      *err = PR_TRUE;
    return 0;
  }
-
-#ifdef MOZILLA_INTERNAL_API
-
-  static PRUint32 NextChar(nsAString::const_iterator& iter,
-                           const nsAString::const_iterator& end,
-                           PRBool *err = nsnull)
-  {
-    if (iter == end)
-      {
-        if (err)
-          *err = PR_TRUE;
-
-        return 0;
-      }
-
-    PRUnichar c = *iter++;
-
-    if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
-      {
-        if (err)
-          *err = PR_FALSE;
-        return c;
-      }
-    else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
-      {
-        if (iter == end)
-          {
-            // Found a high surrogate the end of the buffer. Flag this
-            // as an error and return the Unicode replacement
-            // character 0xFFFD.
-
-            NS_WARNING("Unexpected end of buffer after high surrogate");
-
-            if (err)
-              *err = PR_TRUE;
-            return 0xFFFD;
-          }
-
-        // D800- DBFF - High Surrogate
-        PRUnichar h = c;
-
-        c = *iter++;
-
-        if (NS_IS_LOW_SURROGATE(c))
-          {
-            // DC00- DFFF - Low Surrogate
-            // N = (H - D800) *400 + 10000 + ( L - DC00 )
-            PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c);
-            if (err)
-              *err = PR_FALSE;
-            return ucs4;
-          }
-        else
-          {
-            // Found a high surrogate followed by something other than
-            // a low surrogate. Flag this as an error and return the
-            // Unicode replacement character 0xFFFD.
-
-            NS_WARNING("got a High Surrogate but no low surrogate");
-
-            if (err)
-              *err = PR_TRUE;
-            return 0xFFFD;
-          }
-      }
-    else // U+DC00 - U+DFFF
-      {
-        // DC00- DFFF - Low Surrogate
-
-        // Found a low surrogate w/o a preceeding high surrogate. Flag
-        // this as an error and return the Unicode replacement
-        // character 0xFFFD.
-
-        NS_WARNING("got a low Surrogate but no high surrogate");
-
-        if (err)
-          *err = PR_TRUE;
-        return 0xFFFD;
-      }
-
-    if (err)
-      *err = PR_TRUE;
-    return 0;
-  }
-#endif // MOZILLA_INTERNAL_API
 };


@ -687,6 +607,15 @@ class ConvertUTF16toUTF8
                    *out++ = 0xBF;
                    *out++ = 0xBD;

+                    // The pointer to the next character points to the second
+                    // 16-bit value, not beyond it, as per Unicode 5.0.0
+                    // Chapter 3 C10, only the first code unit of an illegal
+                    // sequence must be treated as an illegally terminated
+                    // code unit sequence (also Chapter 3 D91, "isolated [not
+                    // paired and ill-formed] UTF-16 code units in the range
+                    // D800..DFFF are ill-formed").
+                    p--;
+
                    NS_WARNING("got a High Surrogate but no low surrogate");
                  }
              }
@ -768,6 +697,15 @@ class CalculateUTF8Size
                    // UTF-8)
                    mSize += 3;

+                    // The next code unit is the second 16-bit value, not
+                    // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
+                    // only the first code unit of an illegal sequence must
+                    // be treated as an illegally terminated code unit
+                    // sequence (also Chapter 3 D91, "isolated [not paired and
+                    // ill-formed] UTF-16 code units in the range D800..DFFF
+                    // are ill-formed").
+                    p--;
+
                    NS_WARNING("got a high Surrogate but no low surrogate");
                  }
              }
--- a/xpcom/tests/Makefile.in
+++ b/xpcom/tests/Makefile.in
@ -88,6 +88,7 @@ CPPSRCS += \
 		TestAtoms.cpp \
 		TestAutoLock.cpp \
 		TestCRT.cpp \
+		TestEncoding.cpp \
 		TestPermanentAtoms.cpp \
 		TestPipes.cpp \
 		TestThreads.cpp \
@ -146,6 +147,7 @@ CPP_UNIT_TESTS += \
  TestArray \
  TestAutoLock \
  TestCRT \
+  TestEncoding \
  TestExpirationTracker \
  TestPipes \
  TestProxies \
--- a/xpcom/tests/TestEncoding.cpp
+++ b/xpcom/tests/TestEncoding.cpp
@ -0,0 +1,232 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is
+ * Jeff Walden <jwalden+code@mit.edu>.
+ * Portions created by the Initial Developer are Copyright (C) 2008
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either of the GNU General Public License Version 2 or later (the "GPL"),
+ * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "TestHarness.h"
+
+/**
+ * Converts a UTF-16 string that starts with a well-formed surrogate pair to
+ * UTF-8 and checks the reported byte count, the exact output bytes, and that
+ * CompareUTF8toUTF16 considers the two encodings equal.
+ *
+ * @return NS_OK when every check passes, NS_ERROR_OUT_OF_MEMORY when the
+ *         conversion returns null, NS_ERROR_FAILURE when a check fails.
+ */
+nsresult TestGoodSurrogatePair()
+{
+  // When this string is decoded, the surrogate pair is U+10302 and the rest of
+  // the string is specified by indexes 2 onward.
+  const PRUnichar goodPairData[] = {  0xD800, 0xDF02, 0x65, 0x78, 0x0 };
+  nsDependentString goodPair16(goodPairData);
+
+  PRUint32 byteCount = 0;
+  char* goodPair8 = ToNewUTF8String(goodPair16, &byteCount);
+  if (!goodPair8)
+  {
+    fail("out of memory creating goodPair8");
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  if (byteCount != 6)
+  {
+    // byteCount is a PRUint32; widen it explicitly so the variadic argument
+    // really has the type that "%lu" expects.
+    fail("wrong number of bytes; expected 6, got %lu",
+         static_cast<unsigned long>(byteCount));
+    NS_Free(goodPair8);
+    return NS_ERROR_FAILURE;
+  }
+
+  // Character literals keep the initializer free of int-to-char narrowing.
+  const char expected8[] =
+    { '\xF0', '\x90', '\x8C', '\x82', 0x65, 0x78, 0x0 };
+
+  // The converted buffer is only needed for this comparison, so release it
+  // immediately; none of the later return paths can then leak it.
+  const int mismatch = memcmp(expected8, goodPair8, sizeof(expected8));
+  NS_Free(goodPair8);
+  if (0 != mismatch)
+  {
+    fail("wrong translation to UTF8");
+    return NS_ERROR_FAILURE;
+  }
+
+  // This takes a different code path from the above, so test it to make sure
+  // the UTF-16 enumeration remains in sync with the UTF-8 enumeration.
+  nsDependentCString expected(expected8);
+  if (0 != CompareUTF8toUTF16(expected, goodPair16))
+  {
+    fail("bad comparison between UTF-8 and equivalent UTF-16");
+    return NS_ERROR_FAILURE;
+  }
+
+  passed("TestGoodSurrogatePair");
+  return NS_OK;
+}
+
+/**
+ * Converts a UTF-16 string that starts with a low surrogate followed by a
+ * high surrogate to UTF-8 and checks that each of the two is replaced by
+ * U+FFFD: byte count, exact output bytes, and CompareUTF8toUTF16 equality.
+ *
+ * @return NS_OK when every check passes, NS_ERROR_OUT_OF_MEMORY when the
+ *         conversion returns null, NS_ERROR_FAILURE when a check fails.
+ */
+nsresult TestBackwardsSurrogatePair()
+{
+  // When this string is decoded, the two surrogates are wrongly ordered and
+  // must each be interpreted as U+FFFD.
+  const PRUnichar backwardsPairData[] = { 0xDDDD, 0xD863, 0x65, 0x78, 0x0 };
+  nsDependentString backwardsPair16(backwardsPairData);
+
+  PRUint32 byteCount = 0;
+  char* backwardsPair8 = ToNewUTF8String(backwardsPair16, &byteCount);
+  if (!backwardsPair8)
+  {
+    fail("out of memory creating backwardsPair8");
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  if (byteCount != 8)
+  {
+    // byteCount is a PRUint32; widen it explicitly so the variadic argument
+    // really has the type that "%lu" expects.
+    fail("wrong number of bytes; expected 8, got %lu",
+         static_cast<unsigned long>(byteCount));
+    NS_Free(backwardsPair8);
+    return NS_ERROR_FAILURE;
+  }
+
+  // Character literals keep the initializer free of int-to-char narrowing.
+  const char expected8[] =
+    { '\xEF', '\xBF', '\xBD', '\xEF', '\xBF', '\xBD', 0x65, 0x78, 0x0 };
+
+  // The converted buffer is only needed for this comparison, so release it
+  // immediately; none of the later return paths can then leak it.
+  const int mismatch = memcmp(expected8, backwardsPair8, sizeof(expected8));
+  NS_Free(backwardsPair8);
+  if (0 != mismatch)
+  {
+    fail("wrong translation to UTF8");
+    return NS_ERROR_FAILURE;
+  }
+
+  // This takes a different code path from the above, so test it to make sure
+  // the UTF-16 enumeration remains in sync with the UTF-8 enumeration.
+  nsDependentCString expected(expected8);
+  if (0 != CompareUTF8toUTF16(expected, backwardsPair16))
+  {
+    fail("bad comparison between UTF-8 and malformed but equivalent UTF-16");
+    return NS_ERROR_FAILURE;
+  }
+
+  passed("TestBackwardsSurrogatePair");
+  return NS_OK;
+}
+
+/**
+ * Converts a UTF-16 string that starts with an unpaired high surrogate to
+ * UTF-8 and checks that only that code unit is replaced by U+FFFD: byte
+ * count, exact output bytes, and CompareUTF8toUTF16 equality.
+ *
+ * @return NS_OK when every check passes, NS_ERROR_OUT_OF_MEMORY when the
+ *         conversion returns null, NS_ERROR_FAILURE when a check fails.
+ */
+nsresult TestMalformedUTF16OrphanHighSurrogate()
+{
+  // When this string is decoded, the high surrogate should be replaced and the
+  // rest of the string is specified by indexes 1 onward.
+  const PRUnichar highSurrogateData[] = { 0xD863, 0x74, 0x65, 0x78, 0x74, 0x0 };
+  nsDependentString highSurrogate16(highSurrogateData);
+
+  PRUint32 byteCount = 0;
+  char* highSurrogate8 = ToNewUTF8String(highSurrogate16, &byteCount);
+  if (!highSurrogate8)
+  {
+    fail("out of memory creating highSurrogate8");
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  if (byteCount != 7)
+  {
+    // byteCount is a PRUint32; widen it explicitly so the variadic argument
+    // really has the type that "%lu" expects.
+    fail("wrong number of bytes; expected 7, got %lu",
+         static_cast<unsigned long>(byteCount));
+    NS_Free(highSurrogate8);
+    return NS_ERROR_FAILURE;
+  }
+
+  // Character literals keep the initializer free of int-to-char narrowing.
+  const char expected8[] =
+    { '\xEF', '\xBF', '\xBD', 0x74, 0x65, 0x78, 0x74, 0x0 };
+
+  // The converted buffer is only needed for this comparison, so release it
+  // immediately; none of the later return paths can then leak it.
+  const int mismatch = memcmp(expected8, highSurrogate8, sizeof(expected8));
+  NS_Free(highSurrogate8);
+  if (0 != mismatch)
+  {
+    fail("wrong translation to UTF8");
+    return NS_ERROR_FAILURE;
+  }
+
+  // This takes a different code path from the above, so test it to make sure
+  // the UTF-16 enumeration remains in sync with the UTF-8 enumeration.
+  nsDependentCString expected(expected8);
+  if (0 != CompareUTF8toUTF16(expected, highSurrogate16))
+  {
+    fail("bad comparison between UTF-8 and malformed but equivalent UTF-16");
+    return NS_ERROR_FAILURE;
+  }
+
+  passed("TestMalformedUTF16OrphanHighSurrogate");
+  return NS_OK;
+}
+
+/**
+ * Converts a UTF-16 string that starts with an unpaired low surrogate to
+ * UTF-8 and checks that only that code unit is replaced by U+FFFD: byte
+ * count, exact output bytes, and CompareUTF8toUTF16 equality.
+ *
+ * @return NS_OK when every check passes, NS_ERROR_OUT_OF_MEMORY when the
+ *         conversion returns null, NS_ERROR_FAILURE when a check fails.
+ */
+nsresult TestMalformedUTF16OrphanLowSurrogate()
+{
+  // When this string is decoded, the low surrogate should be replaced and the
+  // rest of the string is specified by indexes 1 onward.
+  const PRUnichar lowSurrogateData[] = { 0xDDDD, 0x74, 0x65, 0x78, 0x74, 0x0 };
+  nsDependentString lowSurrogate16(lowSurrogateData);
+
+  PRUint32 byteCount = 0;
+  char* lowSurrogate8 = ToNewUTF8String(lowSurrogate16, &byteCount);
+  if (!lowSurrogate8)
+  {
+    fail("out of memory creating lowSurrogate8");
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  if (byteCount != 7)
+  {
+    // byteCount is a PRUint32; widen it explicitly so the variadic argument
+    // really has the type that "%lu" expects.
+    fail("wrong number of bytes; expected 7, got %lu",
+         static_cast<unsigned long>(byteCount));
+    NS_Free(lowSurrogate8);
+    return NS_ERROR_FAILURE;
+  }
+
+  // Character literals keep the initializer free of int-to-char narrowing.
+  const char expected8[] =
+    { '\xEF', '\xBF', '\xBD', 0x74, 0x65, 0x78, 0x74, 0x0 };
+
+  // The converted buffer is only needed for this comparison, so release it
+  // immediately; none of the later return paths can then leak it.
+  const int mismatch = memcmp(expected8, lowSurrogate8, sizeof(expected8));
+  NS_Free(lowSurrogate8);
+  if (0 != mismatch)
+  {
+    fail("wrong translation to UTF8");
+    return NS_ERROR_FAILURE;
+  }
+
+  // This takes a different code path from the above, so test it to make sure
+  // the UTF-16 enumeration remains in sync with the UTF-8 enumeration.
+  nsDependentCString expected(expected8);
+  if (0 != CompareUTF8toUTF16(expected, lowSurrogate16))
+  {
+    fail("bad comparison between UTF-8 and malformed but equivalent UTF-16");
+    return NS_ERROR_FAILURE;
+  }
+
+  passed("TestMalformedUTF16OrphanLowSurrogate");
+  return NS_OK;
+}
+
+
+/**
+ * Test driver: brings up XPCOM for the lifetime of the run, executes every
+ * encoding test, and reports failure through the process exit code.
+ *
+ * @return 0 when all tests succeed, 1 when XPCOM start-up or any test fails.
+ */
+int main(int argc, char** argv)
+{
+  ScopedXPCOM xpcom("TestEncoding");
+  if (xpcom.failed())
+    return 1;
+
+  // Tests are listed in execution order.
+  typedef nsresult (*TestFunc)();
+  static const TestFunc kTests[] = {
+    TestGoodSurrogatePair,
+    TestBackwardsSurrogatePair,
+    TestMalformedUTF16OrphanHighSurrogate,
+    TestMalformedUTF16OrphanLowSurrogate
+  };
+
+  // Every test runs even after an earlier one has failed, so a single run
+  // reports all failures.
+  int exitCode = 0;
+  for (unsigned int i = 0; i < sizeof(kTests) / sizeof(kTests[0]); ++i)
+  {
+    if (NS_FAILED(kTests[i]()))
+      exitCode = 1;
+  }
+
+  return exitCode;
+}
--- a/xpcom/tests/TestHarness.h
+++ b/xpcom/tests/TestHarness.h
@ -37,18 +37,49 @@

 /*
 * Test harness for XPCOM objects, providing a scoped XPCOM initializer,
- * nsCOMPtr, nsRefPtr, do_CreateInstance, and stdio.h/stdlib.h.
+ * nsCOMPtr, nsRefPtr, do_CreateInstance, do_GetService, ns(Auto|C|)String,
+ * and stdio.h/stdlib.h.
 */

 #ifndef TestHarness_h__
 #define TestHarness_h__

-#include "nsIServiceManager.h"
 #include "nsComponentManagerUtils.h"
+#include "nsServiceManagerUtils.h"
 #include "nsCOMPtr.h"
 #include "nsAutoPtr.h"
+#include "nsStringGlue.h"
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdarg.h>
+
+/**
+ * Prints the given failure message and arguments using printf, prepending
+ * "FAIL " for the benefit of the test harness and appending "\n" to eliminate
+ * having to type it at each call site.
+ *
+ * Declared inline because the definition lives in a header: a test program
+ * that includes this header from more than one source file would otherwise
+ * fail to link with a multiple-definition error.
+ *
+ * @param msg printf-style format string; the remaining arguments are the
+ *            values it refers to
+ */
+inline void fail(const char* msg, ...)
+{
+  va_list ap;
+
+  printf("FAIL ");
+
+  va_start(ap, msg);
+  vprintf(msg, ap);
+  va_end(ap);
+
+  putchar('\n');
+}
+
+/**
+ * Prints the given string followed by " PASSED!\n", to be used at the end
+ * of a successful test function.
+ *
+ * Declared inline because the definition lives in a header: a test program
+ * that includes this header from more than one source file would otherwise
+ * fail to link with a multiple-definition error.
+ *
+ * @param test name of the test that succeeded
+ */
+inline void passed(const char* test)
+{
+  printf("%s PASSED!\n", test);
+}
+

 class ScopedXPCOM
 {
@ -62,7 +93,7 @@ class ScopedXPCOM
      nsresult rv = NS_InitXPCOM2(&mServMgr, NULL, dirSvcProvider);
      if (NS_FAILED(rv))
      {
-        printf("FAIL NS_InitXPCOM2 returned failure code %x\n", rv);
+        fail("NS_InitXPCOM2 returned failure code 0x%x", rv);
        mServMgr = NULL;
      }
    }
@ -75,7 +106,7 @@ class ScopedXPCOM
        nsresult rv = NS_ShutdownXPCOM(NULL);
        if (NS_FAILED(rv))
        {
-          printf("FAIL XPCOM shutdown failed with code %x\n", rv);
+          fail("XPCOM shutdown failed with code 0x%x", rv);
          exit(1);
        }
      }