Bug 1561573 - Avoid linearization and inflation to UTF-16 of the string input to TextEncoder. r=jandem,bzbarsky

Differential Revision: https://phabricator.services.mozilla.com/D44121

--HG--
extra : moz-landing-system : lando
2024-10-10 03:45:46 +00:00 · 2019-09-18 08:26:52 +00:00 · 2019-09-18 08:26:52 +00:00 · 75e563c076
commit 75e563c076
parent 6c949a612d
5 changed files with 220 additions and 30 deletions
--- a/dom/encoding/TextEncoder.cpp
+++ b/dom/encoding/TextEncoder.cpp
@ -13,33 +13,39 @@ namespace mozilla {
 namespace dom {

 void TextEncoder::Encode(JSContext* aCx, JS::Handle<JSObject*> aObj,
-                         const nsAString& aString,
+                         JS::Handle<JSString*> aString,
                         JS::MutableHandle<JSObject*> aRetval,
                         ErrorResult& aRv) {
-  // Given nsTSubstring<char16_t>::kMaxCapacity, it should be
-  // impossible for the length computation to overflow, but
-  // let's use checked math in case someone changes something
-  // in the future.
+  CheckedInt<size_t> bufLen(JS::GetStringLength(aString));
+  bufLen *= 3;  // from the contract for JS_EncodeStringToUTF8BufferPartial
  // Uint8Array::Create takes uint32_t as the length.
-  CheckedInt<uint32_t> bufLen(aString.Length());
-  bufLen *= 3;  // from the contract for ConvertUTF16toUTF8
-  if (!bufLen.isValid()) {
+  if (!bufLen.isValid() || bufLen.value() > UINT32_MAX) {
    aRv.Throw(NS_ERROR_OUT_OF_MEMORY);
    return;
  }

+  // TODO: Avoid malloc and use a stack-allocated buffer if bufLen
+  // is small.
  auto data = mozilla::MakeUniqueFallible<uint8_t[]>(bufLen.value());
  if (!data) {
    aRv.Throw(NS_ERROR_OUT_OF_MEMORY);
    return;
  }

-  size_t utf8Len = ConvertUtf16toUtf8(
-      aString, MakeSpan(reinterpret_cast<char*>(data.get()), bufLen.value()));
-  MOZ_ASSERT(utf8Len <= bufLen.value());
+  size_t read;
+  size_t written;
+  auto maybe = JS_EncodeStringToUTF8BufferPartial(
+      aCx, aString, AsWritableChars(MakeSpan(data.get(), bufLen.value())));
+  if (!maybe) {
+    aRv.Throw(NS_ERROR_OUT_OF_MEMORY);
+    return;
+  }
+  Tie(read, written) = *maybe;
+  MOZ_ASSERT(written <= bufLen.value());
+  MOZ_ASSERT(read == JS::GetStringLength(aString));

  JSAutoRealm ar(aCx, aObj);
-  JSObject* outView = Uint8Array::Create(aCx, utf8Len, data.get());
+  JSObject* outView = Uint8Array::Create(aCx, written, data.get());
  if (!outView) {
    aRv.Throw(NS_ERROR_OUT_OF_MEMORY);
    return;
@ -48,18 +54,26 @@ void TextEncoder::Encode(JSContext* aCx, JS::Handle<JSObject*> aObj,
  aRetval.set(outView);
 }

-void TextEncoder::EncodeInto(const nsAString& aSrc, const Uint8Array& aDst,
-                             TextEncoderEncodeIntoResult& aResult) {
+void TextEncoder::EncodeInto(JSContext* aCx, JS::Handle<JSString*> aSrc,
+                             const Uint8Array& aDst,
+                             TextEncoderEncodeIntoResult& aResult,
+                             OOMReporter& aError) {
  aDst.ComputeLengthAndData();
  size_t read;
  size_t written;
-  Tie(read, written) = ConvertUtf16toUtf8Partial(
-      aSrc, MakeSpan(reinterpret_cast<char*>(aDst.Data()), aDst.Length()));
+  auto maybe = JS_EncodeStringToUTF8BufferPartial(
+      aCx, aSrc, AsWritableChars(MakeSpan(aDst.Data(), aDst.Length())));
+  if (!maybe) {
+    aError.ReportOOM();
+    return;
+  }
+  Tie(read, written) = *maybe;
+  MOZ_ASSERT(written <= aDst.Length());
  aResult.mRead.Construct() = read;
  aResult.mWritten.Construct() = written;
 }

-void TextEncoder::GetEncoding(nsAString& aEncoding) {
+void TextEncoder::GetEncoding(nsACString& aEncoding) {
  aEncoding.AssignLiteral("utf-8");
 }

--- a/dom/encoding/TextEncoder.h
+++ b/dom/encoding/TextEncoder.h
@ -40,7 +40,7 @@ class TextEncoder final : public NonRefcountedDOMObject {
   *
   * @param aEncoding, current encoding.
   */
-  void GetEncoding(nsAString& aEncoding);
+  void GetEncoding(nsACString& aEncoding);

  /**
   * Encodes incoming utf-16 code units/ DOM string to utf-8.
@ -52,11 +52,12 @@ class TextEncoder final : public NonRefcountedDOMObject {
   *                   the aRetval out param.
   */
  void Encode(JSContext* aCx, JS::Handle<JSObject*> aObj,
-              const nsAString& aString, JS::MutableHandle<JSObject*> aRetval,
-              ErrorResult& aRv);
+              JS::Handle<JSString*> aString,
+              JS::MutableHandle<JSObject*> aRetval, ErrorResult& aRv);

-  void EncodeInto(const nsAString& aSrc, const Uint8Array& aDst,
-                  TextEncoderEncodeIntoResult& aResult);
+  void EncodeInto(JSContext* aCx, JS::Handle<JSString*> aSrc,
+                  const Uint8Array& aDst, TextEncoderEncodeIntoResult& aResult,
+                  OOMReporter& aError);
 };

 }  // namespace dom
--- a/dom/encoding/test/unit/test_rope_encode.js
+++ b/dom/encoding/test/unit/test_rope_encode.js
@ -0,0 +1,167 @@
+var concat = [
+  {
+    head: "a",
+    tail: "b",
+    expected: "ab",
+    name: "Latin1 and Latin1",
+  },
+  {
+    head: "α",
+    tail: "β",
+    expected: "αβ",
+    name: "UTF-16 and UTF-16",
+  },
+  {
+    head: "a",
+    tail: "β",
+    expected: "aβ",
+    name: "Latin1 and UTF-16",
+  },
+  {
+    head: "α",
+    tail: "b",
+    expected: "αb",
+    name: "UTF-16 and Latin1",
+  },
+  {
+    head: "\uD83D",
+    tail: "\uDE03",
+    expected: "\uD83D\uDE03",
+    name: "Surrogate pair",
+  },
+  {
+    head: "a\uD83D",
+    tail: "\uDE03b",
+    expected: "a\uD83D\uDE03b",
+    name: "Surrogate pair with prefix and suffix",
+  },
+  {
+    head: "\uD83D",
+    tail: "b",
+    expected: "\uFFFDb",
+    name: "Unpaired high surrogate and Latin1",
+  },
+  {
+    head: "a\uD83D",
+    tail: "b",
+    expected: "a\uFFFDb",
+    name: "Prefixed unpaired high surrogate and Latin1",
+  },
+  {
+    head: "\uD83D",
+    tail: "β",
+    expected: "\uFFFDβ",
+    name: "Unpaired high surrogate and UTF-16",
+  },
+  {
+    head: "a\uD83D",
+    tail: "β",
+    expected: "a\uFFFDβ",
+    name: "Prefixed unpaired high surrogate and UTF-16",
+  },
+
+  {
+    head: "\uDE03",
+    tail: "b",
+    expected: "\uFFFDb",
+    name: "Unpaired low surrogate and Latin1",
+  },
+  {
+    head: "a\uDE03",
+    tail: "b",
+    expected: "a\uFFFDb",
+    name: "Prefixed unpaired low surrogate and Latin1",
+  },
+  {
+    head: "\uDE03",
+    tail: "β",
+    expected: "\uFFFDβ",
+    name: "Unpaired low surrogate and UTF-16",
+  },
+  {
+    head: "a\uDE03",
+    tail: "β",
+    expected: "a\uFFFDβ",
+    name: "Prefixed unpaired low surrogate and UTF-16",
+  },
+
+  {
+    head: "a",
+    tail: "\uDE03",
+    expected: "a\uFFFD",
+    name: "Latin1 and unpaired low surrogate",
+  },
+  {
+    head: "a",
+    tail: "\uDE03b",
+    expected: "a\uFFFDb",
+    name: "Latin1 and suffixed unpaired low surrogate",
+  },
+  {
+    head: "α",
+    tail: "\uDE03",
+    expected: "α\uFFFD",
+    name: "UTF-16 and unpaired low surrogate",
+  },
+  {
+    head: "α",
+    tail: "\uDE03b",
+    expected: "α\uFFFDb",
+    name: "UTF-16 and suffixed unpaired low surrogate",
+  },
+
+  {
+    head: "a",
+    tail: "\uD83D",
+    expected: "a\uFFFD",
+    name: "Latin1 and unpaired high surrogate",
+  },
+  {
+    head: "a",
+    tail: "\uD83Db",
+    expected: "a\uFFFDb",
+    name: "Latin1 and suffixed unpaired high surrogate",
+  },
+  {
+    head: "α",
+    tail: "\uD83D",
+    expected: "α\uFFFD",
+    name: "UTF-16 and unpaired high surrogate",
+  },
+  {
+    head: "α",
+    tail: "\uD83Db",
+    expected: "α\uFFFDb",
+    name: "UTF-16 and suffixed unpaired high surrogate",
+  },
+];
+
+var testingFunctions = Cu.getJSTestingFunctions();
+concat.forEach(function(t) {
+  test(function() {
+    assert_true(
+      testingFunctions.isSameCompartment(testingFunctions.newRope, this),
+      "Must be in the same compartment"
+    );
+    var rope = testingFunctions.newRope(t.head, t.tail);
+    var encoded = new TextEncoder().encode(rope);
+    var decoded = new TextDecoder().decode(encoded);
+    assert_equals(decoded, t.expected, "Must round-trip");
+  }, t.name);
+});
+
+test(function() {
+  assert_true(
+    testingFunctions.isSameCompartment(testingFunctions.newRope, this),
+    "Must be in the same compartment"
+  );
+  var ab = testingFunctions.newRope("a", "b");
+  var abc = testingFunctions.newRope(ab, "c");
+  var ef = testingFunctions.newRope("e", "f");
+  var def = testingFunctions.newRope("d", ef);
+  var abcdef = testingFunctions.newRope(abc, def);
+  var abcdefab = testingFunctions.newRope(abcdef, ab);
+  var encoded = new TextEncoder().encode(abcdefab);
+  var decoded = new TextDecoder().decode(encoded);
+  assert_equals(decoded, "abcdefab", "Must walk the DAG correctly");
+}, "Complex rope DAG");
--- a/dom/encoding/test/unit/xpcshell.ini
+++ b/dom/encoding/test/unit/xpcshell.ini
@ -9,3 +9,4 @@ head = head.js
 [test_misc.js]
 [test_shift_jis.js]
 [test_utf.js]
+[test_rope_encode.js]
--- a/dom/webidl/TextEncoder.webidl
+++ b/dom/webidl/TextEncoder.webidl
@ -19,21 +19,28 @@ dictionary TextEncoderEncodeIntoResult {
 interface TextEncoder {
  constructor();

+  /*
+   * This is DOMString in the spec, but the value is always ASCII
+   * and short. By declaring this as ByteString, we get the same
+   * end result (storage as inline Latin1 string in SpiderMonkey)
+   * with fewer conversions.
+   */
  [Constant]
-  readonly attribute DOMString encoding;
+  readonly attribute ByteString encoding;
+
  /*
   * This is spec-wise USVString but marking it as
-   * DOMString to avoid duplicate work. Since the
-   * UTF-16 to UTF-8 converter performs processing
-   * that's equivalent to first converting a
-   * DOMString to a USVString, let's avoid having
-   * the binding code doing it, too.
+   * JSString as an optimization. (The SpiderMonkey-provided
+   * conversion to UTF-8 takes care of replacing lone
+   * surrogates with the REPLACEMENT CHARACTER, so the
+   * observable behavior of USVString is matched.)
   */
  [NewObject]
-  Uint8Array encode(optional DOMString input = "");
+  Uint8Array encode(optional JSString input = "");

  /*
   * The same comment about USVString as above applies here.
   */
-  TextEncoderEncodeIntoResult encodeInto(DOMString source, Uint8Array destination);
+  [CanOOM]
+  TextEncoderEncodeIntoResult encodeInto(JSString source, Uint8Array destination);
 };