Bug 1561573 - Avoid linearization and inflation to UTF-16 of the string input to TextEncoder. r=jandem,bzbarsky

Differential Revision: https://phabricator.services.mozilla.com/D44121

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Henri Sivonen 2019-09-18 08:26:52 +00:00
parent 6c949a612d
commit 75e563c076
5 changed files with 220 additions and 30 deletions

View File

@ -13,33 +13,39 @@ namespace mozilla {
namespace dom {
void TextEncoder::Encode(JSContext* aCx, JS::Handle<JSObject*> aObj,
const nsAString& aString,
JS::Handle<JSString*> aString,
JS::MutableHandle<JSObject*> aRetval,
ErrorResult& aRv) {
// Given nsTSubstring<char16_t>::kMaxCapacity, it should be
// impossible for the length computation to overflow, but
// let's use checked math in case someone changes something
// in the future.
CheckedInt<size_t> bufLen(JS::GetStringLength(aString));
bufLen *= 3; // from the contract for JS_EncodeStringToUTF8BufferPartial
// Uint8Array::Create takes uint32_t as the length.
CheckedInt<uint32_t> bufLen(aString.Length());
bufLen *= 3; // from the contract for ConvertUTF16toUTF8
if (!bufLen.isValid()) {
if (!bufLen.isValid() || bufLen.value() > UINT32_MAX) {
aRv.Throw(NS_ERROR_OUT_OF_MEMORY);
return;
}
// TODO: Avoid malloc and use a stack-allocated buffer if bufLen
// is small.
auto data = mozilla::MakeUniqueFallible<uint8_t[]>(bufLen.value());
if (!data) {
aRv.Throw(NS_ERROR_OUT_OF_MEMORY);
return;
}
size_t utf8Len = ConvertUtf16toUtf8(
aString, MakeSpan(reinterpret_cast<char*>(data.get()), bufLen.value()));
MOZ_ASSERT(utf8Len <= bufLen.value());
size_t read;
size_t written;
auto maybe = JS_EncodeStringToUTF8BufferPartial(
aCx, aString, AsWritableChars(MakeSpan(data.get(), bufLen.value())));
if (!maybe) {
aRv.Throw(NS_ERROR_OUT_OF_MEMORY);
return;
}
Tie(read, written) = *maybe;
MOZ_ASSERT(written <= bufLen.value());
MOZ_ASSERT(read == JS::GetStringLength(aString));
JSAutoRealm ar(aCx, aObj);
JSObject* outView = Uint8Array::Create(aCx, utf8Len, data.get());
JSObject* outView = Uint8Array::Create(aCx, written, data.get());
if (!outView) {
aRv.Throw(NS_ERROR_OUT_OF_MEMORY);
return;
@ -48,18 +54,26 @@ void TextEncoder::Encode(JSContext* aCx, JS::Handle<JSObject*> aObj,
aRetval.set(outView);
}
void TextEncoder::EncodeInto(const nsAString& aSrc, const Uint8Array& aDst,
TextEncoderEncodeIntoResult& aResult) {
void TextEncoder::EncodeInto(JSContext* aCx, JS::Handle<JSString*> aSrc,
const Uint8Array& aDst,
TextEncoderEncodeIntoResult& aResult,
OOMReporter& aError) {
aDst.ComputeLengthAndData();
size_t read;
size_t written;
Tie(read, written) = ConvertUtf16toUtf8Partial(
aSrc, MakeSpan(reinterpret_cast<char*>(aDst.Data()), aDst.Length()));
auto maybe = JS_EncodeStringToUTF8BufferPartial(
aCx, aSrc, AsWritableChars(MakeSpan(aDst.Data(), aDst.Length())));
if (!maybe) {
aError.ReportOOM();
return;
}
Tie(read, written) = *maybe;
MOZ_ASSERT(written <= aDst.Length());
aResult.mRead.Construct() = read;
aResult.mWritten.Construct() = written;
}
void TextEncoder::GetEncoding(nsAString& aEncoding) {
void TextEncoder::GetEncoding(nsACString& aEncoding) {
aEncoding.AssignLiteral("utf-8");
}

View File

@ -40,7 +40,7 @@ class TextEncoder final : public NonRefcountedDOMObject {
*
* @param aEncoding, current encoding.
*/
void GetEncoding(nsAString& aEncoding);
void GetEncoding(nsACString& aEncoding);
/**
* Encodes incoming utf-16 code units/ DOM string to utf-8.
@ -52,11 +52,12 @@ class TextEncoder final : public NonRefcountedDOMObject {
* the aRetval out param.
*/
void Encode(JSContext* aCx, JS::Handle<JSObject*> aObj,
const nsAString& aString, JS::MutableHandle<JSObject*> aRetval,
ErrorResult& aRv);
JS::Handle<JSString*> aString,
JS::MutableHandle<JSObject*> aRetval, ErrorResult& aRv);
void EncodeInto(const nsAString& aSrc, const Uint8Array& aDst,
TextEncoderEncodeIntoResult& aResult);
void EncodeInto(JSContext* aCx, JS::Handle<JSString*> aSrc,
const Uint8Array& aDst, TextEncoderEncodeIntoResult& aResult,
OOMReporter& aError);
};
} // namespace dom

View File

@ -0,0 +1,167 @@
// Test vectors for encoding two-piece strings with TextEncoder.
//
// Each entry describes one subtest:
//   head     - left-hand piece of the string under test
//   tail     - right-hand piece of the string under test
//   expected - text the encode/decode round trip must produce for head + tail
//   name     - subtest title
//
// The vectors cover pieces that contain only ASCII characters and pieces that
// contain non-ASCII (Greek) characters, in every head/tail order. The names
// label these "Latin1" and "UTF-16" (presumably the engine's internal string
// storage - confirm against SpiderMonkey). They also cover surrogates at the
// seam between head and tail: a high surrogate in head followed by a low
// surrogate in tail must survive as a pair, while every unpaired surrogate is
// expected to come back as U+FFFD.
var concat = [
{
head: "a",
tail: "b",
expected: "ab",
name: "Latin1 and Latin1",
},
{
head: "α",
tail: "β",
expected: "αβ",
name: "UTF-16 and UTF-16",
},
{
head: "a",
tail: "β",
expected: "aβ",
name: "Latin1 and UTF-16",
},
{
head: "α",
tail: "b",
expected: "αb",
name: "UTF-16 and Latin1",
},
{
head: "\uD83D",
tail: "\uDE03",
expected: "\uD83D\uDE03",
name: "Surrogate pair",
},
{
head: "a\uD83D",
tail: "\uDE03b",
expected: "a\uD83D\uDE03b",
name: "Surrogate pair with prefix and suffix",
},
{
head: "\uD83D",
tail: "b",
expected: "\uFFFDb",
name: "Unpaired high surrogate and Latin1",
},
{
head: "a\uD83D",
tail: "b",
expected: "a\uFFFDb",
name: "Prefixed unpaired high surrogate and Latin1",
},
{
head: "\uD83D",
tail: "β",
expected: "\uFFFDβ",
name: "Unpaired high surrogate and UTF-16",
},
{
head: "a\uD83D",
tail: "β",
expected: "a\uFFFDβ",
name: "Prefixed unpaired high surrogate and UTF-16",
},
{
head: "\uDE03",
tail: "b",
expected: "\uFFFDb",
name: "Unpaired low surrogate and Latin1",
},
{
head: "a\uDE03",
tail: "b",
expected: "a\uFFFDb",
name: "Prefixed unpaired low surrogate and Latin1",
},
{
head: "\uDE03",
tail: "β",
expected: "\uFFFDβ",
name: "Unpaired low surrogate and UTF-16",
},
{
head: "a\uDE03",
tail: "β",
expected: "a\uFFFDβ",
name: "Prefixed unpaired low surrogate and UTF-16",
},
{
head: "a",
tail: "\uDE03",
expected: "a\uFFFD",
name: "Latin1 and unpaired low surrogate",
},
{
head: "a",
tail: "\uDE03b",
expected: "a\uFFFDb",
name: "Latin1 and suffixed unpaired low surrogate",
},
{
head: "α",
tail: "\uDE03",
expected: "α\uFFFD",
name: "UTF-16 and unpaired low surrogate",
},
{
head: "α",
tail: "\uDE03b",
expected: "α\uFFFDb",
name: "UTF-16 and suffixed unpaired low surrogate",
},
{
head: "a",
tail: "\uD83D",
expected: "a\uFFFD",
name: "Latin1 and unpaired high surrogate",
},
{
head: "a",
tail: "\uD83Db",
expected: "a\uFFFDb",
name: "Latin1 and suffixed unpaired high surrogate",
},
{
head: "α",
tail: "\uD83D",
expected: "α\uFFFD",
name: "UTF-16 and unpaired high surrogate",
},
{
head: "α",
tail: "\uD83Db",
expected: "α\uFFFDb",
name: "UTF-16 and suffixed unpaired high surrogate",
},
];
// Engine testing helpers; newRope and isSameCompartment are used by the
// subtests in this file.
var testingFunctions = Cu.getJSTestingFunctions();

// Register one subtest per vector: join head and tail with newRope, encode the
// result with TextEncoder, decode the bytes again, and compare the decoded
// text against the vector's expected value.
for (const vector of concat) {
  // A classic function expression is required here: the callback reads the
  // `this` value supplied by test().
  test(function() {
    const inSameCompartment = testingFunctions.isSameCompartment(
      testingFunctions.newRope,
      this
    );
    assert_true(inSameCompartment, "Must be in the same compartment");

    const joined = testingFunctions.newRope(vector.head, vector.tail);
    const bytes = new TextEncoder().encode(joined);
    const roundTripped = new TextDecoder().decode(bytes);
    assert_equals(roundTripped, vector.expected, "Must round-trip");
  }, vector.name);
}
// Builds a nested structure out of newRope calls in which the "ab" node is
// used twice (once inside the left subtree and once as the final right-hand
// piece), then checks that encoding and decoding yields the full text in order.
test(function() {
  const inSameCompartment = testingFunctions.isSameCompartment(
    testingFunctions.newRope,
    this
  );
  assert_true(inSameCompartment, "Must be in the same compartment");

  // Small local shorthand; still invokes newRope as a method of
  // testingFunctions.
  const join = (left, right) => testingFunctions.newRope(left, right);

  const sharedAB = join("a", "b");
  const leftABC = join(sharedAB, "c");
  const innerEF = join("e", "f");
  const rightDEF = join("d", innerEF);
  const middle = join(leftABC, rightDEF);
  const whole = join(middle, sharedAB);

  const bytes = new TextEncoder().encode(whole);
  const text = new TextDecoder().decode(bytes);
  assert_equals(text, "abcdefab", "Must walk the DAG correctly");
}, "Complex rope DAG");

View File

@ -9,3 +9,4 @@ head = head.js
[test_misc.js]
[test_shift_jis.js]
[test_utf.js]
[test_rope_encode.js]

View File

@ -19,21 +19,28 @@ dictionary TextEncoderEncodeIntoResult {
interface TextEncoder {
constructor();
/*
* This is DOMString in the spec, but the value is always ASCII
* and short. By declaring this as ByteString, we get the same
* end result (storage as inline Latin1 string in SpiderMonkey)
* with fewer conversions.
*/
[Constant]
readonly attribute DOMString encoding;
readonly attribute ByteString encoding;
/*
* This is spec-wise USVString but marking it as
* DOMString to avoid duplicate work. Since the
* UTF-16 to UTF-8 converter performs processing
* that's equivalent to first converting a
* DOMString to a USVString, let's avoid having
* the binding code doing it, too.
* JSString as an optimization. (The SpiderMonkey-provided
* conversion to UTF-8 takes care of replacing lone
* surrogates with the REPLACEMENT CHARACTER, so the
* observable behavior of USVString is matched.)
*/
[NewObject]
Uint8Array encode(optional DOMString input = "");
Uint8Array encode(optional JSString input = "");
/*
* The same comment about USVString as above applies here.
*/
TextEncoderEncodeIntoResult encodeInto(DOMString source, Uint8Array destination);
[CanOOM]
TextEncoderEncodeIntoResult encodeInto(JSString source, Uint8Array destination);
};