Bug 1498320 - Implement ScriptSource::appendSubstring for UTF-8 source text, using a newly-implemented StringBuffer::append(const Utf8Unit* units, size_t len). r=tcampbell

--HG--
extra : rebase_source : 8fbb71a4ca8c424c33af470fc0ff77760f33542e
This commit is contained in:
Jeff Walden 2018-11-01 17:34:56 -07:00
parent 804bbcfed2
commit 3452c23147
4 changed files with 106 additions and 6 deletions

View File

@ -82,6 +82,12 @@ class UTF8Chars : public mozilla::Range<unsigned char>
// Construct from |aLength| chars starting at |aBytes|.  The pointer is
// reinterpreted as unsigned char* for Base; note that the const_cast drops
// the constness of |aBytes| on the way.
UTF8Chars(const char* aBytes, size_t aLength)
: Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
{}
// Construct from |aLength| mutable UTF-8 code units starting at |aUnits|.
// The units are reinterpreted as char and the work is delegated to another
// UTF8Chars constructor.
UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength)
: UTF8Chars(reinterpret_cast<char*>(aUnits), aLength)
{}
// Construct from |aLength| const UTF-8 code units starting at |aUnits|.
// The units are reinterpreted as const char and the work is delegated to
// the (const char*, size_t) constructor.
UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength)
: UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength)
{}
};
/*
@ -108,6 +114,10 @@ class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
MOZ_ASSERT(aBytes[aLength] == '\0');
}
// Construct from |aLength| UTF-8 code units starting at |aUnits|.  The units
// are reinterpreted as char and the work is delegated to another UTF8CharsZ
// constructor.
// NOTE(review): presumably the delegated constructor requires a '\0' at
// aUnits[aLength] -- confirm against the char* overload.
UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength)
: UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength)
{}
using Base::operator=;
// Return the pointer obtained from get(), reinterpreted as char*.
char* c_str() {
    auto* rawBytes = get();
    return reinterpret_cast<char*>(rawBytes);
}

View File

@ -9,6 +9,7 @@
#include "mozilla/DebugOnly.h"
#include "mozilla/MaybeOneOf.h"
#include "mozilla/Utf8.h"
#include "js/Vector.h"
#include "vm/JSContext.h"
@ -159,6 +160,14 @@ class StringBuffer
return append(chars, chars + len);
}
/**
 * Interpret the provided count of UTF-8 code units as UTF-8, and append
 * the represented code points to this. If the code units contain invalid
 * UTF-8, leave the internal buffer in a consistent but unspecified state,
 * report an error, and return false.
 *
 * While this buffer still uses Latin-1 storage, leading ASCII units are
 * appended to it directly; the first non-ASCII unit causes the buffer to be
 * inflated to two-byte storage before the remaining units are decoded.
 * False is also returned when appending to, inflating, or growing the
 * underlying storage fails.
 */
MOZ_MUST_USE bool append(const mozilla::Utf8Unit* units, size_t len);
// Append the |len| characters starting at |chars| by forwarding the
// [begin, end) pointer pair to the two-pointer append overload.
MOZ_MUST_USE bool append(const JS::ConstCharPtr chars, size_t len) {
    auto first = chars.get();
    auto last = first + len;
    return append(first, last);
}

View File

@ -8,13 +8,19 @@
#include "mozilla/Range.h"
#include "mozilla/Sprintf.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Utf8.h"
#include <algorithm>
#include <type_traits>
#include "util/StringBuffer.h"
#include "util/Unicode.h" // unicode::REPLACEMENT_CHARACTER
#include "vm/JSContext.h"
using mozilla::IsAscii;
using mozilla::Utf8Unit;
using namespace js;
Latin1CharsZ
@ -607,3 +613,68 @@ JS::StringIsASCII(const char* s)
}
return true;
}
/**
 * Decode |len| UTF-8 code units starting at |units| and append the result to
 * this buffer.
 *
 * If the buffer currently uses Latin-1 storage, leading ASCII units are
 * appended to it one at a time.  On the first non-ASCII unit the buffer is
 * inflated to two-byte storage, and the remaining units are converted to
 * UTF-16 in two passes: the first counts the UTF-16 code units required, the
 * second writes them into the freshly grown tail of the two-byte buffer.
 *
 * Returns false if appending to, inflating, or growing the storage fails, or
 * if the counting pass of InflateUTF8ToUTF16 fails.  Nothing is rolled back
 * on failure, so units appended before the failure remain in the buffer.
 */
bool
StringBuffer::append(const Utf8Unit* units, size_t len)
{
    if (isLatin1()) {
        Latin1CharBuffer& latin1 = latin1Chars();

        // Fast path: ASCII units map one-to-one onto Latin-1 characters.
        while (len > 0) {
            if (!IsAscii(*units)) {
                break;
            }

            if (!latin1.append(units->toUnsignedChar())) {
                return false;
            }

            ++units;
            --len;
        }
        if (len == 0) {
            return true;
        }

        // Non-ASCII doesn't *necessarily* mean we couldn't keep appending to
        // |latin1|, but it's only possible for [U+0080, U+0100) code points,
        // and handling the full complexity of UTF-8 only for that very small
        // additional range isn't worth it.  Inflate to two-byte storage before
        // appending the remaining code points.
        if (!inflateChars()) {
            return false;
        }
    }

    UTF8Chars remainingUtf8(units, len);

    // Determine how many UTF-16 code units are required to represent the
    // remaining units.
    size_t utf16Len = 0;
    auto countInflated = [&utf16Len](char16_t) -> LoopDisposition {
        utf16Len++;
        return LoopDisposition::Continue;
    };
    if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, remainingUtf8, countInflated)) {
        return false;
    }

    TwoByteCharBuffer& buf = twoByteChars();

    size_t i = buf.length();
    if (!buf.growByUninitialized(utf16Len)) {
        return false;
    }
    MOZ_ASSERT(i + utf16Len == buf.length(),
               "growByUninitialized assumed to increase length immediately");

    // Compute the write position with pointer arithmetic rather than
    // |&buf[i]|: when no UTF-16 units are being added (|utf16Len == 0|),
    // |i == buf.length()| and indexing element |i| would be out of range,
    // whereas a one-past-the-end pointer is fine as long as nothing is
    // written through it.
    char16_t* toFill = buf.begin() + i;
    auto appendUtf16 = [&toFill](char16_t unit) -> LoopDisposition {
        *toFill++ = unit;
        return LoopDisposition::Continue;
    };

    // The counting pass above already validated the input, so this second
    // pass over the same units is expected to succeed.
    MOZ_ALWAYS_TRUE(InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, remainingUtf8, appendUtf16));
    MOZ_ASSERT(toFill == buf.end());
    return true;
}

View File

@ -1832,17 +1832,27 @@ ScriptSource::appendSubstring(JSContext* cx, StringBuffer& buf, size_t start, si
UncompressedSourceCache::AutoHoldEntry holder;
if (hasSourceType<Utf8Unit>()) {
MOZ_CRASH("for now");
return false;
} else {
PinnedUnits<char16_t> units(cx, this, holder, start, len);
if (!units.asChars()) {
PinnedUnits<Utf8Unit> pinned(cx, this, holder, start, len);
if (!pinned.get()) {
return false;
}
if (len > SourceDeflateLimit && !buf.ensureTwoByteChars()) {
return false;
}
return buf.append(units.asChars(), len);
const Utf8Unit* units = pinned.get();
return buf.append(units, len);
} else {
PinnedUnits<char16_t> pinned(cx, this, holder, start, len);
if (!pinned.get()) {
return false;
}
if (len > SourceDeflateLimit && !buf.ensureTwoByteChars()) {
return false;
}
const char16_t* units = pinned.get();
return buf.append(units, len);
}
}