Bug 1498320 - Implement ScriptSource::appendSubstring for UTF-8 source text, using a newly-implemented StringBuffer::append(const Utf8Unit* units, size_t len). r=tcampbell

--HG-- extra : rebase_source : 8fbb71a4ca8c424c33af470fc0ff77760f33542e
2024-10-21 09:15:35 +00:00 · 2018-11-01 17:34:56 -07:00 · 2018-11-01 17:34:56 -07:00 · 3452c23147
commit 3452c23147
parent 804bbcfed2
4 changed files with 106 additions and 6 deletions
--- a/js/public/CharacterEncoding.h
+++ b/js/public/CharacterEncoding.h
@@ -82,6 +82,12 @@ class UTF8Chars : public mozilla::Range<unsigned char>
    UTF8Chars(const char* aBytes, size_t aLength)
      : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
    {}
+    UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength)
+      : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength)
+    {}
+    UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength)
+      : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength)
+    {}
 };

 /*
@@ -108,6 +114,10 @@ class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
        MOZ_ASSERT(aBytes[aLength] == '\0');
    }

+    UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength)
+      : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength)
+    {}
+
    using Base::operator=;

    char* c_str() { return reinterpret_cast<char*>(get()); }
--- a/js/src/util/StringBuffer.h
+++ b/js/src/util/StringBuffer.h
@@ -9,6 +9,7 @@

 #include "mozilla/DebugOnly.h"
 #include "mozilla/MaybeOneOf.h"
+#include "mozilla/Utf8.h"

 #include "js/Vector.h"
 #include "vm/JSContext.h"
@@ -159,6 +160,14 @@ class StringBuffer
        return append(chars, chars + len);
    }

+    /**
+     * Decode |len| UTF-8 code units starting at |units| and append the code
+     * points they encode to this.  Returns false if the units are not valid
+     * UTF-8 (an error is reported and the internal buffer is left consistent
+     * but in an unspecified state) or if growing the buffer fails.
+     */
+    MOZ_MUST_USE bool append(const mozilla::Utf8Unit* units, size_t len);
+
    MOZ_MUST_USE bool append(const JS::ConstCharPtr chars, size_t len) {
        return append(chars.get(), chars.get() + len);
    }
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@@ -8,13 +8,19 @@

 #include "mozilla/Range.h"
 #include "mozilla/Sprintf.h"
+#include "mozilla/TextUtils.h"
+#include "mozilla/Utf8.h"

 #include <algorithm>
 #include <type_traits>

+#include "util/StringBuffer.h"
 #include "util/Unicode.h" // unicode::REPLACEMENT_CHARACTER
 #include "vm/JSContext.h"

+using mozilla::IsAscii;
+using mozilla::Utf8Unit;
+
 using namespace js;

 Latin1CharsZ
@@ -607,3 +613,68 @@ JS::StringIsASCII(const char* s)
    }
    return true;
 }
+
+bool
+StringBuffer::append(const Utf8Unit* units, size_t len)
+{
+    if (isLatin1()) {
+        Latin1CharBuffer& latin1 = latin1Chars();
+
+        while (len > 0) {
+            if (!IsAscii(*units)) {
+                break;
+            }
+
+            if (!latin1.append(units->toUnsignedChar())) {
+                return false;
+            }
+
+            ++units;
+            --len;
+        }
+        if (len == 0) {
+            return true;
+        }
+
+        // Non-ASCII doesn't *necessarily* mean we couldn't keep appending to
+        // |latin1|, but it's only possible for [U+0080, U+0100) code points,
+        // and handling the full complexity of UTF-8 only for that very small
+        // additional range isn't worth it.  Inflate to two-byte storage before
+        // appending the remaining code points.
+        if (!inflateChars()) {
+            return false;
+        }
+    }
+
+    UTF8Chars remainingUtf8(units, len);
+
+    // First pass: validate the remaining UTF-8 and count the UTF-16 code units
+    // it inflates to, so the buffer can be grown exactly once below.
+    size_t utf16Len = 0;
+    auto countInflated = [&utf16Len](char16_t c) -> LoopDisposition {
+        utf16Len++;
+        return LoopDisposition::Continue;
+    };
+    if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, remainingUtf8, countInflated)) {
+        return false;
+    }
+
+    TwoByteCharBuffer& buf = twoByteChars();
+
+    size_t i = buf.length();
+    if (!buf.growByUninitialized(utf16Len)) {
+        return false;
+    }
+    MOZ_ASSERT(i + utf16Len == buf.length(),
+               "growByUninitialized assumed to increase length immediately");
+
+    char16_t* toFill = buf.begin() + i; // &buf[i] is out of range if utf16Len == 0
+    auto appendUtf16 = [&toFill](char16_t unit) -> LoopDisposition {
+        *toFill++ = unit;
+        return LoopDisposition::Continue;
+    };
+
+    MOZ_ALWAYS_TRUE(InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, remainingUtf8, appendUtf16));
+    MOZ_ASSERT(toFill == buf.end());
+    return true;
+}
--- a/js/src/vm/JSScript.cpp
+++ b/js/src/vm/JSScript.cpp
@@ -1832,17 +1832,27 @@ ScriptSource::appendSubstring(JSContext* cx, StringBuffer& buf, size_t start, si
    UncompressedSourceCache::AutoHoldEntry holder;

    if (hasSourceType<Utf8Unit>()) {
-        MOZ_CRASH("for now");
-        return false;
-    } else {
-        PinnedUnits<char16_t> units(cx, this, holder, start, len);
-        if (!units.asChars()) {
+        PinnedUnits<Utf8Unit> pinned(cx, this, holder, start, len);
+        if (!pinned.get()) {
            return false;
        }
        if (len > SourceDeflateLimit && !buf.ensureTwoByteChars()) {
            return false;
        }
-        return buf.append(units.asChars(), len);
+
+        const Utf8Unit* units = pinned.get();
+        return buf.append(units, len);
+    } else {
+        PinnedUnits<char16_t> pinned(cx, this, holder, start, len);
+        if (!pinned.get()) {
+            return false;
+        }
+        if (len > SourceDeflateLimit && !buf.ensureTwoByteChars()) {
+            return false;
+        }
+
+        const char16_t* units = pinned.get();
+        return buf.append(units, len);
    }
 }