Bug 1478170 - Implement getNonAsciiCodePointDontNormalize for UTF-8. r=arai

--HG-- extra : rebase_source : ab9e28fc001eaab9af3bcb072ce783a88d0f7f07
2024-10-28 20:55:39 +00:00 · 2018-07-18 22:46:47 -07:00 · 2018-07-18 22:46:47 -07:00 · cf12902522
commit cf12902522
parent ab4f689aa3
4 changed files with 411 additions and 0 deletions
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@ -571,6 +571,193 @@ SourceUnits<char16_t>::assertNextCodePoint(const PeekedCodePoint<char16_t>& peek

 #endif // DEBUG

+/**
+ * Report the UTF-8 encoding error |errorNumber| for the code point starting
+ * at the current offset.  The variadic arguments are the format arguments for
+ * |errorNumber|'s message.
+ *
+ * A JSMSG_BAD_CODE_UNITS note listing the |relevantUnits| code units that
+ * begin at the current offset (each written as "0xHH") is attached to the
+ * error.  Those units are read using |getCodeUnit()|, so |sourceUnits| has
+ * been moved past them by the time the error is reported.
+ *
+ * |relevantUnits| must be in the range [1, 4]: the buffer holding the note
+ * text has room for exactly four units.
+ */
+template<class AnyCharsAccess>
+MOZ_COLD void
+TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(uint8_t relevantUnits,
+                                                                  unsigned errorNumber, ...)
+{
+    va_list args;
+    va_start(args, errorNumber);
+
+    // |do { ... } while (false)| lets every failure path |break| to the single
+    // |va_end| below.
+    do {
+        size_t offset = this->sourceUnits.offset();
+
+        ErrorMetadata err;
+
+        TokenStreamAnyChars& anyChars = anyCharsAccess();
+
+        if (anyChars.fillExcludingContext(&err, offset)) {
+            if (!internalComputeLineOfContext(&err, offset))
+                break;
+
+            // As this is an encoding error, the computed window-end must be
+            // identical to the location of the error -- any further on and the
+            // window would contain invalid Unicode.
+            MOZ_ASSERT_IF(err.lineOfContext != nullptr,
+                          err.lineLength == err.tokenOffset);
+        }
+
+        auto notes = MakeUnique<JSErrorNotes>();
+        if (!notes) {
+            ReportOutOfMemory(anyChars.cx);
+            break;
+        }
+
+        // The largest encoding of a UTF-8 code point is 4 units.  (Encoding an
+        // obsolete 5- or 6-byte code point will complain only about a bad lead
+        // code unit.)
+        constexpr uint8_t MaxUnits = 4;
+        constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
+
+        // Each unit occupies "0xHH" plus one separator (or the terminator).
+        static_assert(MaxWidth == MaxUnits * sizeof("0xHH"),
+                      "buffer must hold exactly MaxUnits formatted code units");
+
+        MOZ_ASSERT(relevantUnits > 0);
+        MOZ_ASSERT(relevantUnits <= MaxUnits,
+                   "more units than this would overflow |badUnitsStr|");
+
+        char badUnitsStr[MaxWidth];
+        char* ptr = badUnitsStr;
+        while (relevantUnits > 0) {
+            byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
+            ptr[4] = ' ';
+
+            ptr += 5;
+            relevantUnits--;
+        }
+
+        // Replace the separator written after the final unit with the
+        // terminator.
+        ptr[-1] = '\0';
+
+        uint32_t line, column;
+        anyChars.srcCoords.lineNumAndColumnIndex(offset, &line, &column);
+
+        if (!notes->addNoteASCII(anyChars.cx, anyChars.getFilename(), line, column,
+                                 GetErrorMessage, nullptr, JSMSG_BAD_CODE_UNITS, badUnitsStr))
+        {
+            break;
+        }
+
+        ReportCompileError(anyChars.cx, std::move(err), std::move(notes), JSREPORT_ERROR,
+                           errorNumber, args);
+    } while (false);
+
+    va_end(args);
+}
+
+/** Report that |lead| cannot begin a UTF-8 code point. */
+template<class AnyCharsAccess>
+MOZ_COLD void
+TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(Utf8Unit lead)
+{
+    // Room for "0xHH" and its null terminator.
+    char hexLead[sizeof("0xHH")];
+    byteToTerminatedString(lead.toUint8(), hexLead);
+
+    // Only the lead unit itself is relevant to this error.
+    internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, hexLead);
+}
+
+/**
+ * Report that the code point begun by |lead| is truncated: |required| code
+ * units (lead included) are needed, but only |remaining| are available.
+ */
+template<class AnyCharsAccess>
+MOZ_COLD void
+TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(Utf8Unit lead,
+                                                           uint8_t remaining, uint8_t required)
+{
+    MOZ_ASSERT(required == 2 || required == 3 || required == 4);
+    MOZ_ASSERT(remaining < 4);
+    MOZ_ASSERT(remaining < required);
+
+    // Room for "0xHH" and its null terminator.
+    char hexLead[sizeof("0xHH")];
+    byteToTerminatedString(lead.toUint8(), hexLead);
+
+    // The message counts only the units *following* the lead, hence the
+    // |- 1|.  Both counts are below 4, so the hex digit |toHexChar| returns
+    // is also the decimal digit.
+    const char expectedCount[] = { toHexChar(required - 1), '\0' };
+    const char presentCount[] = { toHexChar(remaining - 1), '\0' };
+
+    // Exactly one following unit reads "byte was"; any other count reads
+    // "bytes were".
+    const char* verbSuffix = (remaining == 2) ? " was" : "s were";
+
+    internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS,
+                          hexLead, expectedCount, presentCount, verbSuffix);
+}
+
+/**
+ * Report that |badUnit| is not a valid trailing UTF-8 code unit, listing
+ * |unitsObserved| code units from the current offset in the error's note.
+ */
+template<class AnyCharsAccess>
+MOZ_COLD void
+TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(Utf8Unit badUnit,
+                                                            uint8_t unitsObserved)
+{
+    const uint8_t unitValue = badUnit.toUint8();
+
+    // Room for "0xHH" and its null terminator.
+    char hexUnit[sizeof("0xHH")];
+    byteToTerminatedString(unitValue, hexUnit);
+
+    internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT, hexUnit);
+}
+
+/**
+ * Report that the structurally valid sequence of |codePointLength| UTF-8 code
+ * units at the current offset encodes |codePoint|, a value that is not
+ * permitted.  |reason| is inserted verbatim into the error message after
+ * "isn't a valid code point because".
+ */
+template<class AnyCharsAccess>
+MOZ_COLD void
+TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(uint32_t codePoint,
+                                                                          uint8_t codePointLength,
+                                                                          const char* reason)
+{
+    // Construct a string like "0x203D" (including null terminator) to include
+    // in the error message.  Write the string back-to-front, starting at the
+    // end of an adequately sized |char| array: shift least significant
+    // nibbles off the number and write the corresponding hex digits until
+    // done, then prefix with "0x".  |codePointStr| points at the incrementally
+    // computed string, within |codePointCharsArray|'s bounds.
+
+    // 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
+    // bits in a four-byte UTF-8 code unit sequence.
+    constexpr size_t MaxHexSize = sizeof("0x1F" "FFFF"); // including '\0'
+    char codePointCharsArray[MaxHexSize];
+
+    char* codePointStr = codePointCharsArray + ArrayLength(codePointCharsArray);
+    *--codePointStr = '\0';
+
+    // Loop with do-while rather than while so that at least one digit is
+    // always written: a |codePoint| of zero (e.g. from an overlong encoding
+    // of U+0000) must be reported as "0x0", not as a bare "0x".
+    uint32_t copy = codePoint;
+    do {
+        MOZ_ASSERT(codePointCharsArray < codePointStr);
+        *--codePointStr = toHexChar(copy & 0xF);
+        copy >>= 4;
+    } while (copy);
+
+    MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr);
+    *--codePointStr = 'x';
+    *--codePointStr = '0';
+
+    internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT, codePointStr, reason);
+}
+
+/**
+ * Decode the remainder of the non-ASCII code point whose already-consumed
+ * lead code unit is |lead|.
+ *
+ * On success, store the decoded (un-normalized) code point in |*codePoint|
+ * and return true.  On failure, report the encoding error through the
+ * matching handler and return false; |*codePoint| is left untouched.
+ */
+template<class AnyCharsAccess>
+MOZ_MUST_USE bool
+TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(Utf8Unit lead,
+                                                                              char32_t* codePoint)
+{
+    auto onBadLeadUnit = [this, &lead]() {
+        this->badLeadUnit(lead);
+    };
+
+    auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) {
+        this->notEnoughUnits(lead, remaining, required);
+    };
+
+    auto onBadTrailingUnit = [this](uint8_t unitsObserved) {
+        // The error message must name the offending *trailing* unit, not the
+        // lead.  By the time a handler runs the decoder has rewound to the
+        // lead unit, so the bad unit is the last of the |unitsObserved| units
+        // counted from the current position.
+        MOZ_ASSERT(unitsObserved >= 2,
+                   "a bad trailing unit is preceded by at least the lead unit");
+        Utf8Unit badUnit = this->sourceUnits.current()[unitsObserved - 1];
+        this->badTrailingUnit(badUnit, unitsObserved);
+    };
+
+    auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) {
+        this->badCodePoint(badCodePoint, unitsObserved);
+    };
+
+    auto onNotShortestForm = [this](char32_t badCodePoint, uint8_t unitsObserved) {
+        this->notShortestForm(badCodePoint, unitsObserved);
+    };
+
+    // If a valid code point is decoded, this function call consumes its code
+    // units.  If not, it ungets the lead code unit and invokes the right error
+    // handler, so on failure we must immediately return false.
+    SourceUnitsIterator iter(this->sourceUnits);
+    Maybe<char32_t> maybeCodePoint =
+        DecodeOneUtf8CodePointInline(lead, &iter, SourceUnitsEnd(),
+                                     onBadLeadUnit, onNotEnoughUnits, onBadTrailingUnit,
+                                     onBadCodePoint, onNotShortestForm);
+    if (maybeCodePoint.isNothing())
+        return false;
+
+    *codePoint = maybeCodePoint.value();
+    return true;
+}
+
 template<class AnyCharsAccess>
 bool
 TokenStreamChars<char16_t, AnyCharsAccess>::getCodePoint(int32_t* cp)
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@ -1163,6 +1163,10 @@ class SourceUnits
        return base_ + (offset - startOffset_);
    }

+    const CharT* current() const {  // returns |ptr| unchanged; callers treat it as the next code unit's address
+        return ptr;
+    }
+
    const CharT* limit() const {
        return limit_;
    }
@ -1649,6 +1653,88 @@ class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
  protected:
    // These APIs are only usable by UTF-8-specific code.

+    using typename CharsBase::SourceUnits;
+
+    /**
+     * A mutable iterator-wrapper around |SourceUnits| that translates
+     * operators to calls to |SourceUnits::getCodeUnit()| and similar.
+     *
+     * This class is expected to be used in concert with |SourceUnitsEnd|.
+     */
+    class SourceUnitsIterator
+    {
+        SourceUnits& sourceUnits_;  // wrapped by reference: every operation below forwards to it
+#ifdef DEBUG
+        // Set only in the copy returned by the post-increment operator: the
+        // address of the next source text code unit at the time of that
+        // post-increment.  Reset when that copy is dereferenced.
+        mutable mozilla::Maybe<const mozilla::Utf8Unit*> currentBeforePostIncrement_;
+#endif
+
+      public:
+        explicit SourceUnitsIterator(SourceUnits& sourceUnits)
+          : sourceUnits_(sourceUnits)
+        {}
+
+        mozilla::Utf8Unit operator*() const {
+            // operator* is expected to get the *next* value from an iterator
+            // not pointing at the end of the underlying range.  However, the
+            // sole use of this is in the context of an expression of the form
+            // |*iter++|, that performed the |sourceUnits_.getCodeUnit()| in
+            // the |operator++(int)| below -- so dereferencing acts on a
+            // |sourceUnits_| already advanced.  Therefore the correct unit to
+            // return is the previous one.
+            MOZ_ASSERT(currentBeforePostIncrement_.value() + 1 == sourceUnits_.current());
+#ifdef DEBUG
+            currentBeforePostIncrement_.reset();
+#endif
+            return sourceUnits_.previousCodeUnit();
+        }
+
+        SourceUnitsIterator operator++(int) {  // calls |getCodeUnit()| once; the returned copy may only be dereferenced once
+            MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
+                       "the only valid operation on a post-incremented "
+                       "iterator is dereferencing a single time");
+
+            SourceUnitsIterator copy = *this;
+#ifdef DEBUG
+            copy.currentBeforePostIncrement_.emplace(sourceUnits_.current());  // recorded before |getCodeUnit()| below
+#endif
+
+            sourceUnits_.getCodeUnit();  // result deliberately ignored; |operator*| reads the unit afterwards
+            return copy;
+        }
+
+        void operator-=(size_t n) {  // forwards to |sourceUnits_.unskipCodeUnits(n)|
+            MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
+                       "the only valid operation on a post-incremented "
+                       "iterator is dereferencing a single time");
+            sourceUnits_.unskipCodeUnits(n);
+        }
+
+        mozilla::Utf8Unit operator[](ptrdiff_t index) {  // only |index == -1| is supported
+            MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
+                       "the only valid operation on a post-incremented "
+                       "iterator is dereferencing a single time");
+            MOZ_ASSERT(index == -1,
+                       "must only be called to verify the value of the "
+                       "previous code unit");
+            return sourceUnits_.previousCodeUnit();
+        }
+
+        size_t remaining() const {  // forwards to |sourceUnits_.remaining()|
+            MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
+                       "the only valid operation on a post-incremented "
+                       "iterator is dereferencing a single time");
+            return sourceUnits_.remaining();
+        }
+    };
+
+    /** An empty sentinel type representing the end of |SourceUnits| data; it carries no state of its own. */
+    class SourceUnitsEnd {};
+
+    friend inline size_t operator-(const SourceUnitsEnd& aEnd, const SourceUnitsIterator& aIter);
+
  protected:
    // These APIs are in both SpecializedTokenStreamCharsBase specializations
    // and so are usable in subclasses no matter what CharT is.
@ -1656,6 +1742,13 @@ class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
    using CharsBase::CharsBase;
 };

+/**
+ * "End minus iterator": the number of code units |aIter| reports as
+ * remaining.  |aEnd| is an empty sentinel and contributes nothing to the
+ * result.
+ */
+inline size_t
+operator-(const SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>::SourceUnitsEnd& aEnd,
+          const SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>::SourceUnitsIterator& aIter)
+{
+    const size_t unitsLeft = aIter.remaining();
+    return unitsLeft;
+}
+
 /** A small class encapsulating computation of the start-offset of a Token. */
 class TokenStart
 {
@ -1963,11 +2056,122 @@ class TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>
    using GeneralCharsBase = GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
    using Self = TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;

+    using typename SpecializedCharsBase::SourceUnitsEnd;
+    using typename SpecializedCharsBase::SourceUnitsIterator;
+
  protected:
+    using GeneralCharsBase::anyCharsAccess;
+    using GeneralCharsBase::internalComputeLineOfContext;
+    using TokenStreamCharsShared::isAsciiCodePoint;
    // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(

+  private:
+    /** Return the uppercase hexadecimal digit for |nibble|, which must be < 16. */
+    static char toHexChar(uint8_t nibble) {
+        MOZ_ASSERT(nibble < 16);
+
+        static const char HexDigits[] = "0123456789ABCDEF";
+        return HexDigits[nibble];
+    }
+
+    /**
+     * Write the four characters "0xHH" for the byte |n| to |str[0..3]|.  No
+     * null terminator is written.
+     */
+    static void byteToString(uint8_t n, char* str) {
+        const uint8_t highNibble = n >> 4;
+        const uint8_t lowNibble = n & 0xF;
+
+        *str++ = '0';
+        *str++ = 'x';
+        *str++ = toHexChar(highNibble);
+        *str = toHexChar(lowNibble);
+    }
+
+    /**
+     * Write the null-terminated string "0xHH" for the byte |n| to |str|,
+     * which must have room for five characters.
+     */
+    static void byteToTerminatedString(uint8_t n, char* str) {
+        char* const terminator = str + 4;
+
+        byteToString(n, str);
+        *terminator = '\0';
+    }
+
+    /**
+     * Report a UTF-8 encoding-related error for a code point starting AT THE
+     * CURRENT OFFSET.
+     *
+     * |relevantUnits| indicates how many code units from the current offset
+     * are potentially relevant to the reported error, such that they may be
+     * included in the error message.  For example, if at the current offset we
+     * have
+     *
+     *   0b1111'1111 ...
+     *
+     * a code unit never allowed in UTF-8, then |relevantUnits| might be 1
+     * because only that unit is relevant.  Or if we have
+     *
+     *   0b1111'0111 0b1011'0101 0b0000'0000 ...
+     *
+     * where the first two code units are a valid prefix to a four-unit code
+     * point but the third unit *isn't* a valid trailing code unit, then
+     * |relevantUnits| might be 3.
+     */
+    MOZ_COLD void internalEncodingError(uint8_t relevantUnits, unsigned errorNumber, ...);
+
+    // Don't use |internalEncodingError|!  Use one of the elaborated functions
+    // that calls it, below -- all of which should be used to indicate an error
+    // in a code point starting AT THE CURRENT OFFSET as with
+    // |internalEncodingError|.
+
+    /** Report an error for an invalid lead code unit |lead|. */
+    MOZ_COLD void badLeadUnit(mozilla::Utf8Unit lead);
+
+    /**
+     * Report an error when there aren't enough code units remaining to
+     * constitute a full code point after |lead|: only |remaining| code units
+     * were available for a code point starting with |lead|, when at least
+     * |required| code units were required.
+     */
+    MOZ_COLD void notEnoughUnits(mozilla::Utf8Unit lead, uint8_t remaining, uint8_t required);
+
+    /**
+     * Report an error for a bad trailing UTF-8 code unit, where the bad
+     * trailing unit was the last of |unitsObserved| units examined from the
+     * current offset.
+     */
+    MOZ_COLD void badTrailingUnit(mozilla::Utf8Unit badUnit, uint8_t unitsObserved);
+
+    // Helper used for both |badCodePoint| and |notShortestForm| for code units
+    // that have all the requisite high bits set/unset in a manner that *could*
+    // encode a valid code point, but the remaining bits encoding its actual
+    // value do not define a permitted value.
+    MOZ_COLD void badStructurallyValidCodePoint(uint32_t codePoint, uint8_t codePointLength,
+                                                const char* reason);
+
+    /**
+     * Report an error for a UTF-8 sequence of |codePointLength| code units
+     * whose decoded value |codePoint| is either a UTF-16 surrogate or lies
+     * above the Unicode range.
+     */
+    MOZ_COLD void badCodePoint(uint32_t codePoint, uint8_t codePointLength) {
+        const bool isSurrogate = unicode::IsSurrogate(codePoint);
+        MOZ_ASSERT(isSurrogate || codePoint > unicode::NonBMPMax);
+
+        const char* reason = isSurrogate
+                             ? "it's a UTF-16 surrogate"
+                             : "the maximum code point is U+10FFFF";
+
+        badStructurallyValidCodePoint(codePoint, codePointLength, reason);
+    }
+
+    /**
+     * Report an error for a UTF-8 sequence of |codePointLength| code units
+     * that encodes the otherwise valid |codePoint| using more code units
+     * than its shortest form requires.
+     */
+    MOZ_COLD void notShortestForm(uint32_t codePoint, uint8_t codePointLength) {
+        MOZ_ASSERT(!unicode::IsSurrogate(codePoint));
+        MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
+
+        const char* reason = "it wasn't encoded in shortest possible form";
+        badStructurallyValidCodePoint(codePoint, codePointLength, reason);
+    }
+
  protected:
    using GeneralCharsBase::GeneralCharsBase;
+
+    /**
+     * Given the non-ASCII |lead| code unit just consumed, consume the rest of
+     * a non-ASCII code point.  The code point is not normalized: on success
+     * |*codePoint| may be U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR.
+     *
+     * Report an error if an invalid code point is encountered.
+     */
+    MOZ_MUST_USE bool
+    getNonAsciiCodePointDontNormalize(mozilla::Utf8Unit lead, char32_t* codePoint);
 };

 // TokenStream is the lexical scanner for JavaScript source text.
--- a/js/src/js.msg
+++ b/js/src/js.msg
@ -352,6 +352,13 @@ MSG_DEF(JSMSG_DEFAULT_IN_PATTERN,      0, JSEXN_SYNTAXERR, "destructuring defaul
 MSG_DEF(JSMSG_BAD_NEWTARGET,           0, JSEXN_SYNTAXERR, "new.target only allowed within functions")
 MSG_DEF(JSMSG_ESCAPED_KEYWORD,         0, JSEXN_SYNTAXERR, "keywords must be written literally, without embedded escapes")

+// UTF-8 source text encoding errors
+MSG_DEF(JSMSG_BAD_LEADING_UTF8_UNIT,   1, JSEXN_SYNTAXERR, "{0} byte doesn't begin a valid UTF-8 code point")
+MSG_DEF(JSMSG_NOT_ENOUGH_CODE_UNITS,   4, JSEXN_SYNTAXERR, "{0} byte in UTF-8 must be followed by {1} bytes, but {2} byte{3} present")
+MSG_DEF(JSMSG_BAD_TRAILING_UTF8_UNIT,  1, JSEXN_SYNTAXERR, "bad trailing UTF-8 byte {0} doesn't match the pattern 0b10xxxxxx")
+MSG_DEF(JSMSG_FORBIDDEN_UTF8_CODE_POINT,2,JSEXN_SYNTAXERR, "{0} isn't a valid code point because {1}")
+MSG_DEF(JSMSG_BAD_CODE_UNITS,          1, JSEXN_NOTE, "the code units comprising this invalid code point were: {0}")
+
 // asm.js
 MSG_DEF(JSMSG_USE_ASM_TYPE_FAIL,       1, JSEXN_TYPEERR, "asm.js type error: {0}")
 MSG_DEF(JSMSG_USE_ASM_LINK_FAIL,       1, JSEXN_TYPEERR, "asm.js link error: {0}")
--- a/js/src/util/Unicode.h
+++ b/js/src/util/Unicode.h
@ -582,6 +582,19 @@ IsTrailSurrogate(uint32_t codePoint)
    return codePoint >= TrailSurrogateMin && codePoint <= TrailSurrogateMax;
 }

+/**
+ * Return whether |codePoint| lies in the UTF-16 surrogate range, i.e. within
+ * [LeadSurrogateMin, TrailSurrogateMax].
+ *
+ * The parameter is deliberately 32 bits wide rather than char16_t, so that
+ * callers holding full 32-bit values can test them directly.
+ */
+inline bool
+IsSurrogate(uint32_t codePoint)
+{
+    return codePoint >= LeadSurrogateMin && codePoint <= TrailSurrogateMax;
+}
+
 inline char16_t
 LeadSurrogate(uint32_t codePoint)
 {