Bug 1434429 - Implement TokenStreamChars::matchMultiUnitCodePoint as a better nailing-down of behavior when processing a multi-code unit code point. r=arai

--HG-- extra : rebase_source : a3de66ce2abb1b5399b725961e67771ef374a58d
2024-10-20 08:45:46 +00:00 · 2018-01-18 11:34:27 -08:00 · 2018-01-18 11:34:27 -08:00 · b3ac36a8a8
commit b3ac36a8a8
parent fb577bbcc0
2 changed files with 58 additions and 31 deletions
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@ -1289,21 +1289,6 @@ IsTokenSane(Token* tp)
 }
 #endif

-template<class AnyCharsAccess>
-bool
-TokenStreamChars<char16_t, AnyCharsAccess>::matchTrailForLeadSurrogate(char16_t lead,
-                                                                       uint32_t* codePoint)
-{
-    int32_t maybeTrail = getCharIgnoreEOL();
-    if (!unicode::IsTrailSurrogate(maybeTrail)) {
-        ungetCharIgnoreEOL(maybeTrail);
-        return false;
-    }
-
-    *codePoint = unicode::UTF16Decode(lead, maybeTrail);
-    return true;
-}
-
 template<>
 MOZ_MUST_USE bool
 TokenStreamCharsBase<char16_t>::appendMultiUnitCodepointToTokenbuf(uint32_t codepoint)
@ -1314,6 +1299,24 @@ TokenStreamCharsBase<char16_t>::appendMultiUnitCodepointToTokenbuf(uint32_t code
    return tokenbuf.append(lead) && tokenbuf.append(trail);
 }

+template<class AnyCharsAccess>
+void
+TokenStreamChars<char16_t, AnyCharsAccess>::matchMultiUnitCodePointSlow(char16_t lead,
+                                                                        uint32_t* codePoint)
+{
+    MOZ_ASSERT(unicode::IsLeadSurrogate(lead),
+               "matchMultiUnitCodePoint should have ensured |lead| is a lead "
+               "surrogate");
+
+    int32_t maybeTrail = getCharIgnoreEOL();  // read the unit following |lead|
+    if (MOZ_LIKELY(unicode::IsTrailSurrogate(maybeTrail))) {
+        *codePoint = unicode::UTF16Decode(lead, maybeTrail);  // valid surrogate pair
+    } else {
+        ungetCharIgnoreEOL(maybeTrail);  // unpaired lead: put the following unit back
+        *codePoint = 0;  // callers test |codePoint| for zero to detect "no pair matched"
+    }
+}
+
 template<typename CharT, class AnyCharsAccess>
 bool
 TokenStreamSpecific<CharT, AnyCharsAccess>::putIdentInTokenbuf(const CharT* identStart)
@ -1331,7 +1334,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::putIdentInTokenbuf(const CharT* iden
        int32_t c = getCharIgnoreEOL();

        uint32_t codePoint;
-        if (isMultiUnitCodepoint(c, &codePoint)) {
+        if (!matchMultiUnitCodePoint(c, &codePoint))
+            return false;
+        if (codePoint) {
            if (!unicode::IsIdentifierPart(codePoint))
                break;

@ -1516,7 +1521,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
        }

        uint32_t codePoint = c;
-        if (isMultiUnitCodepoint(c, &codePoint) && unicode::IsUnicodeIDStart(codePoint)) {
+        if (!matchMultiUnitCodePoint(c, &codePoint))
+            goto error;
+        if (codePoint && unicode::IsUnicodeIDStart(codePoint)) {
            hadUnicodeEscape = false;
            goto identifier;
        }
@ -1575,7 +1582,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
                break;

            uint32_t codePoint;
-            if (isMultiUnitCodepoint(c, &codePoint)) {
+            if (!matchMultiUnitCodePoint(c, &codePoint))
+                goto error;
+            if (codePoint) {
                if (!unicode::IsIdentifierPart(codePoint))
                    break;

@ -1664,9 +1673,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
            }

            uint32_t codePoint;
-            if (isMultiUnitCodepoint(c, &codePoint) &&
-                unicode::IsIdentifierStart(codePoint))
-            {
+            if (!matchMultiUnitCodePoint(c, &codePoint))
+                goto error;
+            if (codePoint && unicode::IsIdentifierStart(codePoint)) {
                reportError(JSMSG_IDSTART_AFTER_NUMBER);
                goto error;
            }
@ -1789,9 +1798,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
            }

            uint32_t codePoint;
-            if (isMultiUnitCodepoint(c, &codePoint) &&
-                unicode::IsIdentifierStart(codePoint))
-            {
+            if (!matchMultiUnitCodePoint(c, &codePoint))
+                goto error;
+            if (codePoint && unicode::IsIdentifierStart(codePoint)) {
                reportError(JSMSG_IDSTART_AFTER_NUMBER);
                goto error;
            }
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@ -1040,26 +1040,44 @@ class TokenStreamChars<char16_t, AnyCharsAccess>
    using GeneralCharsBase = GeneralTokenStreamChars<char16_t, AnyCharsAccess>;
    using CharsSharedBase = TokenStreamCharsBase<char16_t>;

+    using GeneralCharsBase::asSpecific;
    using GeneralCharsBase::getCharIgnoreEOL;
    using CharsSharedBase::ungetCharIgnoreEOL;
    using CharsSharedBase::userbuf;

-    bool matchTrailForLeadSurrogate(char16_t lead, uint32_t* codePoint);
+    void matchMultiUnitCodePointSlow(char16_t lead, uint32_t* codePoint);

  public:
    using typename GeneralCharsBase::TokenStreamSpecific;

-    using GeneralCharsBase::asSpecific;
    using GeneralCharsBase::anyCharsAccess;

  public:
    using GeneralCharsBase::GeneralCharsBase;

-    MOZ_ALWAYS_INLINE bool isMultiUnitCodepoint(char16_t c, uint32_t* codepoint) {
+    // |c| must be the code unit just gotten.  If it and the subsequent code
+    // unit form a valid surrogate pair, get the second code unit, set
+    // |*codePoint| to the code point encoded by the surrogate pair, and return
+    // true.  Otherwise do not get a second code unit, set |*codePoint = 0|,
+    // and return true.
+    //
+    // ECMAScript specifically requires that unpaired UTF-16 surrogates be
+    // treated as the corresponding code point and not as an error.  See
+    // <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>.
+    // Therefore this function always returns true.  The |bool| return type
+    // exists so that a future UTF-8 |TokenStreamChars| can treat malformed
+    // multi-code unit UTF-8 sequences as errors.  (Because ECMAScript only
+    // interprets UTF-16 inputs, the process of translating the UTF-8 to UTF-16
+    // would fail, so no script should execute.  Technically, we shouldn't even
+    // be tokenizing -- but it probably isn't realistic to assume every user
+    // correctly passes only valid UTF-8, at least not without better types in
+    // our codebase for strings that by construction only contain valid UTF-8.)
+    MOZ_ALWAYS_INLINE bool matchMultiUnitCodePoint(char16_t c, uint32_t* codePoint) {
        if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c)))
-            return false;
-
-        return matchTrailForLeadSurrogate(c, codepoint);
+            *codePoint = 0;  // |c| cannot begin a surrogate pair
+        else
+            matchMultiUnitCodePointSlow(c, codePoint);  // out-of-line: look for a trail unit
+        return true;  // the UTF-16 version never reports an error
    }

    void ungetCodePointIgnoreEOL(uint32_t codePoint);
@ -1143,7 +1161,7 @@ class MOZ_STACK_CLASS TokenStreamSpecific
    using CharsSharedBase::atomizeChars;
    using CharsSharedBase::copyTokenbufTo;
    using GeneralCharsBase::getCharIgnoreEOL;
-    using CharsBase::isMultiUnitCodepoint;
+    using CharsBase::matchMultiUnitCodePoint;
    using CharsSharedBase::tokenbuf;
    using GeneralCharsBase::ungetChar;
    using CharsSharedBase::ungetCharIgnoreEOL;