Bug 1434429 - Implement TokenStreamChars::matchMultiUnitCodePoint as a better nailing-down of behavior when processing a multi-code unit code point. r=arai

--HG--
extra : rebase_source : ff142c8a13969aec72b7acdbc77c66cff539e21f
This commit is contained in:
Jeff Walden 2018-01-18 11:34:27 -08:00
parent 576637a08c
commit e2925f1ee7
2 changed files with 57 additions and 30 deletions

View File

@ -1289,21 +1289,6 @@ IsTokenSane(Token* tp)
}
#endif
// Try to complete a surrogate pair that begins with |lead|.
//
// Gets the next code unit.  If that unit is a trail surrogate, stores the
// code point decoded from (lead, trail) in |*codePoint| and returns true.
// Otherwise the unit is ungotten, |*codePoint| is left unmodified, and false
// is returned.
template<class AnyCharsAccess>
bool
TokenStreamChars<char16_t, AnyCharsAccess>::matchTrailForLeadSurrogate(char16_t lead,
                                                                       uint32_t* codePoint)
{
    int32_t unit = getCharIgnoreEOL();
    const bool isPair = unicode::IsTrailSurrogate(unit);

    if (isPair) {
        *codePoint = unicode::UTF16Decode(lead, unit);
    } else {
        // Not part of a pair: put the unit back for the caller to process.
        ungetCharIgnoreEOL(unit);
    }

    return isPair;
}
template<>
MOZ_MUST_USE bool
TokenStreamCharsBase<char16_t>::appendMultiUnitCodepointToTokenbuf(uint32_t codepoint)
@ -1314,6 +1299,24 @@ TokenStreamCharsBase<char16_t>::appendMultiUnitCodepointToTokenbuf(uint32_t code
return tokenbuf.append(lead) && tokenbuf.append(trail);
}
// Out-of-line path taken once |lead| is known to be a lead surrogate.
//
// Gets the next code unit.  If it is a trail surrogate, |*codePoint| receives
// the code point decoded from (lead, trail).  Otherwise the unit is ungotten
// and |*codePoint| is set to 0.
template<class AnyCharsAccess>
void
TokenStreamChars<char16_t, AnyCharsAccess>::matchMultiUnitCodePointSlow(char16_t lead,
                                                                        uint32_t* codePoint)
{
    MOZ_ASSERT(unicode::IsLeadSurrogate(lead),
               "matchMultiUnitCodepoint should have ensured |lead| is a lead "
               "surrogate");

    int32_t unit = getCharIgnoreEOL();

    // A lone lead surrogate is the uncommon case; keep the branch hint on the
    // paired path.
    if (!MOZ_LIKELY(unicode::IsTrailSurrogate(unit))) {
        ungetCharIgnoreEOL(unit);
        *codePoint = 0;
        return;
    }

    *codePoint = unicode::UTF16Decode(lead, unit);
}
template<typename CharT, class AnyCharsAccess>
bool
TokenStreamSpecific<CharT, AnyCharsAccess>::putIdentInTokenbuf(const CharT* identStart)
@ -1331,7 +1334,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::putIdentInTokenbuf(const CharT* iden
int32_t c = getCharIgnoreEOL();
uint32_t codePoint;
if (isMultiUnitCodepoint(c, &codePoint)) {
if (!matchMultiUnitCodePoint(c, &codePoint))
return false;
if (codePoint) {
if (!unicode::IsIdentifierPart(codePoint))
break;
@ -1516,7 +1521,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
}
uint32_t codePoint = c;
if (isMultiUnitCodepoint(c, &codePoint) && unicode::IsUnicodeIDStart(codePoint)) {
if (!matchMultiUnitCodePoint(c, &codePoint))
goto error;
if (codePoint && unicode::IsUnicodeIDStart(codePoint)) {
hadUnicodeEscape = false;
goto identifier;
}
@ -1575,7 +1582,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
break;
uint32_t codePoint;
if (isMultiUnitCodepoint(c, &codePoint)) {
if (!matchMultiUnitCodePoint(c, &codePoint))
goto error;
if (codePoint) {
if (!unicode::IsIdentifierPart(codePoint))
break;
@ -1664,9 +1673,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
}
uint32_t codePoint;
if (isMultiUnitCodepoint(c, &codePoint) &&
unicode::IsIdentifierStart(codePoint))
{
if (!matchMultiUnitCodePoint(c, &codePoint))
goto error;
if (codePoint && unicode::IsIdentifierStart(codePoint)) {
reportError(JSMSG_IDSTART_AFTER_NUMBER);
goto error;
}
@ -1789,9 +1798,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
}
uint32_t codePoint;
if (isMultiUnitCodepoint(c, &codePoint) &&
unicode::IsIdentifierStart(codePoint))
{
if (!matchMultiUnitCodePoint(c, &codePoint))
goto error;
if (codePoint && unicode::IsIdentifierStart(codePoint)) {
reportError(JSMSG_IDSTART_AFTER_NUMBER);
goto error;
}

View File

@ -1046,7 +1046,7 @@ class TokenStreamChars<char16_t, AnyCharsAccess>
using typename GeneralCharsBase::TokenStreamSpecific;
bool matchTrailForLeadSurrogate(char16_t lead, uint32_t* codePoint);
void matchMultiUnitCodePointSlow(char16_t lead, uint32_t* codePoint);
protected:
using GeneralCharsBase::anyCharsAccess;
@ -1056,11 +1056,29 @@ class TokenStreamChars<char16_t, AnyCharsAccess>
using GeneralCharsBase::GeneralCharsBase;
MOZ_ALWAYS_INLINE bool isMultiUnitCodepoint(char16_t c, uint32_t* codepoint) {
// |c| must be the code unit just gotten. If it and the subsequent code
// unit form a valid surrogate pair, get the second code unit, set
// |*codePoint| to the code point encoded by the surrogate pair, and return
// true. Otherwise do not get a second code unit, set |*codePoint = 0|,
// and return true.
//
// ECMAScript specifically requires that unpaired UTF-16 surrogates be
// treated as the corresponding code point and not as an error. See
// <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>.
// Therefore this function always returns true. The |bool| return type
// exists so that a future UTF-8 |TokenStreamChars| can treat malformed
// multi-code unit UTF-8 sequences as errors. (Because ECMAScript only
// interprets UTF-16 inputs, the process of translating the UTF-8 to UTF-16
// would fail, so no script should execute. Technically, we shouldn't even
// be tokenizing -- but it probably isn't realistic to assume every user
// correctly passes only valid UTF-8, at least not without better types in
// our codebase for strings that by construction only contain valid UTF-8.)
MOZ_ALWAYS_INLINE bool matchMultiUnitCodePoint(char16_t c, uint32_t* codePoint) {
if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c)))
return false;
return matchTrailForLeadSurrogate(c, codepoint);
*codePoint = 0;
else
matchMultiUnitCodePointSlow(c, codePoint);
return true;
}
void ungetCodePointIgnoreEOL(uint32_t codePoint);
@ -1144,7 +1162,7 @@ class MOZ_STACK_CLASS TokenStreamSpecific
using CharsSharedBase::atomizeChars;
using CharsSharedBase::copyTokenbufTo;
using GeneralCharsBase::getCharIgnoreEOL;
using CharsBase::isMultiUnitCodepoint;
using CharsBase::matchMultiUnitCodePoint;
using CharsSharedBase::tokenbuf;
using GeneralCharsBase::ungetChar;
using CharsSharedBase::ungetCharIgnoreEOL;