mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-12-03 02:25:34 +00:00
Bug 1434429 - Implement TokenStreamChars::matchMultiUnitCodePoint as a better nailing-down of behavior when processing a multi-code unit code point. r=arai
--HG-- extra : rebase_source : ff142c8a13969aec72b7acdbc77c66cff539e21f
This commit is contained in:
parent
576637a08c
commit
e2925f1ee7
@ -1289,21 +1289,6 @@ IsTokenSane(Token* tp)
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class AnyCharsAccess>
|
||||
bool
|
||||
TokenStreamChars<char16_t, AnyCharsAccess>::matchTrailForLeadSurrogate(char16_t lead,
|
||||
uint32_t* codePoint)
|
||||
{
|
||||
int32_t maybeTrail = getCharIgnoreEOL();
|
||||
if (!unicode::IsTrailSurrogate(maybeTrail)) {
|
||||
ungetCharIgnoreEOL(maybeTrail);
|
||||
return false;
|
||||
}
|
||||
|
||||
*codePoint = unicode::UTF16Decode(lead, maybeTrail);
|
||||
return true;
|
||||
}
|
||||
|
||||
template<>
|
||||
MOZ_MUST_USE bool
|
||||
TokenStreamCharsBase<char16_t>::appendMultiUnitCodepointToTokenbuf(uint32_t codepoint)
|
||||
@ -1314,6 +1299,24 @@ TokenStreamCharsBase<char16_t>::appendMultiUnitCodepointToTokenbuf(uint32_t code
|
||||
return tokenbuf.append(lead) && tokenbuf.append(trail);
|
||||
}
|
||||
|
||||
template<class AnyCharsAccess>
|
||||
void
|
||||
TokenStreamChars<char16_t, AnyCharsAccess>::matchMultiUnitCodePointSlow(char16_t lead,
|
||||
uint32_t* codePoint)
|
||||
{
|
||||
MOZ_ASSERT(unicode::IsLeadSurrogate(lead),
|
||||
"matchMultiUnitCodepoint should have ensured |lead| is a lead "
|
||||
"surrogate");
|
||||
|
||||
int32_t maybeTrail = getCharIgnoreEOL();
|
||||
if (MOZ_LIKELY(unicode::IsTrailSurrogate(maybeTrail))) {
|
||||
*codePoint = unicode::UTF16Decode(lead, maybeTrail);
|
||||
} else {
|
||||
ungetCharIgnoreEOL(maybeTrail);
|
||||
*codePoint = 0;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename CharT, class AnyCharsAccess>
|
||||
bool
|
||||
TokenStreamSpecific<CharT, AnyCharsAccess>::putIdentInTokenbuf(const CharT* identStart)
|
||||
@ -1331,7 +1334,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::putIdentInTokenbuf(const CharT* iden
|
||||
int32_t c = getCharIgnoreEOL();
|
||||
|
||||
uint32_t codePoint;
|
||||
if (isMultiUnitCodepoint(c, &codePoint)) {
|
||||
if (!matchMultiUnitCodePoint(c, &codePoint))
|
||||
return false;
|
||||
if (codePoint) {
|
||||
if (!unicode::IsIdentifierPart(codePoint))
|
||||
break;
|
||||
|
||||
@ -1516,7 +1521,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
|
||||
}
|
||||
|
||||
uint32_t codePoint = c;
|
||||
if (isMultiUnitCodepoint(c, &codePoint) && unicode::IsUnicodeIDStart(codePoint)) {
|
||||
if (!matchMultiUnitCodePoint(c, &codePoint))
|
||||
goto error;
|
||||
if (codePoint && unicode::IsUnicodeIDStart(codePoint)) {
|
||||
hadUnicodeEscape = false;
|
||||
goto identifier;
|
||||
}
|
||||
@ -1575,7 +1582,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
|
||||
break;
|
||||
|
||||
uint32_t codePoint;
|
||||
if (isMultiUnitCodepoint(c, &codePoint)) {
|
||||
if (!matchMultiUnitCodePoint(c, &codePoint))
|
||||
goto error;
|
||||
if (codePoint) {
|
||||
if (!unicode::IsIdentifierPart(codePoint))
|
||||
break;
|
||||
|
||||
@ -1664,9 +1673,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
|
||||
}
|
||||
|
||||
uint32_t codePoint;
|
||||
if (isMultiUnitCodepoint(c, &codePoint) &&
|
||||
unicode::IsIdentifierStart(codePoint))
|
||||
{
|
||||
if (!matchMultiUnitCodePoint(c, &codePoint))
|
||||
goto error;
|
||||
if (codePoint && unicode::IsIdentifierStart(codePoint)) {
|
||||
reportError(JSMSG_IDSTART_AFTER_NUMBER);
|
||||
goto error;
|
||||
}
|
||||
@ -1789,9 +1798,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* ttp, Mod
|
||||
}
|
||||
|
||||
uint32_t codePoint;
|
||||
if (isMultiUnitCodepoint(c, &codePoint) &&
|
||||
unicode::IsIdentifierStart(codePoint))
|
||||
{
|
||||
if (!matchMultiUnitCodePoint(c, &codePoint))
|
||||
goto error;
|
||||
if (codePoint && unicode::IsIdentifierStart(codePoint)) {
|
||||
reportError(JSMSG_IDSTART_AFTER_NUMBER);
|
||||
goto error;
|
||||
}
|
||||
|
@ -1046,7 +1046,7 @@ class TokenStreamChars<char16_t, AnyCharsAccess>
|
||||
|
||||
using typename GeneralCharsBase::TokenStreamSpecific;
|
||||
|
||||
bool matchTrailForLeadSurrogate(char16_t lead, uint32_t* codePoint);
|
||||
void matchMultiUnitCodePointSlow(char16_t lead, uint32_t* codePoint);
|
||||
|
||||
protected:
|
||||
using GeneralCharsBase::anyCharsAccess;
|
||||
@ -1056,11 +1056,29 @@ class TokenStreamChars<char16_t, AnyCharsAccess>
|
||||
|
||||
using GeneralCharsBase::GeneralCharsBase;
|
||||
|
||||
MOZ_ALWAYS_INLINE bool isMultiUnitCodepoint(char16_t c, uint32_t* codepoint) {
|
||||
// |c| must be the code unit just gotten. If it and the subsequent code
|
||||
// unit form a valid surrogate pair, get the second code unit, set
|
||||
// |*codePoint| to the code point encoded by the surrogate pair, and return
|
||||
// true. Otherwise do not get a second code unit, set |*codePoint = 0|,
|
||||
// and return true.
|
||||
//
|
||||
// ECMAScript specifically requires that unpaired UTF-16 surrogates be
|
||||
// treated as the corresponding code point and not as an error. See
|
||||
// <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>.
|
||||
// Therefore this function always returns true. The |bool| return type
|
||||
// exists so that a future UTF-8 |TokenStreamChars| can treat malformed
|
||||
// multi-code unit UTF-8 sequences as errors. (Because ECMAScript only
|
||||
// interprets UTF-16 inputs, the process of translating the UTF-8 to UTF-16
|
||||
// would fail, so no script should execute. Technically, we shouldn't even
|
||||
// be tokenizing -- but it probably isn't realistic to assume every user
|
||||
// correctly passes only valid UTF-8, at least not without better types in
|
||||
// our codebase for strings that by construction only contain valid UTF-8.)
|
||||
MOZ_ALWAYS_INLINE bool matchMultiUnitCodePoint(char16_t c, uint32_t* codePoint) {
|
||||
if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c)))
|
||||
return false;
|
||||
|
||||
return matchTrailForLeadSurrogate(c, codepoint);
|
||||
*codePoint = 0;
|
||||
else
|
||||
matchMultiUnitCodePointSlow(c, codePoint);
|
||||
return true;
|
||||
}
|
||||
|
||||
void ungetCodePointIgnoreEOL(uint32_t codePoint);
|
||||
@ -1144,7 +1162,7 @@ class MOZ_STACK_CLASS TokenStreamSpecific
|
||||
using CharsSharedBase::atomizeChars;
|
||||
using CharsSharedBase::copyTokenbufTo;
|
||||
using GeneralCharsBase::getCharIgnoreEOL;
|
||||
using CharsBase::isMultiUnitCodepoint;
|
||||
using CharsBase::matchMultiUnitCodePoint;
|
||||
using CharsSharedBase::tokenbuf;
|
||||
using GeneralCharsBase::ungetChar;
|
||||
using CharsSharedBase::ungetCharIgnoreEOL;
|
||||
|
Loading…
Reference in New Issue
Block a user