Bug 1478045 - Implement SourceUnits::{peek,consumeKnown}CodePoint for the uncommon cases where a code point must be gotten, tested against one or more predicates, then sometimes ungotten based on those predicates. (This is unfortunately a bit subtle, but getting and ungetting is arguably worse, because ungetting has to unget a variable number of code units -- whereas peeking can compute that number of code units and then use it directly when the peeked code point is consumed, avoiding double-computation and increased potential for error.) r=arai

--HG--
extra : rebase_source : 893bb1905841f2c0cbd38e249389758894da6650
This commit is contained in:
Jeff Walden 2018-07-25 14:24:23 -07:00
parent 42d3c2092f
commit f124682138
2 changed files with 209 additions and 10 deletions

View File

@ -511,6 +511,27 @@ TokenStreamAnyChars::undoInternalUpdateLineInfoForEOL()
lineno--;
}
#ifdef DEBUG
// Debug-only sanity check: verify that the code units at |ptr| really are the
// UTF-16 encoding of |peeked|, and that |peeked| records the matching length
// (one unit below unicode::NonBMPMin, otherwise a lead/trail pair).
template<>
inline void
SourceUnits<char16_t>::assertNextCodePoint(const PeekedCodePoint<char16_t>& peeked)
{
    const char32_t c = peeked.codePoint();

    if (c >= unicode::NonBMPMin) {
        // Two-unit case: re-encode the code point and compare both units
        // against the text about to be consumed.
        MOZ_ASSERT(peeked.lengthInUnits() == 2);
        char16_t lead, trail;
        unicode::UTF16Encode(c, &lead, &trail);
        MOZ_ASSERT(ptr[0] == lead);
        MOZ_ASSERT(ptr[1] == trail);
        return;
    }

    // One-unit case: the code unit is the code point itself.
    MOZ_ASSERT(peeked.lengthInUnits() == 1);
    MOZ_ASSERT(ptr[0] == c);
}
#endif // DEBUG
template<class AnyCharsAccess>
bool
TokenStreamChars<char16_t, AnyCharsAccess>::getCodePoint(int32_t* cp)
@ -1843,7 +1864,7 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* const tt
// This loop runs more than once only when whitespace or comments are
// encountered.
do {
int32_t unit = getCodeUnit();
int32_t unit = peekCodeUnit();
if (MOZ_UNLIKELY(unit == EOF)) {
MOZ_ASSERT(this->sourceUnits.atEnd());
anyCharsAccess().flags.isEOF = true;
@ -1859,16 +1880,25 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* const tt
// a variable number of code points, it's easier to assume it's an
// identifier and maybe do a little wasted work, than to unget and
// compute and reget if whitespace.
TokenStart start(this->sourceUnits, -1);
const CharT* identStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
TokenStart start(this->sourceUnits, 0);
const CharT* identStart = this->sourceUnits.addressOfNextCodeUnit();
int32_t codePoint;
if (!getNonAsciiCodePoint(unit, &codePoint))
PeekedCodePoint<CharT> peeked = this->sourceUnits.peekCodePoint();
if (peeked.isNone()) {
int32_t bad;
MOZ_ALWAYS_FALSE(getCodePoint(&bad));
return badToken();
}
char32_t cp = peeked.codePoint();
if (unicode::IsSpaceOrBOM2(cp)) {
this->sourceUnits.consumeKnownCodePoint(peeked);
if (IsLineTerminator(cp)) {
if (!updateLineInfoForEOL())
return badToken();
if (unicode::IsSpaceOrBOM2(codePoint)) {
if (codePoint == '\n')
anyCharsAccess().updateFlagsForEOL();
}
continue;
}
@ -1882,14 +1912,22 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* const tt
"!IsUnicodeIDStart('_'), ensure that '_' is never "
"handled here");
if (unicode::IsUnicodeIDStart(uint32_t(codePoint)))
return identifierName(start, identStart, IdentifierEscapes::None, modifier, ttp);
if (MOZ_LIKELY(unicode::IsUnicodeIDStart(cp))) {
this->sourceUnits.consumeKnownCodePoint(peeked);
MOZ_ASSERT(!IsLineTerminator(cp),
"IdentifierStart must guarantee !IsLineTerminator "
"or else we'll fail to maintain line-info/flags "
"for EOL here");
return identifierName(start, identStart, IdentifierEscapes::None, modifier, ttp);
}
ungetCodePointIgnoreEOL(codePoint);
error(JSMSG_ILLEGAL_CHARACTER);
return badToken();
} // !isAsciiCodePoint(unit)
consumeKnownCodeUnit(unit);
// Get the token kind, based on the first char. The ordering of c1kind
// comparison is based on the frequency of tokens in real code:
// Parsemark (which represents typical JS code on the web) and the

View File

@ -195,6 +195,7 @@
#include <algorithm>
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include "jspubtd.h"
@ -969,6 +970,125 @@ IsLineTerminator(char16_t unit)
return IsLineTerminator(static_cast<char32_t>(unit));
}
// Per-code-unit-type encoding facts.  Only declared here: each supported
// CharT supplies an explicit specialization exposing |maxUnitsLength| and
// |lengthInUnits(char32_t)|.
template<typename CharT>
struct SourceUnitTraits;
// UTF-16 traits: a code point occupies one code unit below
// unicode::NonBMPMin and two code units (a surrogate pair) otherwise.
template<>
struct SourceUnitTraits<char16_t>
{
    /** The most code units any single code point can occupy. */
    static constexpr uint8_t maxUnitsLength = 2;

    /** Number of char16_t units needed to encode |codePoint|. */
    static constexpr size_t lengthInUnits(char32_t codePoint) {
        return codePoint >= unicode::NonBMPMin ? 2 : 1;
    }
};
// UTF-8 traits: a code point occupies between one and four code units,
// selected by the thresholds 0x80, 0x800 and 0x10000.
template<>
struct SourceUnitTraits<mozilla::Utf8Unit>
{
    /** The most code units any single code point can occupy. */
    static constexpr uint8_t maxUnitsLength = 4;

    /** Number of UTF-8 units needed to encode |codePoint|. */
    static constexpr size_t lengthInUnits(char32_t codePoint) {
        // Tested from the widest encoding down to the narrowest; kept as a
        // single return expression.
        return codePoint >= 0x10000
               ? 4
               : codePoint >= 0x800
               ? 3
               : codePoint >= 0x80
               ? 2
               : 1;
    }
};
/**
 * The outcome of looking ahead in source text for the next validly-encoded
 * code point, without consuming anything.
 *
 * A PeekedCodePoint is in one of two states:
 *
 *   - |isNone()|: no valid code point was found.
 *   - |!isNone()|: a code point was found; |codePoint()| is its value and
 *     |lengthInUnits()| is how many CharT code units encode it.
 *
 * Conceptually, this class is |Maybe<struct { char32_t v; uint8_t len; }>|,
 * with a stored length of zero standing in for "nothing".
 */
template<typename CharT>
class PeekedCodePoint final
{
    char32_t codePoint_ = 0;
    uint8_t lengthInUnits_ = 0;

  private:
    using SourceUnitTraits = frontend::SourceUnitTraits<CharT>;

    // The empty state is only reachable through |none()|.
    PeekedCodePoint() = default;

  public:
    /**
     * Build a peeked code point from its value and its length in code units.
     *
     * The length could be derived from the value, but callers typically
     * learn it while decoding.  Taking both avoids recomputing it and lets
     * debug builds cross-check the two against SourceUnitTraits.
     */
    PeekedCodePoint(char32_t codePoint, uint8_t lengthInUnits)
      : codePoint_(codePoint),
        lengthInUnits_(lengthInUnits)
    {
        MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
        MOZ_ASSERT(lengthInUnits != 0, "bad code point length");
        MOZ_ASSERT(lengthInUnits == SourceUnitTraits::lengthInUnits(codePoint));
    }

    /** Create a PeekedCodePoint that represents no valid code point. */
    static PeekedCodePoint none() {
        return PeekedCodePoint{};
    }

    /** True when no code point was found, false when one was. */
    bool isNone() const {
        return !lengthInUnits_;
    }

    /** The value of the code point that was found.  Requires |!isNone()|. */
    char32_t codePoint() const {
        MOZ_ASSERT(!isNone());
        return codePoint_;
    }

    /**
     * How many code units encode the code point that was found.  Requires
     * |!isNone()|.
     */
    uint8_t lengthInUnits() const {
        MOZ_ASSERT(!isNone());
        return lengthInUnits_;
    }
};
/**
 * Decode, without consuming, the code point that starts at |ptr| in the
 * UTF-16 text |[ptr, end)|.
 *
 * Returns |PeekedCodePoint<char16_t>::none()| only when the range is empty.
 * A lead surrogate immediately followed by a trail surrogate yields the
 * combined code point with length 2.  Every other code unit -- including a
 * lead surrogate with no trail surrogate after it -- is returned as-is with
 * length 1.
 */
inline PeekedCodePoint<char16_t>
PeekCodePoint(const char16_t* const ptr, const char16_t* const end)
{
    if (MOZ_UNLIKELY(ptr >= end))
        return PeekedCodePoint<char16_t>::none();

    const char16_t first = ptr[0];

    // Common case: anything that cannot begin a surrogate pair stands alone.
    if (MOZ_LIKELY(!unicode::IsLeadSurrogate(first)))
        return PeekedCodePoint<char16_t>(first, 1);

    // |ptr[1]| is inspected only after confirming it lies inside the range.
    const bool trailFollows = ptr + 1 < end && unicode::IsTrailSurrogate(ptr[1]);
    if (MOZ_UNLIKELY(!trailFollows))
        return PeekedCodePoint<char16_t>(first, 1);

    return PeekedCodePoint<char16_t>(unicode::UTF16Decode(first, ptr[1]), 2);
}
// This is the low-level interface to the JS source code buffer. It just gets
// raw Unicode code units -- 16-bit char16_t units of source text that are not
// (always) full code points, and 8-bit units of UTF-8 source text soon.
@ -1039,6 +1159,47 @@ class SourceUnits
return *ptr; // this will nullptr-crash if poisoned
}
/**
* Determine the next code point in source text without consuming it, by
* forwarding the unread range |[ptr, limit_)| to |PeekCodePoint|.
*
* The code point is not normalized: '\r', '\n', '\u2028', and '\u2029' are
* returned literally.  If there is no next code point because |atEnd()|, or
* if an encoding error is encountered, return a |PeekedCodePoint| that
* |isNone()|.
*
* NOTE: the char16_t |PeekCodePoint| overload returns a lead surrogate that
* is not followed by a trail surrogate as a one-unit code point, so for
* UTF-16 text it produces |isNone()| only when |ptr >= limit_|.
*
* This function does not report errors: code that attempts to get the next
* code point must report any error.
*
* If a next code point is found, it may be consumed by passing it to
* |consumeKnownCodePoint|.
*/
PeekedCodePoint<CharT> peekCodePoint() const {
return PeekCodePoint(ptr, limit_);
}
private:
#ifdef DEBUG
// Debug-only helper called by |consumeKnownCodePoint| before it advances
// |ptr|.  Only declared here; the definition is supplied per CharT.
void assertNextCodePoint(const PeekedCodePoint<CharT>& peeked);
#endif
public:
/**
* Consume a peeked code point that |!isNone()|, advancing |ptr| by its
* |lengthInUnits()| code units.
*
* Debug builds assert that |peeked| is not none, that its length does not
* exceed |remaining()|, and (via |assertNextCodePoint|) that |peeked| agrees
* with the upcoming code units.  Release builds perform no checks.
*
* This call DOES NOT UPDATE LINE-STATUS.  You may need to call
* |updateLineInfoForEOL()| and |updateFlagsForEOL()| if this consumes a
* LineTerminator.  Note that if this consumes '\r', you also must consume
* an optional '\n' (i.e. a full LineTerminatorSequence) before doing so.
*/
void consumeKnownCodePoint(const PeekedCodePoint<CharT>& peeked) {
MOZ_ASSERT(!peeked.isNone());
MOZ_ASSERT(peeked.lengthInUnits() <= remaining());
#ifdef DEBUG
// Cross-check |peeked| against the code units about to be skipped.
assertNextCodePoint(peeked);
#endif
// Skip the whole code point using the length computed when peeking.
ptr += peeked.lengthInUnits();
}
/** Match |n| hexadecimal digits and store their value in |*out|. */
bool matchHexDigits(uint8_t n, char16_t* out) {
MOZ_ASSERT(ptr, "shouldn't peek into poisoned SourceUnits");