mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-28 20:55:39 +00:00
Bug 1478045 - Implement SourceUnits::{peek,consumeKnown}CodePoint for the uncommon cases where a code point must be gotten, tested against one or more predicates, then sometimes ungotten based on those predicates. (This is unfortunately a bit subtle, but getting and ungetting is arguably worse, because ungetting has to unget a variable number of code units -- whereas peeking can compute that number of code units and then use it directly when the peeked code point is consumed, avoiding double-computation and increased potential for error.) r=arai
--HG-- extra : rebase_source : 893bb1905841f2c0cbd38e249389758894da6650
This commit is contained in:
parent
42d3c2092f
commit
f124682138
@ -511,6 +511,27 @@ TokenStreamAnyChars::undoInternalUpdateLineInfoForEOL()
|
||||
lineno--;
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
|
||||
template<>
|
||||
inline void
|
||||
SourceUnits<char16_t>::assertNextCodePoint(const PeekedCodePoint<char16_t>& peeked)
|
||||
{
|
||||
char32_t c = peeked.codePoint();
|
||||
if (c < unicode::NonBMPMin) {
|
||||
MOZ_ASSERT(peeked.lengthInUnits() == 1);
|
||||
MOZ_ASSERT(ptr[0] == c);
|
||||
} else {
|
||||
MOZ_ASSERT(peeked.lengthInUnits() == 2);
|
||||
char16_t lead, trail;
|
||||
unicode::UTF16Encode(c, &lead, &trail);
|
||||
MOZ_ASSERT(ptr[0] == lead);
|
||||
MOZ_ASSERT(ptr[1] == trail);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // DEBUG
|
||||
|
||||
template<class AnyCharsAccess>
|
||||
bool
|
||||
TokenStreamChars<char16_t, AnyCharsAccess>::getCodePoint(int32_t* cp)
|
||||
@ -1843,7 +1864,7 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* const tt
|
||||
// This loop runs more than once only when whitespace or comments are
|
||||
// encountered.
|
||||
do {
|
||||
int32_t unit = getCodeUnit();
|
||||
int32_t unit = peekCodeUnit();
|
||||
if (MOZ_UNLIKELY(unit == EOF)) {
|
||||
MOZ_ASSERT(this->sourceUnits.atEnd());
|
||||
anyCharsAccess().flags.isEOF = true;
|
||||
@ -1859,16 +1880,25 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* const tt
|
||||
// a variable number of code points, it's easier to assume it's an
|
||||
// identifier and maybe do a little wasted work, than to unget and
|
||||
// compute and reget if whitespace.
|
||||
TokenStart start(this->sourceUnits, -1);
|
||||
const CharT* identStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
|
||||
TokenStart start(this->sourceUnits, 0);
|
||||
const CharT* identStart = this->sourceUnits.addressOfNextCodeUnit();
|
||||
|
||||
int32_t codePoint;
|
||||
if (!getNonAsciiCodePoint(unit, &codePoint))
|
||||
PeekedCodePoint<CharT> peeked = this->sourceUnits.peekCodePoint();
|
||||
if (peeked.isNone()) {
|
||||
int32_t bad;
|
||||
MOZ_ALWAYS_FALSE(getCodePoint(&bad));
|
||||
return badToken();
|
||||
}
|
||||
|
||||
char32_t cp = peeked.codePoint();
|
||||
if (unicode::IsSpaceOrBOM2(cp)) {
|
||||
this->sourceUnits.consumeKnownCodePoint(peeked);
|
||||
if (IsLineTerminator(cp)) {
|
||||
if (!updateLineInfoForEOL())
|
||||
return badToken();
|
||||
|
||||
if (unicode::IsSpaceOrBOM2(codePoint)) {
|
||||
if (codePoint == '\n')
|
||||
anyCharsAccess().updateFlagsForEOL();
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
@ -1882,14 +1912,22 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* const tt
|
||||
"!IsUnicodeIDStart('_'), ensure that '_' is never "
|
||||
"handled here");
|
||||
|
||||
if (unicode::IsUnicodeIDStart(uint32_t(codePoint)))
|
||||
return identifierName(start, identStart, IdentifierEscapes::None, modifier, ttp);
|
||||
if (MOZ_LIKELY(unicode::IsUnicodeIDStart(cp))) {
|
||||
this->sourceUnits.consumeKnownCodePoint(peeked);
|
||||
MOZ_ASSERT(!IsLineTerminator(cp),
|
||||
"IdentifierStart must guarantee !IsLineTerminator "
|
||||
"or else we'll fail to maintain line-info/flags "
|
||||
"for EOL here");
|
||||
|
||||
return identifierName(start, identStart, IdentifierEscapes::None, modifier, ttp);
|
||||
}
|
||||
|
||||
ungetCodePointIgnoreEOL(codePoint);
|
||||
error(JSMSG_ILLEGAL_CHARACTER);
|
||||
return badToken();
|
||||
} // !isAsciiCodePoint(unit)
|
||||
|
||||
consumeKnownCodeUnit(unit);
|
||||
|
||||
// Get the token kind, based on the first char. The ordering of c1kind
|
||||
// comparison is based on the frequency of tokens in real code:
|
||||
// Parsemark (which represents typical JS code on the web) and the
|
||||
|
@ -195,6 +195,7 @@
|
||||
#include <algorithm>
|
||||
#include <stdarg.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "jspubtd.h"
|
||||
@ -969,6 +970,125 @@ IsLineTerminator(char16_t unit)
|
||||
return IsLineTerminator(static_cast<char32_t>(unit));
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
struct SourceUnitTraits;
|
||||
|
||||
template<>
|
||||
struct SourceUnitTraits<char16_t>
|
||||
{
|
||||
public:
|
||||
static constexpr uint8_t maxUnitsLength = 2;
|
||||
|
||||
static constexpr size_t lengthInUnits(char32_t codePoint) {
|
||||
return codePoint < unicode::NonBMPMin ? 1 : 2;
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct SourceUnitTraits<mozilla::Utf8Unit>
|
||||
{
|
||||
public:
|
||||
static constexpr uint8_t maxUnitsLength = 4;
|
||||
|
||||
static constexpr size_t lengthInUnits(char32_t codePoint) {
|
||||
return codePoint < 0x80
|
||||
? 1
|
||||
: codePoint < 0x800
|
||||
? 2
|
||||
: codePoint < 0x10000
|
||||
? 3
|
||||
: 4;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* PeekedCodePoint represents the result of peeking ahead in some source text
|
||||
* to determine the next validly-encoded code point.
|
||||
*
|
||||
* If there isn't a valid code point, then |isNone()|.
|
||||
*
|
||||
* But if there *is* a valid code point, then |!isNone()|, the code point has
|
||||
* value |codePoint()| and its length in code units is |lengthInUnits()|.
|
||||
*
|
||||
* Conceptually, this class is |Maybe<struct { char32_t v; uint8_t len; }>|.
|
||||
*/
|
||||
template<typename CharT>
|
||||
class PeekedCodePoint final
|
||||
{
|
||||
char32_t codePoint_ = 0;
|
||||
uint8_t lengthInUnits_ = 0;
|
||||
|
||||
private:
|
||||
using SourceUnitTraits = frontend::SourceUnitTraits<CharT>;
|
||||
|
||||
PeekedCodePoint() = default;
|
||||
|
||||
public:
|
||||
/**
|
||||
* Create a peeked code point with the given value and length in code
|
||||
* units.
|
||||
*
|
||||
* While the latter value is computable from the former for both UTF-8 and
|
||||
* JS's version of UTF-16, the caller likely computed a length in units in
|
||||
* the course of determining the peeked value. Passing both here avoids
|
||||
* recomputation and lets us do a consistency-checking assertion.
|
||||
*/
|
||||
PeekedCodePoint(char32_t codePoint, uint8_t lengthInUnits)
|
||||
: codePoint_(codePoint),
|
||||
lengthInUnits_(lengthInUnits)
|
||||
{
|
||||
MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
|
||||
MOZ_ASSERT(lengthInUnits != 0, "bad code point length");
|
||||
MOZ_ASSERT(lengthInUnits == SourceUnitTraits::lengthInUnits(codePoint));
|
||||
}
|
||||
|
||||
/** Create a PeekedCodeUnit that represents no valid code point. */
|
||||
static PeekedCodePoint none() {
|
||||
return PeekedCodePoint();
|
||||
}
|
||||
|
||||
/** True if no code point was found, false otherwise. */
|
||||
bool isNone() const {
|
||||
return lengthInUnits_ == 0;
|
||||
}
|
||||
|
||||
/** If a code point was found, its value. */
|
||||
char32_t codePoint() const {
|
||||
MOZ_ASSERT(!isNone());
|
||||
return codePoint_;
|
||||
}
|
||||
|
||||
/** If a code point was found, its length in code units. */
|
||||
uint8_t lengthInUnits() const {
|
||||
MOZ_ASSERT(!isNone());
|
||||
return lengthInUnits_;
|
||||
}
|
||||
};
|
||||
|
||||
inline PeekedCodePoint<char16_t>
|
||||
PeekCodePoint(const char16_t* const ptr, const char16_t* const end)
|
||||
{
|
||||
if (MOZ_UNLIKELY(ptr >= end))
|
||||
return PeekedCodePoint<char16_t>::none();
|
||||
|
||||
char16_t lead = ptr[0];
|
||||
|
||||
char32_t c;
|
||||
uint8_t len;
|
||||
if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead)) ||
|
||||
MOZ_UNLIKELY(ptr + 1 >= end ||
|
||||
!unicode::IsTrailSurrogate(ptr[1])))
|
||||
{
|
||||
c = lead;
|
||||
len = 1;
|
||||
} else {
|
||||
c = unicode::UTF16Decode(lead, ptr[1]);
|
||||
len = 2;
|
||||
}
|
||||
|
||||
return PeekedCodePoint<char16_t>(c, len);
|
||||
}
|
||||
|
||||
// This is the low-level interface to the JS source code buffer. It just gets
|
||||
// raw Unicode code units -- 16-bit char16_t units of source text that are not
|
||||
// (always) full code points, and 8-bit units of UTF-8 source text soon.
|
||||
@ -1039,6 +1159,47 @@ class SourceUnits
|
||||
return *ptr; // this will nullptr-crash if poisoned
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine the next code point in source text. The code point is not
|
||||
* normalized: '\r', '\n', '\u2028', and '\u2029' are returned literally.
|
||||
* If there is no next code point because |atEnd()|, or if an encoding
|
||||
* error is encountered, return a |PeekedCodePoint| that |isNone()|.
|
||||
*
|
||||
* This function does not report errors: code that attempts to get the next
|
||||
* code point must report any error.
|
||||
*
|
||||
* If a next code point is found, it may be consumed by passing it to
|
||||
* |consumeKnownCodePoint|.
|
||||
*/
|
||||
PeekedCodePoint<CharT> peekCodePoint() const {
|
||||
return PeekCodePoint(ptr, limit_);
|
||||
}
|
||||
|
||||
private:
|
||||
#ifdef DEBUG
|
||||
void assertNextCodePoint(const PeekedCodePoint<CharT>& peeked);
|
||||
#endif
|
||||
|
||||
public:
|
||||
/**
|
||||
* Consume a peeked code point that |!isNone()|.
|
||||
*
|
||||
* This call DOES NOT UPDATE LINE-STATUS. You may need to call
|
||||
* |updateLineInfoForEOL()| and |updateFlagsForEOL()| if this consumes a
|
||||
* LineTerminator. Note that if this consumes '\r', you also must consume
|
||||
* an optional '\n' (i.e. a full LineTerminatorSequence) before doing so.
|
||||
*/
|
||||
void consumeKnownCodePoint(const PeekedCodePoint<CharT>& peeked) {
|
||||
MOZ_ASSERT(!peeked.isNone());
|
||||
MOZ_ASSERT(peeked.lengthInUnits() <= remaining());
|
||||
|
||||
#ifdef DEBUG
|
||||
assertNextCodePoint(peeked);
|
||||
#endif
|
||||
|
||||
ptr += peeked.lengthInUnits();
|
||||
}
|
||||
|
||||
/** Match |n| hexadecimal digits and store their value in |*out|. */
|
||||
bool matchHexDigits(uint8_t n, char16_t* out) {
|
||||
MOZ_ASSERT(ptr, "shouldn't peek into poisoned SourceUnits");
|
||||
|
Loading…
Reference in New Issue
Block a user