Bug 1478170 - Implement getNonAsciiCodePointDontNormalize for UTF-8. r=arai

--HG--
extra : rebase_source : ab9e28fc001eaab9af3bcb072ce783a88d0f7f07
This commit is contained in:
Jeff Walden 2018-07-18 22:46:47 -07:00
parent ab4f689aa3
commit cf12902522
4 changed files with 411 additions and 0 deletions

View File

@ -571,6 +571,193 @@ SourceUnits<char16_t>::assertNextCodePoint(const PeekedCodePoint<char16_t>& peek
#endif // DEBUG
/**
 * Report the compile error |errorNumber| (formatted with the trailing
 * variadic arguments) for an invalid UTF-8 code point that starts at the
 * current offset of |sourceUnits|, and attach a JSMSG_BAD_CODE_UNITS note
 * listing the offending code units in hexadecimal.
 *
 * |relevantUnits| is how many code units, starting at the current offset, are
 * listed in the note.  It must be in the range [1, 4].  Those units are
 * consumed from |sourceUnits| while the note text is built.
 *
 * If computing the line of context, allocating the notes, or adding the note
 * fails, the function returns without reporting |errorNumber|.
 */
template<class AnyCharsAccess>
MOZ_COLD void
TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(uint8_t relevantUnits,
                                                                  unsigned errorNumber, ...)
{
    va_list args;
    va_start(args, errorNumber);

    // Single-pass loop: every failure path |break|s so that control always
    // reaches the |va_end| below.
    do {
        size_t offset = this->sourceUnits.offset();

        ErrorMetadata err;

        TokenStreamAnyChars& anyChars = anyCharsAccess();

        if (bool hasLineOfContext = anyChars.fillExcludingContext(&err, offset)) {
            if (!internalComputeLineOfContext(&err, offset))
                break;

            // As this is an encoding error, the computed window-end must be
            // identical to the location of the error -- any further on and the
            // window would contain invalid Unicode.
            MOZ_ASSERT_IF(err.lineOfContext != nullptr,
                          err.lineLength == err.tokenOffset);
        }

        auto notes = MakeUnique<JSErrorNotes>();
        if (!notes) {
            ReportOutOfMemory(anyChars.cx);
            break;
        }

        // The largest encoding of a UTF-8 code point is 4 units.  (Encoding an
        // obsolete 5- or 6-byte code point will complain only about a bad lead
        // code unit.)
        constexpr size_t MaxUnits = 4;
        constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");

        // Each unit occupies "0xHH" plus one separator; the final separator is
        // overwritten by the terminator.
        static_assert(MaxWidth == MaxUnits * 5,
                      "buffer must hold exactly MaxUnits five-character groups");

        MOZ_ASSERT(relevantUnits > 0);
        MOZ_ASSERT(relevantUnits <= MaxUnits,
                   "badUnitsStr only has room for four code units");

        char badUnitsStr[MaxWidth];
        char* ptr = badUnitsStr;
        while (relevantUnits > 0) {
            byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
            ptr[4] = ' ';

            ptr += 5;
            relevantUnits--;
        }

        // Replace the trailing separator with the null terminator.
        ptr[-1] = '\0';

        uint32_t line, column;
        anyChars.srcCoords.lineNumAndColumnIndex(offset, &line, &column);

        if (!notes->addNoteASCII(anyChars.cx, anyChars.getFilename(), line, column,
                                 GetErrorMessage, nullptr, JSMSG_BAD_CODE_UNITS, badUnitsStr))
        {
            break;
        }

        ReportCompileError(anyChars.cx, std::move(err), std::move(notes), JSREPORT_ERROR,
                           errorNumber, args);
    } while (false);

    va_end(args);
}
/**
 * Report JSMSG_BAD_LEADING_UTF8_UNIT for |lead|, formatting its value as
 * "0xHH" for the message and listing that single unit in the error note.
 */
template<class AnyCharsAccess>
MOZ_COLD void
TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(Utf8Unit lead)
{
    // "0xHH" plus the null terminator.
    char leadText[5];
    byteToTerminatedString(lead.toUint8(), leadText);

    constexpr uint8_t UnitsInvolved = 1;
    internalEncodingError(UnitsInvolved, JSMSG_BAD_LEADING_UTF8_UNIT, leadText);
}
/**
 * Report JSMSG_NOT_ENOUGH_CODE_UNITS: the code point beginning with |lead|
 * needs |required| code units but only |remaining| were available.
 *
 * Both counts include the lead unit, so one is subtracted from each to obtain
 * the trailing-unit counts substituted into the message.
 */
template<class AnyCharsAccess>
MOZ_COLD void
TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(Utf8Unit lead,
                                                           uint8_t remaining, uint8_t required)
{
    MOZ_ASSERT(required == 2 || required == 3 || required == 4);
    MOZ_ASSERT(remaining < 4);
    MOZ_ASSERT(remaining < required);

    // "0xHH" plus the null terminator.
    char leadText[5];
    byteToTerminatedString(lead.toUint8(), leadText);

    // Single-digit counts: for values below 4 the hex digit produced by
    // |toHexChar| is also the decimal digit.
    const char trailingNeeded[] = { toHexChar(required - 1), '\0' };
    const char trailingFound[] = { toHexChar(remaining - 1), '\0' };

    // Completes "byte" in the message: singular when exactly one trailing
    // unit was present, plural otherwise.
    const char* presentSuffix = (remaining == 2) ? " was" : "s were";

    internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS,
                          leadText, trailingNeeded, trailingFound, presentSuffix);
}
/**
 * Report JSMSG_BAD_TRAILING_UTF8_UNIT, naming |badUnit| (formatted as "0xHH")
 * in the message and listing |unitsObserved| code units in the error note.
 */
template<class AnyCharsAccess>
MOZ_COLD void
TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(Utf8Unit badUnit,
                                                            uint8_t unitsObserved)
{
    const uint8_t unitValue = badUnit.toUint8();

    // "0xHH" plus the null terminator.
    char unitText[5];
    byteToTerminatedString(unitValue, unitText);

    internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT, unitText);
}
/**
 * Report JSMSG_FORBIDDEN_UTF8_CODE_POINT for |codePoint|, which was decoded
 * from |codePointLength| structurally valid code units but is not permitted.
 * |reason| is substituted into the message after "because".
 */
template<class AnyCharsAccess>
MOZ_COLD void
TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(uint32_t codePoint,
                                                                          uint8_t codePointLength,
                                                                          const char* reason)
{
    // Construct a string like "0x203D" (including null terminator) to include
    // in the error message.  The string is written back to front into an
    // adequately sized |char| array: least significant nibbles are shifted
    // off the number and their hex digits written until none remain, then
    // "0x" is prefixed.  |codePointStr| always points at the start of the
    // string built so far, within |codePointCharsArray|'s bounds.

    // 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
    // bits in a four-byte UTF-8 code unit sequence.
    constexpr size_t MaxHexSize = sizeof("0x1F" "FFFF"); // including '\0'
    char codePointCharsArray[MaxHexSize];

    char* codePointStr = codePointCharsArray + ArrayLength(codePointCharsArray);
    *--codePointStr = '\0';

    // Use do-while rather than while so that at least one digit is always
    // written: a zero |codePoint| (e.g. an overlong encoding of U+0000) must
    // be rendered as "0x0", not as a bare "0x".
    uint32_t copy = codePoint;
    do {
        MOZ_ASSERT(codePointCharsArray < codePointStr);
        *--codePointStr = toHexChar(copy & 0xF);
        copy >>= 4;
    } while (copy);

    // Room for the "0x" prefix must remain.
    MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr);

    *--codePointStr = 'x';
    *--codePointStr = '0';

    internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT, codePointStr, reason);
}
/**
 * Decode the remainder of the non-ASCII code point whose lead unit |lead| has
 * just been consumed from |sourceUnits|.
 *
 * On success the code point's units are consumed, |*codePoint| receives the
 * decoded value, and true is returned.  On failure the matching error handler
 * has already been invoked and false is returned; |*codePoint| is untouched.
 */
template<class AnyCharsAccess>
MOZ_MUST_USE bool
TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(Utf8Unit lead,
                                                                              char32_t* codePoint)
{
    auto onBadLeadUnit = [this, &lead]() {
        this->badLeadUnit(lead);
    };

    auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) {
        this->notEnoughUnits(lead, remaining, required);
    };

    auto onBadTrailingUnit = [this](uint8_t unitsObserved) {
        // The decoder rewinds to the lead unit before calling any handler, so
        // the offending unit is the last of the |unitsObserved| units starting
        // at the current position.  Report that unit, not the lead unit.
        MOZ_ASSERT(unitsObserved >= 2,
                   "a trailing unit is never the first unit of a code point");
        Utf8Unit badUnit = this->sourceUnits.current()[unitsObserved - 1];
        this->badTrailingUnit(badUnit, unitsObserved);
    };

    auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) {
        this->badCodePoint(badCodePoint, unitsObserved);
    };

    auto onNotShortestForm = [this](char32_t badCodePoint, uint8_t unitsObserved) {
        this->notShortestForm(badCodePoint, unitsObserved);
    };

    // If a valid code point is decoded, this function call consumes its code
    // units.  If not, it ungets the lead code unit and invokes the right error
    // handler, so on failure we must immediately return false.
    SourceUnitsIterator iter(this->sourceUnits);
    Maybe<char32_t> maybeCodePoint =
        DecodeOneUtf8CodePointInline(lead, &iter, SourceUnitsEnd(),
                                     onBadLeadUnit, onNotEnoughUnits, onBadTrailingUnit,
                                     onBadCodePoint, onNotShortestForm);
    if (maybeCodePoint.isNothing())
        return false;

    *codePoint = maybeCodePoint.value();
    return true;
}
template<class AnyCharsAccess>
bool
TokenStreamChars<char16_t, AnyCharsAccess>::getCodePoint(int32_t* cp)

View File

@ -1163,6 +1163,10 @@ class SourceUnits
return base_ + (offset - startOffset_);
}
// Returns the |ptr| member unchanged.
// NOTE(review): |SourceUnitsIterator| treats this as the address of the next
// code unit to be consumed -- confirm against the rest of |SourceUnits|.
const CharT* current() const {
return ptr;
}
// Returns the |limit_| member unchanged.
// NOTE(review): presumably the end of the source units -- verify.
const CharT* limit() const {
return limit_;
}
@ -1649,6 +1653,88 @@ class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
protected:
// These APIs are only usable by UTF-8-specific code.
using typename CharsBase::SourceUnits;
/**
 * A mutable iterator-like adapter over |SourceUnits| that maps the operators
 * used on it (|*iter++|, |iter -= n|, |iter[-1]|) onto
 * |SourceUnits::getCodeUnit()| and related calls.
 *
 * This class is expected to be used in concert with |SourceUnitsEnd|.
 */
class SourceUnitsIterator
{
    SourceUnits& sourceUnits_;

#ifdef DEBUG
    // Set only in the copy returned by the post-increment operator: the
    // address of the next source text code unit at the time of the
    // increment.  Cleared again when that copy is dereferenced.
    mutable mozilla::Maybe<const mozilla::Utf8Unit*> currentBeforePostIncrement_;
#endif

    // Debug-only check shared by every operation except dereferencing: this
    // iterator must not be the pending result of a post-increment.
    void assertNotPostIncremented() const {
        MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
                   "the only valid operation on a post-incremented "
                   "iterator is dereferencing a single time");
    }

  public:
    explicit SourceUnitsIterator(SourceUnits& sourceUnits)
      : sourceUnits_(sourceUnits)
    {}

    mozilla::Utf8Unit operator*() const {
        // Dereferencing normally yields the *next* value.  Here, though, the
        // only supported use is |*iter++|, and |operator++(int)| below has
        // already advanced |sourceUnits_| by the time this runs -- so the
        // unit wanted is the one just behind the current position.
        MOZ_ASSERT(currentBeforePostIncrement_.value() + 1 == sourceUnits_.current());
#ifdef DEBUG
        currentBeforePostIncrement_.reset();
#endif
        return sourceUnits_.previousCodeUnit();
    }

    SourceUnitsIterator operator++(int) {
        assertNotPostIncremented();

        SourceUnitsIterator beforeAdvance(*this);
#ifdef DEBUG
        beforeAdvance.currentBeforePostIncrement_.emplace(sourceUnits_.current());
#endif
        sourceUnits_.getCodeUnit();
        return beforeAdvance;
    }

    void operator-=(size_t n) {
        assertNotPostIncremented();
        sourceUnits_.unskipCodeUnits(n);
    }

    mozilla::Utf8Unit operator[](ptrdiff_t index) {
        assertNotPostIncremented();
        MOZ_ASSERT(index == -1,
                   "must only be called to verify the value of the "
                   "previous code unit");
        return sourceUnits_.previousCodeUnit();
    }

    size_t remaining() const {
        assertNotPostIncremented();
        return sourceUnits_.remaining();
    }
};
/** A sentinel representing the end of |SourceUnits| data. */
class SourceUnitsEnd {};
friend inline size_t operator-(const SourceUnitsEnd& aEnd, const SourceUnitsIterator& aIter);
protected:
// These APIs are in both SpecializedTokenStreamCharsBase specializations
// and so are usable in subclasses no matter what CharT is.
@ -1656,6 +1742,13 @@ class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
using CharsBase::CharsBase;
};
/**
 * Distance from |aIter| to the end sentinel, as reported by
 * |aIter.remaining()|.  |aEnd| is not consulted; its type alone selects this
 * overload.
 */
inline size_t
operator-(const SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>::SourceUnitsEnd& aEnd,
const SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>::SourceUnitsIterator& aIter)
{
return aIter.remaining();
}
/** A small class encapsulating computation of the start-offset of a Token. */
class TokenStart
{
@ -1963,11 +2056,122 @@ class TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>
using GeneralCharsBase = GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
using Self = TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
using typename SpecializedCharsBase::SourceUnitsEnd;
using typename SpecializedCharsBase::SourceUnitsIterator;
protected:
using GeneralCharsBase::anyCharsAccess;
using GeneralCharsBase::internalComputeLineOfContext;
using TokenStreamCharsShared::isAsciiCodePoint;
// Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(
private:
// Map |nibble| (which must be below 16) to its uppercase hexadecimal digit.
static char toHexChar(uint8_t nibble) {
    MOZ_ASSERT(nibble < 16);
    if (nibble < 10)
        return static_cast<char>('0' + nibble);
    return static_cast<char>('A' + (nibble - 10));
}
// Write the four characters "0xHH" for |n| into |str[0..3]|, high nibble
// first.  No terminator is written.
static void byteToString(uint8_t n, char* str) {
    char* out = str;
    *out++ = '0';
    *out++ = 'x';
    *out++ = toHexChar(n >> 4);
    *out = toHexChar(n & 0xF);
}
// Write "0xHH" for |n| into |str| followed by a null terminator; |str| must
// have room for five characters.
static void byteToTerminatedString(uint8_t n, char* str) {
byteToString(n, str);
str[4] = '\0';
}
/**
* Report a UTF-8 encoding-related error for a code point starting AT THE
* CURRENT OFFSET.
*
* |relevantUnits| indicates how many code units from the current offset
* are potentially relevant to the reported error, such that they may be
* included in the error message. For example, if at the current offset we
* have
*
* 0b1111'1111 ...
*
* a code unit never allowed in UTF-8, then |relevantUnits| might be 1
* because only that unit is relevant. Or if we have
*
* 0b1111'0111 0b1011'0101 0b0000'0000 ...
*
* where the first two code units are a valid prefix to a four-unit code
* point but the third unit *isn't* a valid trailing code unit, then
* |relevantUnits| might be 3.
*/
MOZ_COLD void internalEncodingError(uint8_t relevantUnits, unsigned errorNumber, ...);
// Don't use |internalEncodingError|! Use one of the elaborated functions
// that calls it, below -- all of which should be used to indicate an error
// in a code point starting AT THE CURRENT OFFSET as with
// |internalEncodingError|.
/** Report an error for an invalid lead code unit |lead|. */
MOZ_COLD void badLeadUnit(mozilla::Utf8Unit lead);
/**
* Report an error when there aren't enough code units remaining to
* constitute a full code point after |lead|: only |remaining| code units
* were available for a code point starting with |lead|, when at least
* |required| code units were required.
*/
MOZ_COLD void notEnoughUnits(mozilla::Utf8Unit lead, uint8_t remaining, uint8_t required);
/**
* Report an error for a bad trailing UTF-8 code unit, where the bad
* trailing unit was the last of |unitsObserved| units examined from the
* current offset.
*/
MOZ_COLD void badTrailingUnit(mozilla::Utf8Unit badUnit, uint8_t unitsObserved);
// Helper used for both |badCodePoint| and |notShortestForm| for code units
// that have all the requisite high bits set/unset in a manner that *could*
// encode a valid code point, but the remaining bits encoding its actual
// value do not define a permitted value.
MOZ_COLD void badStructurallyValidCodePoint(uint32_t codePoint, uint8_t codePointLength,
const char* reason);
/**
 * Report an error for a UTF-8 sequence whose decoded value |codePoint| is
 * either a UTF-16 surrogate or larger than the greatest Unicode code point.
 * |codePointLength| is the number of code units in the sequence.
 */
MOZ_COLD void badCodePoint(uint32_t codePoint, uint8_t codePointLength) {
    const bool isSurrogate = unicode::IsSurrogate(codePoint);
    MOZ_ASSERT(isSurrogate || codePoint > unicode::NonBMPMax);

    const char* reason = isSurrogate
                         ? "it's a UTF-16 surrogate"
                         : "the maximum code point is U+10FFFF";
    badStructurallyValidCodePoint(codePoint, codePointLength, reason);
}
/**
 * Report an error for a UTF-8 sequence that encodes the otherwise acceptable
 * |codePoint| using more code units (|codePointLength|) than necessary.
 */
MOZ_COLD void notShortestForm(uint32_t codePoint, uint8_t codePointLength) {
    MOZ_ASSERT(!unicode::IsSurrogate(codePoint));
    MOZ_ASSERT(codePoint <= unicode::NonBMPMax);

    const char* reason = "it wasn't encoded in shortest possible form";
    badStructurallyValidCodePoint(codePoint, codePointLength, reason);
}
protected:
using GeneralCharsBase::GeneralCharsBase;
/**
* Given the non-ASCII |lead| code unit just consumed, consume the rest of
* a non-ASCII code point. The code point is not normalized: on success
* |*codePoint| may be U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR.
*
* Report an error if an invalid code point is encountered.
*/
MOZ_MUST_USE bool
getNonAsciiCodePointDontNormalize(mozilla::Utf8Unit lead, char32_t* codePoint);
};
// TokenStream is the lexical scanner for JavaScript source text.

View File

@ -352,6 +352,13 @@ MSG_DEF(JSMSG_DEFAULT_IN_PATTERN, 0, JSEXN_SYNTAXERR, "destructuring defaul
MSG_DEF(JSMSG_BAD_NEWTARGET, 0, JSEXN_SYNTAXERR, "new.target only allowed within functions")
MSG_DEF(JSMSG_ESCAPED_KEYWORD, 0, JSEXN_SYNTAXERR, "keywords must be written literally, without embedded escapes")
// UTF-8 source text encoding errors
MSG_DEF(JSMSG_BAD_LEADING_UTF8_UNIT, 1, JSEXN_SYNTAXERR, "{0} byte doesn't begin a valid UTF-8 code point")
MSG_DEF(JSMSG_NOT_ENOUGH_CODE_UNITS, 4, JSEXN_SYNTAXERR, "{0} byte in UTF-8 must be followed by {1} bytes, but {2} byte{3} present")
MSG_DEF(JSMSG_BAD_TRAILING_UTF8_UNIT, 1, JSEXN_SYNTAXERR, "bad trailing UTF-8 byte {0} doesn't match the pattern 0b10xxxxxx")
MSG_DEF(JSMSG_FORBIDDEN_UTF8_CODE_POINT,2,JSEXN_SYNTAXERR, "{0} isn't a valid code point because {1}")
MSG_DEF(JSMSG_BAD_CODE_UNITS, 1, JSEXN_NOTE, "the code units comprising this invalid code point were: {0}")
// asm.js
MSG_DEF(JSMSG_USE_ASM_TYPE_FAIL, 1, JSEXN_TYPEERR, "asm.js type error: {0}")
MSG_DEF(JSMSG_USE_ASM_LINK_FAIL, 1, JSEXN_TYPEERR, "asm.js link error: {0}")

View File

@ -582,6 +582,19 @@ IsTrailSurrogate(uint32_t codePoint)
return codePoint >= TrailSurrogateMin && codePoint <= TrailSurrogateMax;
}
/**
 * True iff |codePoint| lies in the inclusive range
 * [LeadSurrogateMin, TrailSurrogateMax], i.e. is a UTF-16 surrogate.
 *
 * The parameter is 32 bits wide, rather than char16_t, so that callers
 * holding full code point values can test them directly.
 */
inline bool
IsSurrogate(uint32_t codePoint)
{
    const bool belowRange = codePoint < LeadSurrogateMin;
    const bool aboveRange = codePoint > TrailSurrogateMax;
    return !(belowRange || aboveRange);
}
inline char16_t
LeadSurrogate(uint32_t codePoint)
{