mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-28 20:55:39 +00:00
Bug 1478170 - Implement getNonAsciiCodePointDontNormalize for UTF-8. r=arai
--HG-- extra : rebase_source : ab9e28fc001eaab9af3bcb072ce783a88d0f7f07
This commit is contained in:
parent
ab4f689aa3
commit
cf12902522
@ -571,6 +571,193 @@ SourceUnits<char16_t>::assertNextCodePoint(const PeekedCodePoint<char16_t>& peek
|
||||
|
||||
#endif // DEBUG
|
||||
|
||||
template<class AnyCharsAccess>
|
||||
MOZ_COLD void
|
||||
TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(uint8_t relevantUnits,
|
||||
unsigned errorNumber, ...)
|
||||
{
|
||||
va_list args;
|
||||
va_start(args, errorNumber);
|
||||
|
||||
do {
|
||||
size_t offset = this->sourceUnits.offset();
|
||||
|
||||
ErrorMetadata err;
|
||||
|
||||
TokenStreamAnyChars& anyChars = anyCharsAccess();
|
||||
|
||||
if (bool hasLineOfContext = anyChars.fillExcludingContext(&err, offset)) {
|
||||
if (!internalComputeLineOfContext(&err, offset))
|
||||
break;
|
||||
|
||||
// As this is an encoding error, the computed window-end must be
|
||||
// identical to the location of the error -- any further on and the
|
||||
// window would contain invalid Unicode.
|
||||
MOZ_ASSERT_IF(err.lineOfContext != nullptr,
|
||||
err.lineLength == err.tokenOffset);
|
||||
}
|
||||
|
||||
auto notes = MakeUnique<JSErrorNotes>();
|
||||
if (!notes) {
|
||||
ReportOutOfMemory(anyChars.cx);
|
||||
break;
|
||||
}
|
||||
|
||||
// The largest encoding of a UTF-8 code point is 4 units. (Encoding an
|
||||
// obsolete 5- or 6-byte code point will complain only about a bad lead
|
||||
// code unit.)
|
||||
constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
|
||||
|
||||
MOZ_ASSERT(relevantUnits > 0);
|
||||
|
||||
char badUnitsStr[MaxWidth];
|
||||
char* ptr = badUnitsStr;
|
||||
while (relevantUnits > 0) {
|
||||
byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
|
||||
ptr[4] = ' ';
|
||||
|
||||
ptr += 5;
|
||||
relevantUnits--;
|
||||
}
|
||||
|
||||
ptr[-1] = '\0';
|
||||
|
||||
uint32_t line, column;
|
||||
anyChars.srcCoords.lineNumAndColumnIndex(offset, &line, &column);
|
||||
|
||||
if (!notes->addNoteASCII(anyChars.cx, anyChars.getFilename(), line, column,
|
||||
GetErrorMessage, nullptr, JSMSG_BAD_CODE_UNITS, badUnitsStr))
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
ReportCompileError(anyChars.cx, std::move(err), std::move(notes), JSREPORT_ERROR,
|
||||
errorNumber, args);
|
||||
} while (false);
|
||||
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
template<class AnyCharsAccess>
|
||||
MOZ_COLD void
|
||||
TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(Utf8Unit lead)
|
||||
{
|
||||
uint8_t leadValue = lead.toUint8();
|
||||
|
||||
char leadByteStr[5];
|
||||
byteToTerminatedString(leadValue, leadByteStr);
|
||||
|
||||
internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr);
|
||||
}
|
||||
|
||||
template<class AnyCharsAccess>
|
||||
MOZ_COLD void
|
||||
TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(Utf8Unit lead,
|
||||
uint8_t remaining, uint8_t required)
|
||||
{
|
||||
uint8_t leadValue = lead.toUint8();
|
||||
|
||||
MOZ_ASSERT(required == 2 || required == 3 || required == 4);
|
||||
MOZ_ASSERT(remaining < 4);
|
||||
MOZ_ASSERT(remaining < required);
|
||||
|
||||
char leadByteStr[5];
|
||||
byteToTerminatedString(leadValue, leadByteStr);
|
||||
|
||||
// |toHexChar| produces the desired decimal numbers for values < 4.
|
||||
const char expectedStr[] = { toHexChar(required - 1), '\0' };
|
||||
const char actualStr[] = { toHexChar(remaining - 1), '\0' };
|
||||
|
||||
internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS,
|
||||
leadByteStr, expectedStr, actualStr, remaining == 2 ? " was" : "s were");
|
||||
}
|
||||
|
||||
template<class AnyCharsAccess>
|
||||
MOZ_COLD void
|
||||
TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(Utf8Unit badUnit,
|
||||
uint8_t unitsObserved)
|
||||
{
|
||||
char badByteStr[5];
|
||||
byteToTerminatedString(badUnit.toUint8(), badByteStr);
|
||||
|
||||
internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT, badByteStr);
|
||||
}
|
||||
|
||||
template<class AnyCharsAccess>
|
||||
MOZ_COLD void
|
||||
TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(uint32_t codePoint,
|
||||
uint8_t codePointLength,
|
||||
const char* reason)
|
||||
{
|
||||
// Construct a string like "0x203D" (including null terminator) to include
|
||||
// in the error message. Write the string end-to-start from end to start
|
||||
// of an adequately sized |char| array, shifting least significant nibbles
|
||||
// off the number and writing the corresponding hex digits until done, then
|
||||
// prefixing with "0x". |codePointStr| points at the incrementally
|
||||
// computed string, within |codePointCharsArray|'s bounds.
|
||||
|
||||
// 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
|
||||
// bits in a four-byte UTF-8 code unit sequence.
|
||||
constexpr size_t MaxHexSize = sizeof("0x1F" "FFFF"); // including '\0'
|
||||
char codePointCharsArray[MaxHexSize];
|
||||
|
||||
char* codePointStr = codePointCharsArray + ArrayLength(codePointCharsArray);
|
||||
*--codePointStr = '\0';
|
||||
|
||||
uint32_t copy = codePoint;
|
||||
while (copy) {
|
||||
MOZ_ASSERT(codePointCharsArray < codePointStr);
|
||||
*--codePointStr = toHexChar(copy & 0xF);
|
||||
copy >>= 4;
|
||||
}
|
||||
|
||||
MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr);
|
||||
*--codePointStr = 'x';
|
||||
*--codePointStr = '0';
|
||||
|
||||
internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT, codePointStr, reason);
|
||||
}
|
||||
|
||||
template<class AnyCharsAccess>
|
||||
MOZ_MUST_USE bool
|
||||
TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(Utf8Unit lead,
|
||||
char32_t* codePoint)
|
||||
{
|
||||
auto onBadLeadUnit = [this, &lead]() {
|
||||
this->badLeadUnit(lead);
|
||||
};
|
||||
|
||||
auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) {
|
||||
this->notEnoughUnits(lead, remaining, required);
|
||||
};
|
||||
|
||||
auto onBadTrailingUnit = [this, &lead](uint8_t unitsObserved) {
|
||||
this->badTrailingUnit(lead, unitsObserved);
|
||||
};
|
||||
|
||||
auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) {
|
||||
this->badCodePoint(badCodePoint, unitsObserved);
|
||||
};
|
||||
|
||||
auto onNotShortestForm = [this](char32_t badCodePoint, uint8_t unitsObserved) {
|
||||
this->notShortestForm(badCodePoint, unitsObserved);
|
||||
};
|
||||
|
||||
// If a valid code point is decoded, this function call consumes its code
|
||||
// units. If not, it ungets the lead code unit and invokes the right error
|
||||
// handler, so on failure we must immediately return false.
|
||||
SourceUnitsIterator iter(this->sourceUnits);
|
||||
Maybe<char32_t> maybeCodePoint =
|
||||
DecodeOneUtf8CodePointInline(lead, &iter, SourceUnitsEnd(),
|
||||
onBadLeadUnit, onNotEnoughUnits, onBadTrailingUnit,
|
||||
onBadCodePoint, onNotShortestForm);
|
||||
if (maybeCodePoint.isNothing())
|
||||
return false;
|
||||
|
||||
*codePoint = maybeCodePoint.value();
|
||||
return true;
|
||||
}
|
||||
|
||||
template<class AnyCharsAccess>
|
||||
bool
|
||||
TokenStreamChars<char16_t, AnyCharsAccess>::getCodePoint(int32_t* cp)
|
||||
|
@ -1163,6 +1163,10 @@ class SourceUnits
|
||||
return base_ + (offset - startOffset_);
|
||||
}
|
||||
|
||||
/** Return the current read pointer, |ptr|, without moving it. */
const CharT* current() const {
    return this->ptr;
}
|
||||
|
||||
/** Return the stored |limit_| pointer. */
const CharT* limit() const {
    return this->limit_;
}
|
||||
@ -1649,6 +1653,88 @@ class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
|
||||
protected:
|
||||
// These APIs are only usable by UTF-8-specific code.
|
||||
|
||||
using typename CharsBase::SourceUnits;
|
||||
|
||||
/**
|
||||
* A mutable iterator-wrapper around |SourceUnits| that translates
|
||||
* operators to calls to |SourceUnits::getCodeUnit()| and similar.
|
||||
*
|
||||
* This class is expected to be used in concert with |SourceUnitsEnd|.
|
||||
*/
|
||||
class SourceUnitsIterator
|
||||
{
|
||||
SourceUnits& sourceUnits_;
|
||||
#ifdef DEBUG
|
||||
// In iterator copies created by the post-increment operator, a pointer
|
||||
// at the next source text code unit when the post-increment operator
|
||||
// was called, cleared when the iterator is dereferenced.
|
||||
mutable mozilla::Maybe<const mozilla::Utf8Unit*> currentBeforePostIncrement_;
|
||||
#endif
|
||||
|
||||
public:
|
||||
explicit SourceUnitsIterator(SourceUnits& sourceUnits)
|
||||
: sourceUnits_(sourceUnits)
|
||||
{}
|
||||
|
||||
mozilla::Utf8Unit operator*() const {
|
||||
// operator* is expected to get the *next* value from an iterator
|
||||
// not pointing at the end of the underlying range. However, the
|
||||
// sole use of this is in the context of an expression of the form
|
||||
// |*iter++|, that performed the |sourceUnits_.getCodeUnit()| in
|
||||
// the |operator++(int)| below -- so dereferencing acts on a
|
||||
// |sourceUnits_| already advanced. Therefore the correct unit to
|
||||
// return is the previous one.
|
||||
MOZ_ASSERT(currentBeforePostIncrement_.value() + 1 == sourceUnits_.current());
|
||||
#ifdef DEBUG
|
||||
currentBeforePostIncrement_.reset();
|
||||
#endif
|
||||
return sourceUnits_.previousCodeUnit();
|
||||
}
|
||||
|
||||
SourceUnitsIterator operator++(int) {
|
||||
MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
|
||||
"the only valid operation on a post-incremented "
|
||||
"iterator is dereferencing a single time");
|
||||
|
||||
SourceUnitsIterator copy = *this;
|
||||
#ifdef DEBUG
|
||||
copy.currentBeforePostIncrement_.emplace(sourceUnits_.current());
|
||||
#endif
|
||||
|
||||
sourceUnits_.getCodeUnit();
|
||||
return copy;
|
||||
}
|
||||
|
||||
void operator-=(size_t n) {
|
||||
MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
|
||||
"the only valid operation on a post-incremented "
|
||||
"iterator is dereferencing a single time");
|
||||
sourceUnits_.unskipCodeUnits(n);
|
||||
}
|
||||
|
||||
mozilla::Utf8Unit operator[](ptrdiff_t index) {
|
||||
MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
|
||||
"the only valid operation on a post-incremented "
|
||||
"iterator is dereferencing a single time");
|
||||
MOZ_ASSERT(index == -1,
|
||||
"must only be called to verify the value of the "
|
||||
"previous code unit");
|
||||
return sourceUnits_.previousCodeUnit();
|
||||
}
|
||||
|
||||
size_t remaining() const {
|
||||
MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
|
||||
"the only valid operation on a post-incremented "
|
||||
"iterator is dereferencing a single time");
|
||||
return sourceUnits_.remaining();
|
||||
}
|
||||
};
|
||||
|
||||
/** A sentinel representing the end of |SourceUnits| data. */
|
||||
class SourceUnitsEnd {};
|
||||
|
||||
friend inline size_t operator-(const SourceUnitsEnd& aEnd, const SourceUnitsIterator& aIter);
|
||||
|
||||
protected:
|
||||
// These APIs are in both SpecializedTokenStreamCharsBase specializations
|
||||
// and so are usable in subclasses no matter what CharT is.
|
||||
@ -1656,6 +1742,13 @@ class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
|
||||
using CharsBase::CharsBase;
|
||||
};
|
||||
|
||||
inline size_t
|
||||
operator-(const SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>::SourceUnitsEnd& aEnd,
|
||||
const SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>::SourceUnitsIterator& aIter)
|
||||
{
|
||||
return aIter.remaining();
|
||||
}
|
||||
|
||||
/** A small class encapsulating computation of the start-offset of a Token. */
|
||||
class TokenStart
|
||||
{
|
||||
@ -1963,11 +2056,122 @@ class TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>
|
||||
using GeneralCharsBase = GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
|
||||
using Self = TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
|
||||
|
||||
using typename SpecializedCharsBase::SourceUnitsEnd;
|
||||
using typename SpecializedCharsBase::SourceUnitsIterator;
|
||||
|
||||
protected:
|
||||
using GeneralCharsBase::anyCharsAccess;
|
||||
using GeneralCharsBase::internalComputeLineOfContext;
|
||||
using TokenStreamCharsShared::isAsciiCodePoint;
|
||||
// Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(
|
||||
|
||||
private:
|
||||
// Map a value in [0, 16) to its uppercase hexadecimal digit.
static char toHexChar(uint8_t nibble) {
    MOZ_ASSERT(nibble < 16);

    static constexpr char HexDigits[] = "0123456789ABCDEF";
    return HexDigits[nibble];
}
|
||||
|
||||
// Write the four characters "0xHH" for byte |n| into |str[0..3]|.  No null
// terminator is written; |str| must have room for at least four chars.
static void byteToString(uint8_t n, char* str) {
    char* out = str;
    *out++ = '0';
    *out++ = 'x';
    *out++ = toHexChar(n >> 4);  // high nibble
    *out = toHexChar(n & 0xF);   // low nibble
}
|
||||
|
||||
// Write the null-terminated string "0xHH" for byte |n| into |str|, which
// must have room for at least five chars.
static void byteToTerminatedString(uint8_t n, char* str) {
    constexpr size_t HexByteLength = 4;  // strlen("0xHH")

    byteToString(n, str);
    str[HexByteLength] = '\0';
}
|
||||
|
||||
/**
|
||||
* Report a UTF-8 encoding-related error for a code point starting AT THE
|
||||
* CURRENT OFFSET.
|
||||
*
|
||||
* |relevantUnits| indicates how many code units from the current offset
|
||||
* are potentially relevant to the reported error, such that they may be
|
||||
* included in the error message. For example, if at the current offset we
|
||||
* have
|
||||
*
|
||||
* 0b1111'1111 ...
|
||||
*
|
||||
* a code unit never allowed in UTF-8, then |relevantUnits| might be 1
|
||||
* because only that unit is relevant. Or if we have
|
||||
*
|
||||
* 0b1111'0111 0b1011'0101 0b0000'0000 ...
|
||||
*
|
||||
* where the first two code units are a valid prefix to a four-unit code
|
||||
* point but the third unit *isn't* a valid trailing code unit, then
|
||||
* |relevantUnits| might be 3.
|
||||
*/
|
||||
MOZ_COLD void internalEncodingError(uint8_t relevantUnits, unsigned errorNumber, ...);
|
||||
|
||||
// Don't use |internalEncodingError|! Use one of the elaborated functions
|
||||
// that calls it, below -- all of which should be used to indicate an error
|
||||
// in a code point starting AT THE CURRENT OFFSET as with
|
||||
// |internalEncodingError|.
|
||||
|
||||
/** Report an error for an invalid lead code unit |lead|. */
|
||||
MOZ_COLD void badLeadUnit(mozilla::Utf8Unit lead);
|
||||
|
||||
/**
|
||||
* Report an error when there aren't enough code units remaining to
|
||||
* constitute a full code point after |lead|: only |remaining| code units
|
||||
* were available for a code point starting with |lead|, when at least
|
||||
* |required| code units were required.
|
||||
*/
|
||||
MOZ_COLD void notEnoughUnits(mozilla::Utf8Unit lead, uint8_t remaining, uint8_t required);
|
||||
|
||||
/**
|
||||
* Report an error for a bad trailing UTF-8 code unit, where the bad
|
||||
* trailing unit was the last of |unitsObserved| units examined from the
|
||||
* current offset.
|
||||
*/
|
||||
MOZ_COLD void badTrailingUnit(mozilla::Utf8Unit badUnit, uint8_t unitsObserved);
|
||||
|
||||
// Helper used for both |badCodePoint| and |notShortestForm| for code units
|
||||
// that have all the requisite high bits set/unset in a manner that *could*
|
||||
// encode a valid code point, but the remaining bits encoding its actual
|
||||
// value do not define a permitted value.
|
||||
MOZ_COLD void badStructurallyValidCodePoint(uint32_t codePoint, uint8_t codePointLength,
|
||||
const char* reason);
|
||||
|
||||
/**
|
||||
* Report an error for UTF-8 that encodes a UTF-16 surrogate or a number
|
||||
* outside the Unicode range.
|
||||
*/
|
||||
MOZ_COLD void badCodePoint(uint32_t codePoint, uint8_t codePointLength) {
|
||||
MOZ_ASSERT(unicode::IsSurrogate(codePoint) || codePoint > unicode::NonBMPMax);
|
||||
|
||||
badStructurallyValidCodePoint(codePoint, codePointLength,
|
||||
unicode::IsSurrogate(codePoint)
|
||||
? "it's a UTF-16 surrogate"
|
||||
: "the maximum code point is U+10FFFF");
|
||||
}
|
||||
|
||||
/**
|
||||
* Report an error for UTF-8 that encodes a code point not in its shortest
|
||||
* form.
|
||||
*/
|
||||
MOZ_COLD void notShortestForm(uint32_t codePoint, uint8_t codePointLength) {
|
||||
MOZ_ASSERT(!unicode::IsSurrogate(codePoint));
|
||||
MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
|
||||
|
||||
badStructurallyValidCodePoint(codePoint, codePointLength,
|
||||
"it wasn't encoded in shortest possible form");
|
||||
}
|
||||
|
||||
protected:
|
||||
using GeneralCharsBase::GeneralCharsBase;
|
||||
|
||||
/**
|
||||
* Given the non-ASCII |lead| code unit just consumed, consume the rest of
|
||||
* a non-ASCII code point. The code point is not normalized: on success
|
||||
* |*codePoint| may be U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR.
|
||||
*
|
||||
* Report an error if an invalid code point is encountered.
|
||||
*/
|
||||
MOZ_MUST_USE bool
|
||||
getNonAsciiCodePointDontNormalize(mozilla::Utf8Unit lead, char32_t* codePoint);
|
||||
};
|
||||
|
||||
// TokenStream is the lexical scanner for JavaScript source text.
|
||||
|
@ -352,6 +352,13 @@ MSG_DEF(JSMSG_DEFAULT_IN_PATTERN, 0, JSEXN_SYNTAXERR, "destructuring defaul
|
||||
MSG_DEF(JSMSG_BAD_NEWTARGET, 0, JSEXN_SYNTAXERR, "new.target only allowed within functions")
|
||||
MSG_DEF(JSMSG_ESCAPED_KEYWORD, 0, JSEXN_SYNTAXERR, "keywords must be written literally, without embedded escapes")
|
||||
|
||||
// UTF-8 source text encoding errors
|
||||
MSG_DEF(JSMSG_BAD_LEADING_UTF8_UNIT, 1, JSEXN_SYNTAXERR, "{0} byte doesn't begin a valid UTF-8 code point")
|
||||
MSG_DEF(JSMSG_NOT_ENOUGH_CODE_UNITS, 4, JSEXN_SYNTAXERR, "{0} byte in UTF-8 must be followed by {1} bytes, but {2} byte{3} present")
|
||||
MSG_DEF(JSMSG_BAD_TRAILING_UTF8_UNIT, 1, JSEXN_SYNTAXERR, "bad trailing UTF-8 byte {0} doesn't match the pattern 0b10xxxxxx")
|
||||
MSG_DEF(JSMSG_FORBIDDEN_UTF8_CODE_POINT,2,JSEXN_SYNTAXERR, "{0} isn't a valid code point because {1}")
|
||||
MSG_DEF(JSMSG_BAD_CODE_UNITS, 1, JSEXN_NOTE, "the code units comprising this invalid code point were: {0}")
|
||||
|
||||
// asm.js
|
||||
MSG_DEF(JSMSG_USE_ASM_TYPE_FAIL, 1, JSEXN_TYPEERR, "asm.js type error: {0}")
|
||||
MSG_DEF(JSMSG_USE_ASM_LINK_FAIL, 1, JSEXN_TYPEERR, "asm.js link error: {0}")
|
||||
|
@ -582,6 +582,19 @@ IsTrailSurrogate(uint32_t codePoint)
|
||||
return codePoint >= TrailSurrogateMin && codePoint <= TrailSurrogateMax;
|
||||
}
|
||||
|
||||
/**
|
||||
* True iff the given value is a UTF-16 surrogate.
|
||||
*
|
||||
* This function is intended for use in contexts where 32-bit values may need
|
||||
* to be tested to see if they reside in the surrogate range, so it doesn't
|
||||
* just take char16_t.
|
||||
*/
|
||||
inline bool
|
||||
IsSurrogate(uint32_t codePoint)
|
||||
{
|
||||
return LeadSurrogateMin <= codePoint && codePoint <= TrailSurrogateMax;
|
||||
}
|
||||
|
||||
inline char16_t
|
||||
LeadSurrogate(uint32_t codePoint)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user