Bug 1478170 - Specialize SourceUnits::findWindowStart for UTF-8. r=arai

--HG--
extra : rebase_source : 6a9558e7fbf640c17cbbfb0e70d93ad24b3029ed
This commit is contained in:
Jeff Walden 2018-07-18 22:46:48 -07:00
parent 60c707a4bf
commit 6060208b38
2 changed files with 77 additions and 3 deletions

View File

@ -47,6 +47,7 @@ using mozilla::DecodeOneUtf8CodePoint;
using mozilla::IsAscii;
using mozilla::IsAsciiAlpha;
using mozilla::IsAsciiDigit;
using mozilla::IsTrailingUnit;
using mozilla::MakeScopeExit;
using mozilla::Maybe;
using mozilla::PointerRangeSize;
@ -938,6 +939,69 @@ SourceUnits<char16_t>::findWindowStart(size_t offset) const
return offset - HalfWindowSize();
}
template<>
size_t
SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const
{
    // Callers pass the location of the error or a position before it, so all
    // code units preceding |offset| are known to be valid UTF-8.

    const Utf8Unit* const lowerBound = codeUnitPtrAt(startOffset_);
    const Utf8Unit* const anchor = codeUnitPtrAt(offset);
    const Utf8Unit* cursor = anchor;

    // Number of code units currently spanned between |cursor| and |anchor|.
    auto unitsRewound = [&cursor, &anchor]() { return PointerRangeSize(cursor, anchor); };

    for (;;) {
        MOZ_ASSERT(lowerBound <= cursor);
        MOZ_ASSERT(unitsRewound() <= WindowRadius);

        // Stop once we've reached the earliest permitted start or have
        // already rewound a full window radius.
        const bool canRewind = lowerBound < cursor && unitsRewound() < WindowRadius;
        if (!canRewind)
            break;

        // Inspect the preceding code unit without moving: if a line break
        // ends there, the window begins at the current position.
        const uint8_t last = (cursor - 1)->toUint8();

        // ASCII LineTerminators.
        if (last == '\n' || last == '\r')
            break;

        // Non-ASCII LineTerminators: U+2028 LINE SEPARATOR (0xE2 0x80 0xA8)
        // and U+2029 PARAGRAPH SEPARATOR (0xE2 0x80 0xA9).  When fewer than
        // three code units precede |cursor|, one of the comparisons fails
        // (short-circuiting the rest) before any read could underflow.
        const bool endsLikeSeparator = last == 0xA8 || last == 0xA9;
        if (MOZ_UNLIKELY(endsLikeSeparator &&
                         (cursor - 2)->toUint8() == 0x80 &&
                         (cursor - 3)->toUint8() == 0xE2))
        {
            break;
        }

        // Step back to the first unit of the preceding (non-LineTerminator)
        // code point.  This can't pass |lowerBound| because |lowerBound|
        // begins a code point.
        do {
            --cursor;
        } while (IsTrailingUnit(*cursor));

        MOZ_ASSERT(lowerBound <= cursor);

        // If that step overshot |WindowRadius|, undo it by moving forward to
        // the start of the next code point, then stop.
        if (unitsRewound() > WindowRadius) {
            static_assert(WindowRadius > 3,
                          "skipping over non-lead code units below must not "
                          "advance past |offset|");

            do {
                ++cursor;
            } while (IsTrailingUnit(*cursor));

            MOZ_ASSERT(unitsRewound() < WindowRadius);
            break;
        }
    }

    MOZ_ASSERT(unitsRewound() <= WindowRadius);
    return offset - unitsRewound();
}
template<>
size_t
SourceUnits<char16_t>::findWindowEnd(size_t offset) const

View File

@ -216,6 +216,16 @@ IsAscii(Utf8Unit aUnit)
extern MFBT_API bool
IsValidUtf8(const void* aCodeUnits, size_t aCount);
/**
 * Tests whether |aUnit| is a UTF-8 trailing (continuation) code unit, i.e. one
 * whose two most significant bits are 1 and 0 respectively (0b10xx'xxxx).
 * Returns true for such a unit and false for every other value.
 */
inline bool
IsTrailingUnit(Utf8Unit aUnit)
{
  // Shifting out the low six payload bits leaves only the two-bit tag.
  const uint8_t tagBits = aUnit.toUint8() >> 6;
  return tagBits == 0b10;
}
/**
* Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter|
* that (initially) itself points one unit past |aLeadUnit|, and
@ -299,11 +309,11 @@ DecodeOneUtf8CodePointInline(const Utf8Unit aLeadUnit,
}
for (uint8_t i = 0; i < remaining; i++) {
uint8_t unit = Utf8Unit(*(*aIter)++).toUint8();
const Utf8Unit unit(*(*aIter)++);
// Every non-leading code unit in properly encoded UTF-8 has its high
// bit set and the next-highest bit unset.
if (MOZ_UNLIKELY((unit & 0b1100'0000) != 0b1000'0000)) {
if (MOZ_UNLIKELY(!IsTrailingUnit(unit))) {
uint8_t unitsObserved = i + 1 + 1;
*aIter -= unitsObserved;
aOnBadTrailingUnit(unitsObserved);
@ -312,7 +322,7 @@ DecodeOneUtf8CodePointInline(const Utf8Unit aLeadUnit,
// The code point being encoded is the concatenation of all the
// unconstrained bits.
n = (n << 6) | (unit & 0b0011'1111);
n = (n << 6) | (unit.toUint8() & 0b0011'1111);
}
// UTF-16 surrogates and values outside the Unicode range are invalid.