mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-28 07:13:20 +00:00
Bug 1478170 - Specialize SourceUnits::findWindowStart for UTF-8. r=arai
--HG-- extra : rebase_source : 6a9558e7fbf640c17cbbfb0e70d93ad24b3029ed
This commit is contained in:
parent
60c707a4bf
commit
6060208b38
@ -47,6 +47,7 @@ using mozilla::DecodeOneUtf8CodePoint;
|
||||
using mozilla::IsAscii;
|
||||
using mozilla::IsAsciiAlpha;
|
||||
using mozilla::IsAsciiDigit;
|
||||
using mozilla::IsTrailingUnit;
|
||||
using mozilla::MakeScopeExit;
|
||||
using mozilla::Maybe;
|
||||
using mozilla::PointerRangeSize;
|
||||
@ -938,6 +939,69 @@ SourceUnits<char16_t>::findWindowStart(size_t offset) const
|
||||
return offset - HalfWindowSize();
|
||||
}
|
||||
|
||||
template<>
|
||||
size_t
|
||||
SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const
|
||||
{
|
||||
// |offset| must be the location of the error or somewhere before it, so we
|
||||
// know preceding data is valid UTF-8.
|
||||
|
||||
const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
|
||||
|
||||
const Utf8Unit* const initial = codeUnitPtrAt(offset);
|
||||
const Utf8Unit* p = initial;
|
||||
|
||||
auto HalfWindowSize = [&p, &initial]() { return PointerRangeSize(p, initial); };
|
||||
|
||||
while (true) {
|
||||
MOZ_ASSERT(earliestPossibleStart <= p);
|
||||
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
|
||||
if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius)
|
||||
break;
|
||||
|
||||
// Peek backward for a line break, and only decrement if there is none.
|
||||
uint8_t prev = p[-1].toUint8();
|
||||
|
||||
// First check for the ASCII LineTerminators.
|
||||
if (prev == '\r' || prev == '\n')
|
||||
break;
|
||||
|
||||
// Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR
|
||||
// (0xE2 0x80 0xA8) and U+2029 PARAGRAPH (0xE2 0x80 0xA9). If there
|
||||
// aren't three code units available, some comparison here will fail
|
||||
// before we'd underflow.
|
||||
if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) &&
|
||||
p[-2].toUint8() == 0x80 &&
|
||||
p[-3].toUint8() == 0xE2))
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
// Rewind over the non-LineTerminator. This can't underflow
|
||||
// |earliestPossibleStart| because it begins a code point.
|
||||
while (IsTrailingUnit(*--p))
|
||||
continue;
|
||||
|
||||
MOZ_ASSERT(earliestPossibleStart <= p);
|
||||
|
||||
// But if we underflowed |WindowRadius|, adjust forward and stop.
|
||||
if (HalfWindowSize() > WindowRadius) {
|
||||
static_assert(WindowRadius > 3,
|
||||
"skipping over non-lead code units below must not "
|
||||
"advance past |offset|");
|
||||
|
||||
while (IsTrailingUnit(*++p))
|
||||
continue;
|
||||
|
||||
MOZ_ASSERT(HalfWindowSize() < WindowRadius);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
|
||||
return offset - HalfWindowSize();
|
||||
}
|
||||
|
||||
template<>
|
||||
size_t
|
||||
SourceUnits<char16_t>::findWindowEnd(size_t offset) const
|
||||
|
16
mfbt/Utf8.h
16
mfbt/Utf8.h
@ -216,6 +216,16 @@ IsAscii(Utf8Unit aUnit)
|
||||
extern MFBT_API bool
|
||||
IsValidUtf8(const void* aCodeUnits, size_t aCount);
|
||||
|
||||
/**
|
||||
* Returns true iff |aUnit| is a UTF-8 trailing code unit matching the pattern
|
||||
* 0b10xx'xxxx.
|
||||
*/
|
||||
inline bool
|
||||
IsTrailingUnit(Utf8Unit aUnit)
|
||||
{
|
||||
return (aUnit.toUint8() & 0b1100'0000) == 0b1000'0000;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter|
|
||||
* that (initially) itself points one unit past |aLeadUnit|, and
|
||||
@ -299,11 +309,11 @@ DecodeOneUtf8CodePointInline(const Utf8Unit aLeadUnit,
|
||||
}
|
||||
|
||||
for (uint8_t i = 0; i < remaining; i++) {
|
||||
uint8_t unit = Utf8Unit(*(*aIter)++).toUint8();
|
||||
const Utf8Unit unit(*(*aIter)++);
|
||||
|
||||
// Every non-leading code unit in properly encoded UTF-8 has its high
|
||||
// bit set and the next-highest bit unset.
|
||||
if (MOZ_UNLIKELY((unit & 0b1100'0000) != 0b1000'0000)) {
|
||||
if (MOZ_UNLIKELY(!IsTrailingUnit(unit))) {
|
||||
uint8_t unitsObserved = i + 1 + 1;
|
||||
*aIter -= unitsObserved;
|
||||
aOnBadTrailingUnit(unitsObserved);
|
||||
@ -312,7 +322,7 @@ DecodeOneUtf8CodePointInline(const Utf8Unit aLeadUnit,
|
||||
|
||||
// The code point being encoded is the concatenation of all the
|
||||
// unconstrained bits.
|
||||
n = (n << 6) | (unit & 0b0011'1111);
|
||||
n = (n << 6) | (unit.toUint8() & 0b0011'1111);
|
||||
}
|
||||
|
||||
// UTF-16 surrogates and values outside the Unicode range are invalid.
|
||||
|
Loading…
Reference in New Issue
Block a user