Bug 1551916 - Add a boolean to every chunk for a long-line vector indicating whether that chunk contains any multiple-unit code points, so that column computations inside wholly-single-unit chunks can do a constant-time pointer-range computation... r=arai

...and avoid iterating at all.  r=arai

Depends on D31302

Differential Revision: https://phabricator.services.mozilla.com/D31303

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Jeff Walden 2019-05-17 03:21:06 +00:00
parent d18c7116eb
commit 9c0ef78bad
2 changed files with 122 additions and 29 deletions

View File

@ -760,7 +760,8 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
// Compute and return the final column number from a partial offset/column,
// using the last-cached offset/column if they're more optimal.
auto ColumnFromPartial = [this, offset, &sourceUnits](uint32_t partialOffset,
uint32_t partialCols) {
uint32_t partialCols,
UnitsType unitsType) {
MOZ_ASSERT(partialOffset <= offset);
// If the last lookup on this line was closer to |offset|, use it.
@ -773,8 +774,18 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
const Unit* end = sourceUnits.codeUnitPtrAt(offset);
partialOffset += PointerRangeSize(begin, end);
partialCols += AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
size_t offsetDelta = AssertedCast<uint32_t>(PointerRangeSize(begin, end));
partialOffset += offsetDelta;
if (unitsType == UnitsType::GuaranteedSingleUnit) {
MOZ_ASSERT(unicode::CountCodePoints(begin, end) == offsetDelta,
"guaranteed-single-units also guarantee pointer distance "
"equals code point count");
partialCols += offsetDelta;
} else {
partialCols +=
AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
}
this->lastOffsetOfComputedColumn_ = partialOffset;
this->lastComputedColumn_ = partialCols;
@ -783,15 +794,24 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
const uint32_t offsetInLine = offset - start;
// The index within a relevant |Vector<uint32_t>| of the nearest chunk
// info...if it's been computed at all.
// The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
// Compute the column from the start of the line if chunk information would
// direct us to the start of the line -- including if the line's too short to
// be chunked.
if (chunkIndex == 0) {
return ColumnFromPartial(start, 0);
// We don't know from an |offset| in the zeroth chunk that this line is even
// long. First-chunk info is mostly useless, anyway -- we have |start|
// already. So if we have *easy* access to that zeroth chunk, use it --
// otherwise just count pessimally. (This will still benefit from caching
// the last column/offset for computations for successive offsets, so it's
// not *always* worst-case.)
UnitsType unitsType;
if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
MOZ_ASSERT((*lastChunkVectorForLine_)[0].column() == 0);
unitsType = (*lastChunkVectorForLine_)[0].unitsType();
} else {
unitsType = UnitsType::PossiblyMultiUnit;
}
return ColumnFromPartial(start, 0, unitsType);
}
// If this line has no chunk vector yet, insert one in the hash map. (The
@ -801,10 +821,10 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
if (!ptr) {
// This could rehash and invalidate a cached vector pointer, but the outer
// condition means we don't have a cached pointer.
if (!longLineColumnInfo_.add(ptr, line, Vector<uint32_t>(cx))) {
if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(cx))) {
// In case of OOM, just count columns from the start of the line.
cx->recoverFromOutOfMemory();
return ColumnFromPartial(start, 0);
return ColumnFromPartial(start, 0, UnitsType::PossiblyMultiUnit);
}
}
@ -828,19 +848,43 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
const Unit* actualPtr = naivePtr;
RetractPointerToCodePointBoundary(&actualPtr, limit);
# ifdef DEBUG
if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
UnitsType::GuaranteedSingleUnit) {
MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
}
# endif
return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
};
uint32_t partialOffset;
uint32_t partialColumn;
UnitsType unitsType;
auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
if (entriesLen <= chunkIndex) {
if (chunkIndex < entriesLen) {
// We've computed the chunk |offset| resides in. Compute the column number
// from the chunk.
partialOffset = RetractedOffsetOfChunk(chunkIndex);
partialColumn = (*lastChunkVectorForLine_)[chunkIndex].column();
// This is exact if |chunkIndex| isn't the last chunk.
unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
// Otherwise the last chunk is pessimistically assumed to contain multi-unit
// code points because we haven't fully examined its contents yet -- they
// may not have been tokenized yet, they could contain encoding errors, or
// they might not even exist.
MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
(*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
UnitsType::PossiblyMultiUnit);
} else {
// Extend the vector from its last entry or the start of the line. (This is
// also a suitable partial start point if we must recover from OOM.)
if (entriesLen > 0) {
partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1];
partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1].column();
} else {
partialOffset = start;
partialColumn = 0;
@ -849,21 +893,24 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
// As earlier, just start from the greatest offset/column in case of OOM.
cx->recoverFromOutOfMemory();
return ColumnFromPartial(partialOffset, partialColumn);
return ColumnFromPartial(partialOffset, partialColumn,
UnitsType::PossiblyMultiUnit);
}
// OOM is no longer possible now. \o/
// The vector always begins with the column of the line start, i.e. zero.
// The vector always begins with the column of the line start, i.e. zero,
// with chunk units pessimally assumed not single-unit.
if (entriesLen == 0) {
lastChunkVectorForLine_->infallibleAppend(0);
lastChunkVectorForLine_->infallibleAppend(
ChunkInfo(0, UnitsType::PossiblyMultiUnit));
entriesLen++;
}
do {
const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
start + std::min(entriesLen * ColumnChunkLength, offsetInLine));
start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
MOZ_ASSERT(begin < chunkLimit);
MOZ_ASSERT(chunkLimit <= limit);
@ -880,18 +927,28 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
MOZ_ASSERT(begin < chunkLimit);
MOZ_ASSERT(chunkLimit <= limit);
partialOffset += PointerRangeSize(begin, chunkLimit);
partialColumn += unicode::CountCodePoints(begin, chunkLimit);
size_t numUnits = PointerRangeSize(begin, chunkLimit);
size_t numCodePoints = unicode::CountCodePoints(begin, chunkLimit);
lastChunkVectorForLine_->infallibleAppend(partialColumn);
entriesLen++;
// If this chunk (which will become non-final at the end of the loop) is
// all single-unit code points, annotate the chunk accordingly.
if (numUnits == numCodePoints) {
lastChunkVectorForLine_->back().guaranteeSingleUnits();
}
partialOffset += numUnits;
partialColumn += numCodePoints;
lastChunkVectorForLine_->infallibleEmplaceBack(
partialColumn, UnitsType::PossiblyMultiUnit);
} while (entriesLen < chunkIndex + 1);
} else {
partialOffset = RetractedOffsetOfChunk(chunkIndex);
partialColumn = (*lastChunkVectorForLine_)[chunkIndex];
// We're at a spot in the current final chunk, and final chunks never have
// complete units information, so be pessimistic.
unitsType = UnitsType::PossiblyMultiUnit;
}
return ColumnFromPartial(partialOffset, partialColumn);
return ColumnFromPartial(partialOffset, partialColumn, unitsType);
}
#endif // JS_COLUMN_DIMENSION_IS_CODE_POINTS()

View File

@ -754,6 +754,42 @@ class TokenStreamAnyChars : public TokenStreamShared {
#if JS_COLUMN_DIMENSION_IS_CODE_POINTS()
static constexpr uint32_t ColumnChunkLength = 128;
enum class UnitsType : unsigned char{
PossiblyMultiUnit = 0,
GuaranteedSingleUnit = 1,
};
class ChunkInfo {
public:
ChunkInfo(uint32_t col, UnitsType type)
: unitsType_(static_cast<unsigned char>(type)) {
memcpy(column_, &col, sizeof(col));
}
uint32_t column() const {
uint32_t col;
memcpy(&col, column_, sizeof(uint32_t));
return col;
}
UnitsType unitsType() const {
MOZ_ASSERT(unitsType_ <= 1, "unitsType_ must be 0 or 1");
return static_cast<UnitsType>(unitsType_);
}
void guaranteeSingleUnits() {
MOZ_ASSERT(unitsType() == UnitsType::PossiblyMultiUnit,
"should only be setting to possibly optimize from the "
"pessimistic case");
unitsType_ = static_cast<unsigned char>(UnitsType::GuaranteedSingleUnit);
}
private:
// Store everything in |unsigned char|s so everything packs.
unsigned char column_[sizeof(uint32_t)];
unsigned char unitsType_;
};
/**
* Line number (of lines at least |ColumnChunkLength| code units long) to
* a sequence of the column numbers at |ColumnChunkLength| boundaries rewound
@ -763,7 +799,7 @@ class TokenStreamAnyChars : public TokenStreamShared {
* distance is performed on a line, and the vectors are lazily filled as
* greater offsets within lines require column computations.
*/
mutable HashMap<uint32_t, Vector<uint32_t>> longLineColumnInfo_;
mutable HashMap<uint32_t, Vector<ChunkInfo>> longLineColumnInfo_;
#endif // JS_COLUMN_DIMENSION_IS_CODE_POINTS()
protected:
@ -823,13 +859,13 @@ class TokenStreamAnyChars : public TokenStreamShared {
// the common line prefix.
//
// Additionally, we avoid hash table lookup costs by caching the
// |Vector<uint32_t>*| for the line of the last lookup. (|nullptr| means we
// |Vector<ChunkInfo>*| for the line of the last lookup. (|nullptr| means we
// have to look it up -- or it hasn't been created yet.) This pointer is
// invalidated when a lookup on a new line occurs, but as it's not a pointer
// at literal element data, it's *not* invalidated when new entries are added
// to such a vector.
mutable uint32_t lineOfLastColumnComputation_ = UINT32_MAX;
mutable Vector<uint32_t>* lastChunkVectorForLine_ = nullptr;
mutable Vector<ChunkInfo>* lastChunkVectorForLine_ = nullptr;
mutable uint32_t lastOffsetOfComputedColumn_ = UINT32_MAX;
mutable uint32_t lastComputedColumn_ = 0;
#endif // JS_COLUMN_DIMENSION_IS_CODE_POINTS()