Bug 1551916 - Add a boolean to every chunk for a long-line vector indicating whether that chunk contains any multiple-unit code points, so that column computations inside wholly-single-unit chunks can do a constant-time pointer-range computation... r=arai

...and avoid iterating at all. r=arai Depends on D31302 Differential Revision: https://phabricator.services.mozilla.com/D31303 --HG-- extra : moz-landing-system : lando
2024-10-21 01:05:45 +00:00 · 2019-05-17 03:21:06 +00:00 · 2019-05-17 03:21:06 +00:00 · 9c0ef78bad
commit 9c0ef78bad
parent d18c7116eb
2 changed files with 122 additions and 29 deletions
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@ -760,7 +760,8 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
  // Compute and return the final column number from a partial offset/column,
  // using the last-cached offset/column if they're more optimal.
  auto ColumnFromPartial = [this, offset, &sourceUnits](uint32_t partialOffset,
-                                                        uint32_t partialCols) {
+                                                        uint32_t partialCols,
+                                                        UnitsType unitsType) {
    MOZ_ASSERT(partialOffset <= offset);

    // If the last lookup on this line was closer to |offset|, use it.
@ -773,8 +774,18 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
    const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
    const Unit* end = sourceUnits.codeUnitPtrAt(offset);

-    partialOffset += PointerRangeSize(begin, end);
-    partialCols += AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
+    size_t offsetDelta = AssertedCast<uint32_t>(PointerRangeSize(begin, end));
+    partialOffset += offsetDelta;
+
+    if (unitsType == UnitsType::GuaranteedSingleUnit) {
+      MOZ_ASSERT(unicode::CountCodePoints(begin, end) == offsetDelta,
+                 "guaranteed-single-units also guarantee pointer distance "
+                 "equals code point count");
+      partialCols += offsetDelta;
+    } else {
+      partialCols +=
+          AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
+    }

    this->lastOffsetOfComputedColumn_ = partialOffset;
    this->lastComputedColumn_ = partialCols;
@ -783,15 +794,24 @@ uint32_t TokenStreamAnyChars::computePartialColumn(

  const uint32_t offsetInLine = offset - start;

-  // The index within a relevant |Vector<uint32_t>| of the nearest chunk
-  // info...if it's been computed at all.
+  // The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
  const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
-
-  // Compute the column from the start of the line if chunk information would
-  // direct us to the start of the line -- including if the line's too short to
-  // be chunked.
  if (chunkIndex == 0) {
-    return ColumnFromPartial(start, 0);
+    // We don't know from an |offset| in the zeroth chunk that this line is even
+    // long.  First-chunk info is mostly useless, anyway -- we have |start|
+    // already.  So if we have *easy* access to that zeroth chunk, use it --
+    // otherwise just count pessimally.  (This will still benefit from caching
+    // the last column/offset for computations for successive offsets, so it's
+    // not *always* worst-case.)
+    UnitsType unitsType;
+    if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
+      MOZ_ASSERT((*lastChunkVectorForLine_)[0].column() == 0);
+      unitsType = (*lastChunkVectorForLine_)[0].unitsType();
+    } else {
+      unitsType = UnitsType::PossiblyMultiUnit;
+    }
+
+    return ColumnFromPartial(start, 0, unitsType);
  }

  // If this line has no chunk vector yet, insert one in the hash map.  (The
@ -801,10 +821,10 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
    if (!ptr) {
      // This could rehash and invalidate a cached vector pointer, but the outer
      // condition means we don't have a cached pointer.
-      if (!longLineColumnInfo_.add(ptr, line, Vector<uint32_t>(cx))) {
+      if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(cx))) {
        // In case of OOM, just count columns from the start of the line.
        cx->recoverFromOutOfMemory();
-        return ColumnFromPartial(start, 0);
+        return ColumnFromPartial(start, 0, UnitsType::PossiblyMultiUnit);
      }
    }

@ -828,19 +848,43 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
    const Unit* actualPtr = naivePtr;
    RetractPointerToCodePointBoundary(&actualPtr, limit);

+#  ifdef DEBUG
+    if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
+        UnitsType::GuaranteedSingleUnit) {
+      MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
+    }
+#  endif
+
    return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
  };

  uint32_t partialOffset;
  uint32_t partialColumn;
+  UnitsType unitsType;

  auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
-  if (entriesLen <= chunkIndex) {
+  if (chunkIndex < entriesLen) {
+    // We've computed the chunk |offset| resides in.  Compute the column number
+    // from the chunk.
+    partialOffset = RetractedOffsetOfChunk(chunkIndex);
+    partialColumn = (*lastChunkVectorForLine_)[chunkIndex].column();
+
+    // This is exact if |chunkIndex| isn't the last chunk.
+    unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
+
+    // Otherwise the last chunk is pessimistically assumed to contain multi-unit
+    // code points because we haven't fully examined its contents yet -- they
+    // may not have been tokenized yet, they could contain encoding errors, or
+    // they might not even exist.
+    MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
+                  (*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
+                      UnitsType::PossiblyMultiUnit);
+  } else {
    // Extend the vector from its last entry or the start of the line.  (This is
    // also a suitable partial start point if we must recover from OOM.)
    if (entriesLen > 0) {
      partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
-      partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1];
+      partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1].column();
    } else {
      partialOffset = start;
      partialColumn = 0;
@ -849,21 +893,24 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
    if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
      // As earlier, just start from the greatest offset/column in case of OOM.
      cx->recoverFromOutOfMemory();
-      return ColumnFromPartial(partialOffset, partialColumn);
+      return ColumnFromPartial(partialOffset, partialColumn,
+                               UnitsType::PossiblyMultiUnit);
    }

    // OOM is no longer possible now.  \o/

-    // The vector always begins with the column of the line start, i.e. zero.
+    // The vector always begins with the column of the line start, i.e. zero,
+    // with chunk units pessimally assumed not single-unit.
    if (entriesLen == 0) {
-      lastChunkVectorForLine_->infallibleAppend(0);
+      lastChunkVectorForLine_->infallibleAppend(
+          ChunkInfo(0, UnitsType::PossiblyMultiUnit));
      entriesLen++;
    }

    do {
      const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
      const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
-          start + std::min(entriesLen * ColumnChunkLength, offsetInLine));
+          start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));

      MOZ_ASSERT(begin < chunkLimit);
      MOZ_ASSERT(chunkLimit <= limit);
@ -880,18 +927,28 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
      MOZ_ASSERT(begin < chunkLimit);
      MOZ_ASSERT(chunkLimit <= limit);

-      partialOffset += PointerRangeSize(begin, chunkLimit);
-      partialColumn += unicode::CountCodePoints(begin, chunkLimit);
+      size_t numUnits = PointerRangeSize(begin, chunkLimit);
+      size_t numCodePoints = unicode::CountCodePoints(begin, chunkLimit);

-      lastChunkVectorForLine_->infallibleAppend(partialColumn);
-      entriesLen++;
+      // If this chunk (which will become non-final at the end of the loop) is
+      // all single-unit code points, annotate the chunk accordingly.
+      if (numUnits == numCodePoints) {
+        lastChunkVectorForLine_->back().guaranteeSingleUnits();
+      }
+
+      partialOffset += numUnits;
+      partialColumn += numCodePoints;
+
+      lastChunkVectorForLine_->infallibleEmplaceBack(
+          partialColumn, UnitsType::PossiblyMultiUnit);
    } while (entriesLen < chunkIndex + 1);
-  } else {
-    partialOffset = RetractedOffsetOfChunk(chunkIndex);
-    partialColumn = (*lastChunkVectorForLine_)[chunkIndex];
+
+    // We're at a spot in the current final chunk, and final chunks never have
+    // complete units information, so be pessimistic.
+    unitsType = UnitsType::PossiblyMultiUnit;
  }

-  return ColumnFromPartial(partialOffset, partialColumn);
+  return ColumnFromPartial(partialOffset, partialColumn, unitsType);
 }

 #endif  // JS_COLUMN_DIMENSION_IS_CODE_POINTS()
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@ -754,6 +754,42 @@ class TokenStreamAnyChars : public TokenStreamShared {
 #if JS_COLUMN_DIMENSION_IS_CODE_POINTS()
  static constexpr uint32_t ColumnChunkLength = 128;

+  enum class UnitsType : unsigned char{  // what is known about a chunk's code point widths
+      PossiblyMultiUnit = 0,     // pessimistic: code points in the chunk must be counted
+      GuaranteedSingleUnit = 1,  // code unit count equals code point count in the chunk
+  };
+
+  class ChunkInfo {  // column number at a chunk's start, plus that chunk's UnitsType
+   public:
+    ChunkInfo(uint32_t col, UnitsType type)
+        : unitsType_(static_cast<unsigned char>(type)) {
+      memcpy(column_, &col, sizeof(col));  // bytewise copy: column_ is a byte array, not a uint32_t
+    }
+
+    uint32_t column() const {  // reassembles the uint32_t column from its stored bytes
+      uint32_t col;
+      memcpy(&col, column_, sizeof(uint32_t));
+      return col;
+    }
+
+    UnitsType unitsType() const {  // the units classification currently recorded for the chunk
+      MOZ_ASSERT(unitsType_ <= 1, "unitsType_ must be 0 or 1");
+      return static_cast<UnitsType>(unitsType_);
+    }
+
+    void guaranteeSingleUnits() {  // marks the chunk all-single-unit; asserts it was not already
+      MOZ_ASSERT(unitsType() == UnitsType::PossiblyMultiUnit,
+                 "should only be setting to possibly optimize from the "
+                 "pessimistic case");
+      unitsType_ = static_cast<unsigned char>(UnitsType::GuaranteedSingleUnit);
+    }
+
+   private:
+    // Store everything in |unsigned char|s so everything packs.
+    unsigned char column_[sizeof(uint32_t)];  // object bytes of the column; accessed only via memcpy
+    unsigned char unitsType_;  // a UnitsType value held as its underlying 0 or 1
+  };
+
  /**
   * Line number (of lines at least |ColumnChunkLength| code units long) to
   * a sequence of the column numbers at |ColumnChunkLength| boundaries rewound
@ -763,7 +799,7 @@ class TokenStreamAnyChars : public TokenStreamShared {
   * distance is performed on a line, and the vectors are lazily filled as
   * greater offsets within lines require column computations.
   */
-  mutable HashMap<uint32_t, Vector<uint32_t>> longLineColumnInfo_;
+  mutable HashMap<uint32_t, Vector<ChunkInfo>> longLineColumnInfo_;
 #endif  // JS_COLUMN_DIMENSION_IS_CODE_POINTS()

 protected:
@ -823,13 +859,13 @@ class TokenStreamAnyChars : public TokenStreamShared {
  // the common line prefix.
  //
  // Additionally, we avoid hash table lookup costs by caching the
-  // |Vector<uint32_t>*| for the line of the last lookup.  (|nullptr| means we
+  // |Vector<ChunkInfo>*| for the line of the last lookup.  (|nullptr| means we
  // have to look it up -- or it hasn't been created yet.)  This pointer is
  // invalidated when a lookup on a new line occurs, but as it's not a pointer
  // at literal element data, it's *not* invalidated when new entries are added
  // to such a vector.
  mutable uint32_t lineOfLastColumnComputation_ = UINT32_MAX;
-  mutable Vector<uint32_t>* lastChunkVectorForLine_ = nullptr;
+  mutable Vector<ChunkInfo>* lastChunkVectorForLine_ = nullptr;
  mutable uint32_t lastOffsetOfComputedColumn_ = UINT32_MAX;
  mutable uint32_t lastComputedColumn_ = 0;
 #endif  // JS_COLUMN_DIMENSION_IS_CODE_POINTS()