Bug 1478170 - Specialize SourceUnits::findWindowStart for UTF-8. r=arai

--HG-- extra : rebase_source : 6a9558e7fbf640c17cbbfb0e70d93ad24b3029ed
2024-11-28 07:13:20 +00:00 · 2018-07-18 22:46:48 -07:00 · 2018-07-18 22:46:48 -07:00 · 6060208b38
commit 6060208b38
parent 60c707a4bf
2 changed files with 77 additions and 3 deletions
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@ -47,6 +47,7 @@ using mozilla::DecodeOneUtf8CodePoint;
 using mozilla::IsAscii;
 using mozilla::IsAsciiAlpha;
 using mozilla::IsAsciiDigit;
+using mozilla::IsTrailingUnit;
 using mozilla::MakeScopeExit;
 using mozilla::Maybe;
 using mozilla::PointerRangeSize;
@ -938,6 +939,69 @@ SourceUnits<char16_t>::findWindowStart(size_t offset) const
    return offset - HalfWindowSize();
 }

+template<>
+size_t
+SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const
+{
+    // |offset| must be the location of the error or somewhere before it, so we
+    // know preceding data is valid UTF-8.
+
+    const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
+
+    const Utf8Unit* const initial = codeUnitPtrAt(offset);
+    const Utf8Unit* p = initial;
+
+    auto HalfWindowSize = [&p, &initial]() { return PointerRangeSize(p, initial); };
+
+    while (true) {
+        MOZ_ASSERT(earliestPossibleStart <= p);
+        MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
+        if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius)
+            break;
+
+        // Peek backward for a line break, and only decrement if there is none.
+        uint8_t prev = p[-1].toUint8();
+
+        // First check for the ASCII LineTerminators.
+        if (prev == '\r' || prev == '\n')
+            break;
+
+        // Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR
+        // (0xE2 0x80 0xA8) and U+2029 PARAGRAPH SEPARATOR (0xE2 0x80 0xA9).
+        // If there aren't three code units available, some comparison here
+        // will fail before we'd underflow.
+        if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) &&
+                         p[-2].toUint8() == 0x80 &&
+                         p[-3].toUint8() == 0xE2))
+        {
+            break;
+        }
+
+        // Rewind over the non-LineTerminator.  This can't underflow
+        // |earliestPossibleStart| because it begins a code point.
+        while (IsTrailingUnit(*--p))
+            continue;
+
+        MOZ_ASSERT(earliestPossibleStart <= p);
+
+        // But if that exceeded |WindowRadius|, adjust forward and stop.
+        if (HalfWindowSize() > WindowRadius) {
+            static_assert(WindowRadius > 3,
+                          "skipping over non-lead code units below must not "
+                          "advance past |offset|");
+
+            while (IsTrailingUnit(*++p))
+                continue;
+
+            MOZ_ASSERT(HalfWindowSize() < WindowRadius);
+            break;
+        }
+    }
+
+    MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
+    return offset - HalfWindowSize();
+}
+
 template<>
 size_t
 SourceUnits<char16_t>::findWindowEnd(size_t offset) const
--- a/mfbt/Utf8.h
+++ b/mfbt/Utf8.h
@ -216,6 +216,16 @@ IsAscii(Utf8Unit aUnit)
 extern MFBT_API bool
 IsValidUtf8(const void* aCodeUnits, size_t aCount);

+/**
+ * Returns true iff |aUnit| is a UTF-8 trailing (continuation) code unit, i.e.
+ * one whose two high bits match the pattern 0b10xx'xxxx.
+ */
+inline bool
+IsTrailingUnit(Utf8Unit aUnit)
+{
+  return (aUnit.toUint8() & 0b1100'0000) == 0b1000'0000;
+}
+
 /**
 * Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter|
 * that (initially) itself points one unit past |aLeadUnit|, and
@ -299,11 +309,11 @@ DecodeOneUtf8CodePointInline(const Utf8Unit aLeadUnit,
  }

  for (uint8_t i = 0; i < remaining; i++) {
-    uint8_t unit = Utf8Unit(*(*aIter)++).toUint8();
+    const Utf8Unit unit(*(*aIter)++);

    // Every non-leading code unit in properly encoded UTF-8 has its high
    // bit set and the next-highest bit unset.
-    if (MOZ_UNLIKELY((unit & 0b1100'0000) != 0b1000'0000)) {
+    if (MOZ_UNLIKELY(!IsTrailingUnit(unit))) {
      uint8_t unitsObserved = i + 1 + 1;
      *aIter -= unitsObserved;
      aOnBadTrailingUnit(unitsObserved);
@ -312,7 +322,7 @@ DecodeOneUtf8CodePointInline(const Utf8Unit aLeadUnit,

    // The code point being encoded is the concatenation of all the
    // unconstrained bits.
-    n = (n << 6) | (unit & 0b0011'1111);
+    n = (n << 6) | (unit.toUint8() & 0b0011'1111);
  }

  // UTF-16 surrogates and values outside the Unicode range are invalid.