Bug 1779807 - Implement memchr64 in AVX2 r=iain

This only makes sense for AVX2, because widening it from a 64-bit comparison to a 128-bit comparison is hardly worth it, and there are gaps in the SSE2 instruction set (missing _mm_cmpeq_epi64, which is introduced in SSE4.1) that would require us to compensate and probably take a sizeable perf hit. Differential Revision: https://phabricator.services.mozilla.com/D152297
2024-11-24 13:21:05 +00:00 · 2022-07-25 21:21:37 +00:00 · 2022-07-25 21:21:37 +00:00 · 92a5c81749
commit 92a5c81749
parent 2a9b76d986
3 changed files with 283 additions and 140 deletions
--- a/mfbt/SIMD.cpp
+++ b/mfbt/SIMD.cpp
@ -14,6 +14,19 @@

 namespace mozilla {

+template <typename TValue>
+const TValue* FindInBufferNaive(const TValue* ptr, TValue value,
+                                size_t length) {
+  const TValue* end = ptr + length;
+  while (ptr < end) {
+    if (*ptr == value) {
+      return ptr;
+    }
+    ptr++;
+  }
+  return nullptr;
+}
+
 #ifdef MOZILLA_PRESUME_SSE2

 #  include <immintrin.h>
@ -41,22 +54,27 @@ uintptr_t AlignDown32(uintptr_t ptr) { return ptr & ~0x1f; }

 uintptr_t AlignUp32(uintptr_t ptr) { return AlignDown32(ptr + 0x1f); }

-template <typename CharType>
+template <typename TValue>
 __m128i CmpEq128(__m128i a, __m128i b) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  if (sizeof(CharType) == 1) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
+  if (sizeof(TValue) == 1) {
    return _mm_cmpeq_epi8(a, b);
  }
  return _mm_cmpeq_epi16(a, b);
 }

-template <typename CharType>
+template <typename TValue>
 __m256i CmpEq256(__m256i a, __m256i b) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  if (sizeof(CharType) == 1) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
+                sizeof(TValue) == 8);
+  if (sizeof(TValue) == 1) {
    return _mm256_cmpeq_epi8(a, b);
  }
-  return _mm256_cmpeq_epi16(a, b);
+  if (sizeof(TValue) == 2) {
+    return _mm256_cmpeq_epi16(a, b);
+  }
+
+  return _mm256_cmpeq_epi64(a, b);
 }

 #  ifdef __GNUC__
@ -116,17 +134,17 @@ const char* Check4x4Chars(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c,
  return nullptr;
 }

-template <typename CharType>
-const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
-                              uintptr_t c, uintptr_t d) {
+template <typename TValue>
+const TValue* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
+                            uintptr_t c, uintptr_t d) {
  __m128i haystackA = _mm_loadu_si64(Cast128(a));
-  __m128i cmpA = CmpEq128<CharType>(needle, haystackA);
+  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
  __m128i haystackB = _mm_loadu_si64(Cast128(b));
-  __m128i cmpB = CmpEq128<CharType>(needle, haystackB);
+  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
  __m128i haystackC = _mm_loadu_si64(Cast128(c));
-  __m128i cmpC = CmpEq128<CharType>(needle, haystackC);
+  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
  __m128i haystackD = _mm_loadu_si64(Cast128(d));
-  __m128i cmpD = CmpEq128<CharType>(needle, haystackD);
+  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  __m128i or_cd = _mm_or_si128(cmpC, cmpD);
  __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
@ -135,36 +153,36 @@ const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpC);
    if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpD);
    if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
 }

-template <typename CharType>
-const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
-                               uintptr_t c, uintptr_t d) {
+template <typename TValue>
+const TValue* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
+                             uintptr_t c, uintptr_t d) {
  __m128i haystackA = _mm_loadu_si128(Cast128(a));
-  __m128i cmpA = CmpEq128<CharType>(needle, haystackA);
+  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
  __m128i haystackB = _mm_loadu_si128(Cast128(b));
-  __m128i cmpB = CmpEq128<CharType>(needle, haystackB);
+  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
  __m128i haystackC = _mm_loadu_si128(Cast128(c));
-  __m128i cmpC = CmpEq128<CharType>(needle, haystackC);
+  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
  __m128i haystackD = _mm_loadu_si128(Cast128(d));
-  __m128i cmpD = CmpEq128<CharType>(needle, haystackD);
+  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  __m128i or_cd = _mm_or_si128(cmpC, cmpD);
  __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
@ -173,36 +191,36 @@ const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpC);
    if (cmpMask) {
-      return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpD);
    if (cmpMask) {
-      return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
 }

-template <typename CharType>
-const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
-                               uintptr_t c, uintptr_t d) {
+template <typename TValue>
+const TValue* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
+                             uintptr_t c, uintptr_t d) {
  __m256i haystackA = _mm256_loadu_si256(Cast256(a));
-  __m256i cmpA = CmpEq256<CharType>(needle, haystackA);
+  __m256i cmpA = CmpEq256<TValue>(needle, haystackA);
  __m256i haystackB = _mm256_loadu_si256(Cast256(b));
-  __m256i cmpB = CmpEq256<CharType>(needle, haystackB);
+  __m256i cmpB = CmpEq256<TValue>(needle, haystackB);
  __m256i haystackC = _mm256_loadu_si256(Cast256(c));
-  __m256i cmpC = CmpEq256<CharType>(needle, haystackC);
+  __m256i cmpC = CmpEq256<TValue>(needle, haystackC);
  __m256i haystackD = _mm256_loadu_si256(Cast256(d));
-  __m256i cmpD = CmpEq256<CharType>(needle, haystackD);
+  __m256i cmpD = CmpEq256<TValue>(needle, haystackD);
  __m256i or_ab = _mm256_or_si256(cmpA, cmpB);
  __m256i or_cd = _mm256_or_si256(cmpC, cmpD);
  __m256i or_abcd = _mm256_or_si256(or_ab, or_cd);
@ -211,19 +229,19 @@ const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
    int cmpMask;
    cmpMask = _mm256_movemask_epi8(cmpA);
    if (cmpMask) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpB);
    if (cmpMask) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpC);
    if (cmpMask) {
-      return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpD);
    if (cmpMask) {
-      return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

@ -243,15 +261,15 @@ enum class HaystackOverlap {
 // the next a's 16-byte chunk is needle2. `overlap` and whether
 // `carryIn`/`carryOut` are NULL should be knowable at compile time to avoid
 // branching.
-template <typename CharType>
-const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
-                                 uintptr_t b, __m128i* carryIn,
-                                 __m128i* carryOut, HaystackOverlap overlap) {
-  const int shiftRightAmount = 16 - sizeof(CharType);
-  const int shiftLeftAmount = sizeof(CharType);
+template <typename TValue>
+const TValue* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
+                               uintptr_t b, __m128i* carryIn, __m128i* carryOut,
+                               HaystackOverlap overlap) {
+  const int shiftRightAmount = 16 - sizeof(TValue);
+  const int shiftLeftAmount = sizeof(TValue);
  __m128i haystackA = _mm_loadu_si128(Cast128(a));
-  __m128i cmpA1 = CmpEq128<CharType>(needle1, haystackA);
-  __m128i cmpA2 = CmpEq128<CharType>(needle2, haystackA);
+  __m128i cmpA1 = CmpEq128<TValue>(needle1, haystackA);
+  __m128i cmpA2 = CmpEq128<TValue>(needle2, haystackA);
  __m128i cmpA;
  if (carryIn) {
    cmpA = _mm_and_si128(
@ -260,8 +278,8 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
    cmpA = _mm_and_si128(_mm_bslli_si128(cmpA1, shiftLeftAmount), cmpA2);
  }
  __m128i haystackB = _mm_loadu_si128(Cast128(b));
-  __m128i cmpB1 = CmpEq128<CharType>(needle1, haystackB);
-  __m128i cmpB2 = CmpEq128<CharType>(needle2, haystackB);
+  __m128i cmpB1 = CmpEq128<TValue>(needle1, haystackB);
+  __m128i cmpB2 = CmpEq128<TValue>(needle2, haystackB);
  __m128i cmpB;
  if (overlap == HaystackOverlap::Overlapping) {
    cmpB = _mm_and_si128(_mm_bslli_si128(cmpB1, shiftLeftAmount), cmpB2);
@ -277,13 +295,13 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask) -
-                                               shiftLeftAmount);
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask) -
+                                             shiftLeftAmount);
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask) -
-                                               shiftLeftAmount);
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask) -
+                                             shiftLeftAmount);
    }
  }

@ -294,13 +312,12 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
  return nullptr;
 }

-template <typename CharType>
-const CharType* FindInBuffer(const CharType* ptr, CharType value,
-                             size_t length) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  static_assert(std::is_unsigned<CharType>::value);
+template <typename TValue>
+const TValue* FindInBuffer(const TValue* ptr, TValue value, size_t length) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
+  static_assert(std::is_unsigned<TValue>::value);
  uint64_t splat64;
-  if (sizeof(CharType) == 1) {
+  if (sizeof(TValue) == 1) {
    splat64 = 0x0101010101010101llu;
  } else {
    splat64 = 0x0001000100010001llu;
@ -311,16 +328,16 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
  int64_t i64_value = *reinterpret_cast<int64_t*>(&u64_value);
  __m128i needle = _mm_set_epi64x(i64_value, i64_value);

-  size_t numBytes = length * sizeof(CharType);
+  size_t numBytes = length * sizeof(TValue);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

-  if ((sizeof(CharType) > 1 && numBytes < 16) || numBytes < 4) {
+  if ((sizeof(TValue) > 1 && numBytes < 16) || numBytes < 4) {
    while (cur < end) {
-      if (GetAs<CharType>(cur) == value) {
-        return reinterpret_cast<const CharType*>(cur);
+      if (GetAs<TValue>(cur) == value) {
+        return reinterpret_cast<const TValue*>(cur);
      }
-      cur += sizeof(CharType);
+      cur += sizeof(TValue);
    }
    return nullptr;
  }
@ -340,9 +357,9 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
    uintptr_t c = end - 4 - ((numBytes & 8) >> 1);
    uintptr_t d = end - 4;
    const char* charResult = Check4x4Chars(needle, a, b, c, d);
-    // Note: we ensure above that sizeof(CharType) == 1 here, so this is
+    // Note: we ensure above that sizeof(TValue) == 1 here, so this is
    // either char to char or char to something like a uint8_t.
-    return reinterpret_cast<const CharType*>(charResult);
+    return reinterpret_cast<const TValue*>(charResult);
  }

  if (numBytes < 64) {
@ -352,17 +369,17 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
    uintptr_t b = cur + ((numBytes & 32) >> 1);
    uintptr_t c = end - 16 - ((numBytes & 32) >> 1);
    uintptr_t d = end - 16;
-    return Check4x16Bytes<CharType>(needle, a, b, c, d);
+    return Check4x16Bytes<TValue>(needle, a, b, c, d);
  }

  // Get the initial unaligned load out of the way. This will overlap with the
  // aligned stuff below, but the overlapped part should effectively be free
  // (relative to a mispredict from doing a byte-by-byte loop).
  __m128i haystack = _mm_loadu_si128(Cast128(cur));
-  __m128i cmp = CmpEq128<CharType>(needle, haystack);
+  __m128i cmp = CmpEq128<TValue>(needle, haystack);
  int cmpMask = _mm_movemask_epi8(cmp);
  if (cmpMask) {
-    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask));
+    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
  }

  // Now we're working with aligned memory. Hooray! \o/
@ -379,7 +396,7 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
    uintptr_t b = cur + 16;
    uintptr_t c = cur + 32;
    uintptr_t d = cur + 48;
-    const CharType* result = Check4x16Bytes<CharType>(needle, a, b, c, d);
+    const TValue* result = Check4x16Bytes<TValue>(needle, a, b, c, d);
    if (result) {
      return result;
    }
@ -390,49 +407,53 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
  uintptr_t b = tailStartPtr + 16;
  uintptr_t c = tailStartPtr + 32;
  uintptr_t d = tailEndPtr;
-  return Check4x16Bytes<CharType>(needle, a, b, c, d);
+  return Check4x16Bytes<TValue>(needle, a, b, c, d);
 }

-template <typename CharType>
-const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
-                                 size_t length) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  static_assert(std::is_unsigned<CharType>::value);
+template <typename TValue>
+const TValue* FindInBufferAVX2(const TValue* ptr, TValue value, size_t length) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
+                sizeof(TValue) == 8);
+  static_assert(std::is_unsigned<TValue>::value);

  // Load our needle into a 32-byte register
  __m256i needle;
-  if (sizeof(CharType) == 1) {
+  if (sizeof(TValue) == 1) {
    needle = _mm256_set1_epi8(value);
-  } else {
+  } else if (sizeof(TValue) == 2) {
    needle = _mm256_set1_epi16(value);
+  } else {
+    needle = _mm256_set1_epi64x(value);
  }

-  size_t numBytes = length * sizeof(CharType);
+  size_t numBytes = length * sizeof(TValue);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

-  if (numBytes < 8) {
+  if (numBytes < 8 || (sizeof(TValue) == 8 && numBytes < 32)) {
    while (cur < end) {
-      if (GetAs<CharType>(cur) == value) {
-        return reinterpret_cast<const CharType*>(cur);
+      if (GetAs<TValue>(cur) == value) {
+        return reinterpret_cast<const TValue*>(cur);
      }
-      cur += sizeof(CharType);
+      cur += sizeof(TValue);
    }
    return nullptr;
  }

-  if (numBytes < 32) {
-    __m128i needle_narrow;
-    if (sizeof(CharType) == 1) {
-      needle_narrow = _mm_set1_epi8(value);
-    } else {
-      needle_narrow = _mm_set1_epi16(value);
+  if constexpr (sizeof(TValue) != 8) {
+    if (numBytes < 32) {
+      __m128i needle_narrow;
+      if (sizeof(TValue) == 1) {
+        needle_narrow = _mm_set1_epi8(value);
+      } else {
+        needle_narrow = _mm_set1_epi16(value);
+      }
+      uintptr_t a = cur;
+      uintptr_t b = cur + ((numBytes & 16) >> 1);
+      uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
+      uintptr_t d = end - 8;
+      return Check4x8Bytes<TValue>(needle_narrow, a, b, c, d);
    }
-    uintptr_t a = cur;
-    uintptr_t b = cur + ((numBytes & 16) >> 1);
-    uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
-    uintptr_t d = end - 8;
-    return Check4x8Bytes<CharType>(needle_narrow, a, b, c, d);
  }

  if (numBytes < 128) {
@ -442,17 +463,17 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
    uintptr_t b = cur + ((numBytes & 64) >> 1);
    uintptr_t c = end - 32 - ((numBytes & 64) >> 1);
    uintptr_t d = end - 32;
-    return Check4x32Bytes<CharType>(needle, a, b, c, d);
+    return Check4x32Bytes<TValue>(needle, a, b, c, d);
  }

  // Get the initial unaligned load out of the way. This will overlap with the
  // aligned stuff below, but the overlapped part should effectively be free
  // (relative to a mispredict from doing a byte-by-byte loop).
  __m256i haystack = _mm256_loadu_si256(Cast256(cur));
-  __m256i cmp = CmpEq256<CharType>(needle, haystack);
+  __m256i cmp = CmpEq256<TValue>(needle, haystack);
  int cmpMask = _mm256_movemask_epi8(cmp);
  if (cmpMask) {
-    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask));
+    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
  }

  // Now we're working with aligned memory. Hooray! \o/
@ -466,7 +487,7 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
    uintptr_t b = cur + 32;
    uintptr_t c = cur + 64;
    uintptr_t d = cur + 96;
-    const CharType* result = Check4x32Bytes<CharType>(needle, a, b, c, d);
+    const TValue* result = Check4x32Bytes<TValue>(needle, a, b, c, d);
    if (result) {
      return result;
    }
@ -477,12 +498,11 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
  uintptr_t b = tailStartPtr + 32;
  uintptr_t c = tailStartPtr + 64;
  uintptr_t d = tailEndPtr;
-  return Check4x32Bytes<CharType>(needle, a, b, c, d);
+  return Check4x32Bytes<TValue>(needle, a, b, c, d);
 }

-template <typename CharType>
-const CharType* TwoByteLoop(uintptr_t start, uintptr_t end, CharType v1,
-                            CharType v2);
+template <typename TValue>
+const TValue* TwoByteLoop(uintptr_t start, uintptr_t end, TValue v1, TValue v2);

 template <>
 const unsigned char* TwoByteLoop<unsigned char>(uintptr_t start, uintptr_t end,
@ -521,13 +541,13 @@ const char16_t* TwoByteLoop<char16_t>(uintptr_t start, uintptr_t end,
  return nullptr;
 }

-template <typename CharType>
-const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
-                                size_t length) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  static_assert(std::is_unsigned<CharType>::value);
+template <typename TValue>
+const TValue* FindTwoInBuffer(const TValue* ptr, TValue v1, TValue v2,
+                              size_t length) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
+  static_assert(std::is_unsigned<TValue>::value);
  uint64_t splat64;
-  if (sizeof(CharType) == 1) {
+  if (sizeof(TValue) == 1) {
    splat64 = 0x0101010101010101llu;
  } else {
    splat64 = 0x0001000100010001llu;
@ -541,33 +561,33 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
  int64_t i64_v2 = *reinterpret_cast<int64_t*>(&u64_v2);
  __m128i needle2 = _mm_set_epi64x(i64_v2, i64_v2);

-  size_t numBytes = length * sizeof(CharType);
+  size_t numBytes = length * sizeof(TValue);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

  if (numBytes < 16) {
-    return TwoByteLoop<CharType>(cur, end, v1, v2);
+    return TwoByteLoop<TValue>(cur, end, v1, v2);
  }

  if (numBytes < 32) {
    uintptr_t a = cur;
    uintptr_t b = end - 16;
-    return Check2x2x16Bytes<CharType>(needle1, needle2, a, b, nullptr, nullptr,
-                                      HaystackOverlap::Overlapping);
+    return Check2x2x16Bytes<TValue>(needle1, needle2, a, b, nullptr, nullptr,
+                                    HaystackOverlap::Overlapping);
  }

  // Get the initial unaligned load out of the way. This will likely overlap
  // with the aligned stuff below, but the overlapped part should effectively
  // be free.
  __m128i haystack = _mm_loadu_si128(Cast128(cur));
-  __m128i cmp1 = CmpEq128<CharType>(needle1, haystack);
-  __m128i cmp2 = CmpEq128<CharType>(needle2, haystack);
+  __m128i cmp1 = CmpEq128<TValue>(needle1, haystack);
+  __m128i cmp2 = CmpEq128<TValue>(needle2, haystack);
  int cmpMask1 = _mm_movemask_epi8(cmp1);
  int cmpMask2 = _mm_movemask_epi8(cmp2);
-  int cmpMask = (cmpMask1 << sizeof(CharType)) & cmpMask2;
+  int cmpMask = (cmpMask1 << sizeof(TValue)) & cmpMask2;
  if (cmpMask) {
-    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask) -
-                                             sizeof(CharType));
+    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask) -
+                                           sizeof(TValue));
  }

  // Now we're working with aligned memory. Hooray! \o/
@ -583,9 +603,9 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
  while (cur < tailStartPtr) {
    uintptr_t a = cur;
    uintptr_t b = cur + 16;
-    const CharType* result =
-        Check2x2x16Bytes<CharType>(needle1, needle2, a, b, &cmpMaskCarry,
-                                   &cmpMaskCarry, HaystackOverlap::Sequential);
+    const TValue* result =
+        Check2x2x16Bytes<TValue>(needle1, needle2, a, b, &cmpMaskCarry,
+                                 &cmpMaskCarry, HaystackOverlap::Sequential);
    if (result) {
      return result;
    }
@ -597,8 +617,8 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
  cmpMaskCarry = _mm_and_si128(cmpMaskCarry, wideCarry);
  uintptr_t a = tailStartPtr;
  uintptr_t b = tailEndPtr;
-  return Check2x2x16Bytes<CharType>(needle1, needle2, a, b, &cmpMaskCarry,
-                                    nullptr, HaystackOverlap::Overlapping);
+  return Check2x2x16Bytes<TValue>(needle1, needle2, a, b, &cmpMaskCarry,
+                                  nullptr, HaystackOverlap::Overlapping);
 }

 const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
@ -635,6 +655,14 @@ const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
  return memchr16SSE2(ptr, value, length);
 }

+const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value,
+                               size_t length) {
+  if (supports_avx2()) {
+    return FindInBufferAVX2<uint64_t>(ptr, value, length);
+  }
+  return FindInBufferNaive<uint64_t>(ptr, value, length);
+}
+
 const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
  // Signed chars are just really annoying to do bit logic with. Convert to
  // unsigned at the outermost scope so we don't have to worry about it.
@ -667,14 +695,7 @@ const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {

 const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
                               size_t length) {
-  const char16_t* end = ptr + length;
-  while (ptr < end) {
-    if (*ptr == value) {
-      return ptr;
-    }
-    ptr++;
-  }
-  return nullptr;
+  return FindInBufferNaive<char16_t>(ptr, value, length);
 }

 const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
@ -682,6 +703,11 @@ const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
  return memchr16(ptr, value, length);
 }

+const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value,
+                               size_t length) {
+  return FindInBufferNaive<uint64_t>(ptr, value, length);
+}
+
 const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
  const char* end = ptr + length - 1;
  while (ptr < end) {
--- a/mfbt/SIMD.h
+++ b/mfbt/SIMD.h
@ -46,6 +46,11 @@ class SIMD {
  static MFBT_API const char16_t* memchr16SSE2(const char16_t* ptr,
                                               char16_t value, size_t length);

+  // Search through `ptr[0..length]` for the first occurrence of `value` and
+  // return the pointer to it, or nullptr if it cannot be found.
+  static MFBT_API const uint64_t* memchr64(const uint64_t* ptr, uint64_t value,
+                                           size_t length);
+
  // Search through `ptr[0..length]` for the first occurrence of `v1` which is
  // immediately followed by `v2` and return the pointer to the occurrence of
  // `v1`.
--- a/mfbt/tests/TestSIMD.cpp
+++ b/mfbt/tests/TestSIMD.cpp
@ -105,8 +105,8 @@ void TestLongString() {
    MOZ_RELEASE_ASSERT(
        SIMD::memchr8SSE2(test, static_cast<char>(i), count - 1) == test + i);
  }
-  MOZ_RELEASE_ASSERT(SIMD::memchr8(test, static_cast<char>(count - 1),
-                                       count - 1) == nullptr);
+  MOZ_RELEASE_ASSERT(
+      SIMD::memchr8(test, static_cast<char>(count - 1), count - 1) == nullptr);
 }

 void TestGauntlet() {
@ -124,8 +124,8 @@ void TestGauntlet() {
          if (j >= k && j < i) {
            expected = test + j;
          }
-          MOZ_RELEASE_ASSERT(SIMD::memchr8(test + k, static_cast<char>(j),
-                                               i - k) == expected);
+          MOZ_RELEASE_ASSERT(
+              SIMD::memchr8(test + k, static_cast<char>(j), i - k) == expected);
          MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test + k, static_cast<char>(j),
                                               i - k) == expected);
        }
@ -221,8 +221,8 @@ void TestLongString16() {
  }

  for (size_t i = 0; i < count - 1; ++i) {
-    MOZ_RELEASE_ASSERT(SIMD::memchr16(test, static_cast<char16_t>(i),
-                                          count - 1) == test + i);
+    MOZ_RELEASE_ASSERT(
+        SIMD::memchr16(test, static_cast<char16_t>(i), count - 1) == test + i);
    MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, static_cast<char16_t>(i),
                                          count - 1) == test + i);
  }
@ -245,9 +245,8 @@ void TestGauntlet16() {
          if (j >= k && j < i) {
            expected = test + j;
          }
-          MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k,
-                                                static_cast<char16_t>(j),
-                                                i - k) == expected);
+          MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k, static_cast<char16_t>(j),
+                                            i - k) == expected);
          MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test + k,
                                                static_cast<char16_t>(j),
                                                i - k) == expected);
@ -257,6 +256,113 @@ void TestGauntlet16() {
  }
 }

+void TestTinyString64() {
+  const uint64_t test[4] = {0, 1, 2, 3};
+
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 3) == test + 0x0);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 3) == test + 0x1);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 3) == test + 0x2);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 3) == nullptr);
+}
+
+void TestShortString64() {
+  const uint64_t test[16] = {0, 1, 2,  3,  4,  5,  6,  7,
+                             8, 9, 10, 11, 12, 13, 14, 15};
+
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 15) == test + 0);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 15) == test + 1);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 15) == test + 2);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 15) == test + 3);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 4, 15) == test + 4);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 5, 15) == test + 5);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 6, 15) == test + 6);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 7, 15) == test + 7);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 8, 15) == test + 8);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 15) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 15) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 10, 15) == test + 10);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 11, 15) == test + 11);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 12, 15) == test + 12);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 13, 15) == test + 13);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 14, 15) == test + 14);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 15, 15) == nullptr);
+}
+
+void TestMediumString64() {
+  const uint64_t test[32] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
+                             11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                             22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 31) == test + 0);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 31) == test + 1);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 31) == test + 2);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 31) == test + 3);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 4, 31) == test + 4);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 5, 31) == test + 5);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 6, 31) == test + 6);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 7, 31) == test + 7);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 8, 31) == test + 8);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 31) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 31) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 10, 31) == test + 10);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 11, 31) == test + 11);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 12, 31) == test + 12);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 13, 31) == test + 13);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 14, 31) == test + 14);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 15, 31) == test + 15);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 16, 31) == test + 16);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 17, 31) == test + 17);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 18, 31) == test + 18);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 19, 31) == test + 19);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 20, 31) == test + 20);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 21, 31) == test + 21);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 22, 31) == test + 22);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 23, 31) == test + 23);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 24, 31) == test + 24);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 25, 31) == test + 25);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 26, 31) == test + 26);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 27, 31) == test + 27);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 28, 31) == test + 28);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 29, 31) == test + 29);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 30, 31) == test + 30);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 31, 31) == nullptr);
+}
+
+void TestLongString64() {
+  const size_t count = 256;
+  uint64_t test[count];
+  for (size_t i = 0; i < count; ++i) {
+    test[i] = i;
+  }
+
+  for (uint64_t i = 0; i < count - 1; ++i) {
+    MOZ_RELEASE_ASSERT(SIMD::memchr64(test, i, count - 1) == test + i);
+  }
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, count - 1, count - 1) == nullptr);
+}
+
+void TestGauntlet64() {
+  const size_t count = 257;
+  uint64_t test[count];
+  for (size_t i = 0; i < count; ++i) {
+    test[i] = i;
+  }
+
+  for (uint64_t i = 0; i < count - 1; ++i) {
+    for (uint64_t j = 0; j < count - 1; ++j) {
+      for (uint64_t k = 0; k < count - 1; ++k) {
+        if (i >= k) {
+          const uint64_t* expected = nullptr;
+          if (j >= k && j < i) {
+            expected = test + j;
+          }
+          MOZ_RELEASE_ASSERT(SIMD::memchr64(test + k, j, i - k) == expected);
+        }
+      }
+    }
+  }
+}
+
 void TestTinyString2x8() {
  const char* test = "012\n";

@ -498,6 +604,12 @@ int main(void) {
  TestLongString16();
  TestGauntlet16();

+  TestTinyString64();
+  TestShortString64();
+  TestMediumString64();
+  TestLongString64();
+  TestGauntlet64();
+
  TestTinyString2x8();
  TestShortString2x8();
  TestMediumString2x8();