From c2cde6897f804081462a7c5f1f1d1242c373c4d5 Mon Sep 17 00:00:00 2001
From: Doug Thayer <dothayer@mozilla.com>
Date: Fri, 29 Jul 2022 03:26:06 +0000
Subject: [PATCH] Bug 1779807 - Implement memchr64 in AVX2 r=iain

This only makes sense for AVX2, because widening it from a 64-bit comparison
to a 128-bit comparison is hardly worth it, and there are gaps in the SSE2
instruction set (missing _mm_cmpeq_epi64, which is introduced in SSE4.1) that
would require us to compensate and probably take a sizeable perf hit.

Differential Revision: https://phabricator.services.mozilla.com/D152297
---
 mfbt/SIMD.cpp           | 286 ++++++++++++++++++++++------------------
 mfbt/SIMD.h             |   5 +
 mfbt/tests/TestSIMD.cpp | 130 ++++++++++++++++--
 3 files changed, 282 insertions(+), 139 deletions(-)
diff --git a/mfbt/SIMD.cpp b/mfbt/SIMD.cpp
index 9d2bdffb7b28..1f9c5217f0a8 100644
--- a/mfbt/SIMD.cpp
+++ b/mfbt/SIMD.cpp
@@ -14,6 +14,19 @@
 
 namespace mozilla {
 
+template <typename TValue>
+const TValue* FindInBufferNaive(const TValue* ptr, TValue value,
+                                size_t length) {
+  const TValue* end = ptr + length;
+  while (ptr < end) {
+    if (*ptr == value) {
+      return ptr;
+    }
+    ptr++;
+  }
+  return nullptr;
+}
+
 #ifdef MOZILLA_PRESUME_SSE2
 
 #  include <immintrin.h>
@@ -41,22 +54,27 @@ uintptr_t AlignDown32(uintptr_t ptr) { return ptr & ~0x1f; }
 
 uintptr_t AlignUp32(uintptr_t ptr) { return AlignDown32(ptr + 0x1f); }
 
-template <typename CharType>
+template <typename TValue>
 __m128i CmpEq128(__m128i a, __m128i b) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  if (sizeof(CharType) == 1) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
+  if (sizeof(TValue) == 1) {
     return _mm_cmpeq_epi8(a, b);
   }
   return _mm_cmpeq_epi16(a, b);
 }
 
-template <typename CharType>
+template <typename TValue>
 __m256i CmpEq256(__m256i a, __m256i b) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  if (sizeof(CharType) == 1) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
+                sizeof(TValue) == 8);
+  if (sizeof(TValue) == 1) {
     return _mm256_cmpeq_epi8(a, b);
   }
-  return _mm256_cmpeq_epi16(a, b);
+  if (sizeof(TValue) == 2) {
+    return _mm256_cmpeq_epi16(a, b);
+  }
+
+  return _mm256_cmpeq_epi64(a, b);
 }
 
 #  ifdef __GNUC__
@@ -128,17 +146,17 @@ const char* Check4x4Chars(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c,
   return nullptr;
 }
 
-template <typename CharType>
-const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
+template <typename TValue>
+const TValue* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
                             uintptr_t c, uintptr_t d) {
   __m128i haystackA = Load64BitsIntoXMM(a);
-  __m128i cmpA = CmpEq128<CharType>(needle, haystackA);
+  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
   __m128i haystackB = Load64BitsIntoXMM(b);
-  __m128i cmpB = CmpEq128<CharType>(needle, haystackB);
+  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
   __m128i haystackC = Load64BitsIntoXMM(c);
-  __m128i cmpC = CmpEq128<CharType>(needle, haystackC);
+  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
   __m128i haystackD = Load64BitsIntoXMM(d);
-  __m128i cmpD = CmpEq128<CharType>(needle, haystackD);
+  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
   __m128i or_ab = _mm_or_si128(cmpA, cmpB);
   __m128i or_cd = _mm_or_si128(cmpC, cmpD);
   __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
@@ -147,36 +165,36 @@ const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
     int cmpMask;
     cmpMask = _mm_movemask_epi8(cmpA);
     if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpB);
     if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpC);
     if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpD);
     if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
     }
   }
 
   return nullptr;
 }
 
-template <typename CharType>
-const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
-                               uintptr_t c, uintptr_t d) {
+template <typename TValue>
+const TValue* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
+                             uintptr_t c, uintptr_t d) {
   __m128i haystackA = _mm_loadu_si128(Cast128(a));
-  __m128i cmpA = CmpEq128<CharType>(needle, haystackA);
+  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
   __m128i haystackB = _mm_loadu_si128(Cast128(b));
-  __m128i cmpB = CmpEq128<CharType>(needle, haystackB);
+  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
   __m128i haystackC = _mm_loadu_si128(Cast128(c));
-  __m128i cmpC = CmpEq128<CharType>(needle, haystackC);
+  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
   __m128i haystackD = _mm_loadu_si128(Cast128(d));
-  __m128i cmpD = CmpEq128<CharType>(needle, haystackD);
+  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
   __m128i or_ab = _mm_or_si128(cmpA, cmpB);
   __m128i or_cd = _mm_or_si128(cmpC, cmpD);
   __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
@@ -185,36 +203,36 @@ const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
     int cmpMask;
     cmpMask = _mm_movemask_epi8(cmpA);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpB);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpC);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpD);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
     }
   }
 
   return nullptr;
 }
 
-template <typename CharType>
-const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
-                               uintptr_t c, uintptr_t d) {
+template <typename TValue>
+const TValue* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
+                             uintptr_t c, uintptr_t d) {
   __m256i haystackA = _mm256_loadu_si256(Cast256(a));
-  __m256i cmpA = CmpEq256<CharType>(needle, haystackA);
+  __m256i cmpA = CmpEq256<TValue>(needle, haystackA);
   __m256i haystackB = _mm256_loadu_si256(Cast256(b));
-  __m256i cmpB = CmpEq256<CharType>(needle, haystackB);
+  __m256i cmpB = CmpEq256<TValue>(needle, haystackB);
   __m256i haystackC = _mm256_loadu_si256(Cast256(c));
-  __m256i cmpC = CmpEq256<CharType>(needle, haystackC);
+  __m256i cmpC = CmpEq256<TValue>(needle, haystackC);
   __m256i haystackD = _mm256_loadu_si256(Cast256(d));
-  __m256i cmpD = CmpEq256<CharType>(needle, haystackD);
+  __m256i cmpD = CmpEq256<TValue>(needle, haystackD);
   __m256i or_ab = _mm256_or_si256(cmpA, cmpB);
   __m256i or_cd = _mm256_or_si256(cmpC, cmpD);
   __m256i or_abcd = _mm256_or_si256(or_ab, or_cd);
@@ -223,19 +241,19 @@ const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
     int cmpMask;
     cmpMask = _mm256_movemask_epi8(cmpA);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm256_movemask_epi8(cmpB);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm256_movemask_epi8(cmpC);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm256_movemask_epi8(cmpD);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
     }
   }
 
@@ -255,15 +273,15 @@ enum class HaystackOverlap {
 // the next a's 16-byte chunk is needle2. `overlap` and whether
 // `carryIn`/`carryOut` are NULL should be knowable at compile time to avoid
 // branching.
-template <typename CharType>
-const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
-                                 uintptr_t b, __m128i* carryIn,
-                                 __m128i* carryOut, HaystackOverlap overlap) {
-  const int shiftRightAmount = 16 - sizeof(CharType);
-  const int shiftLeftAmount = sizeof(CharType);
+template <typename TValue>
+const TValue* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
+                               uintptr_t b, __m128i* carryIn, __m128i* carryOut,
+                               HaystackOverlap overlap) {
+  const int shiftRightAmount = 16 - sizeof(TValue);
+  const int shiftLeftAmount = sizeof(TValue);
   __m128i haystackA = _mm_loadu_si128(Cast128(a));
-  __m128i cmpA1 = CmpEq128<CharType>(needle1, haystackA);
-  __m128i cmpA2 = CmpEq128<CharType>(needle2, haystackA);
+  __m128i cmpA1 = CmpEq128<TValue>(needle1, haystackA);
+  __m128i cmpA2 = CmpEq128<TValue>(needle2, haystackA);
   __m128i cmpA;
   if (carryIn) {
     cmpA = _mm_and_si128(
@@ -272,8 +290,8 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
     cmpA = _mm_and_si128(_mm_bslli_si128(cmpA1, shiftLeftAmount), cmpA2);
   }
   __m128i haystackB = _mm_loadu_si128(Cast128(b));
-  __m128i cmpB1 = CmpEq128<CharType>(needle1, haystackB);
-  __m128i cmpB2 = CmpEq128<CharType>(needle2, haystackB);
+  __m128i cmpB1 = CmpEq128<TValue>(needle1, haystackB);
+  __m128i cmpB2 = CmpEq128<TValue>(needle2, haystackB);
   __m128i cmpB;
   if (overlap == HaystackOverlap::Overlapping) {
     cmpB = _mm_and_si128(_mm_bslli_si128(cmpB1, shiftLeftAmount), cmpB2);
@@ -289,13 +307,13 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
     int cmpMask;
     cmpMask = _mm_movemask_epi8(cmpA);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask) -
-                                               shiftLeftAmount);
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask) -
+                                             shiftLeftAmount);
     }
     cmpMask = _mm_movemask_epi8(cmpB);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask) -
-                                               shiftLeftAmount);
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask) -
+                                             shiftLeftAmount);
     }
   }
 
@@ -306,13 +324,12 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
   return nullptr;
 }
 
-template <typename CharType>
-const CharType* FindInBuffer(const CharType* ptr, CharType value,
-                             size_t length) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  static_assert(std::is_unsigned<CharType>::value);
+template <typename TValue>
+const TValue* FindInBuffer(const TValue* ptr, TValue value, size_t length) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
+  static_assert(std::is_unsigned<TValue>::value);
   uint64_t splat64;
-  if (sizeof(CharType) == 1) {
+  if (sizeof(TValue) == 1) {
     splat64 = 0x0101010101010101llu;
   } else {
     splat64 = 0x0001000100010001llu;
@@ -323,16 +340,16 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
   int64_t i64_value = *reinterpret_cast<int64_t*>(&u64_value);
   __m128i needle = _mm_set_epi64x(i64_value, i64_value);
 
-  size_t numBytes = length * sizeof(CharType);
+  size_t numBytes = length * sizeof(TValue);
   uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
   uintptr_t end = cur + numBytes;
 
-  if ((sizeof(CharType) > 1 && numBytes < 16) || numBytes < 4) {
+  if ((sizeof(TValue) > 1 && numBytes < 16) || numBytes < 4) {
     while (cur < end) {
-      if (GetAs<CharType>(cur) == value) {
-        return reinterpret_cast<const CharType*>(cur);
+      if (GetAs<TValue>(cur) == value) {
+        return reinterpret_cast<const TValue*>(cur);
       }
-      cur += sizeof(CharType);
+      cur += sizeof(TValue);
     }
     return nullptr;
   }
@@ -352,9 +369,9 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
     uintptr_t c = end - 4 - ((numBytes & 8) >> 1);
     uintptr_t d = end - 4;
     const char* charResult = Check4x4Chars(needle, a, b, c, d);
-    // Note: we ensure above that sizeof(CharType) == 1 here, so this is
+    // Note: we ensure above that sizeof(TValue) == 1 here, so this is
     // either char to char or char to something like a uint8_t.
-    return reinterpret_cast<const CharType*>(charResult);
+    return reinterpret_cast<const TValue*>(charResult);
   }
 
   if (numBytes < 64) {
@@ -364,17 +381,17 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
     uintptr_t b = cur + ((numBytes & 32) >> 1);
     uintptr_t c = end - 16 - ((numBytes & 32) >> 1);
     uintptr_t d = end - 16;
-    return Check4x16Bytes<CharType>(needle, a, b, c, d);
+    return Check4x16Bytes<TValue>(needle, a, b, c, d);
   }
 
   // Get the initial unaligned load out of the way. This will overlap with the
   // aligned stuff below, but the overlapped part should effectively be free
   // (relative to a mispredict from doing a byte-by-byte loop).
   __m128i haystack = _mm_loadu_si128(Cast128(cur));
-  __m128i cmp = CmpEq128<CharType>(needle, haystack);
+  __m128i cmp = CmpEq128<TValue>(needle, haystack);
   int cmpMask = _mm_movemask_epi8(cmp);
   if (cmpMask) {
-    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask));
+    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
   }
 
   // Now we're working with aligned memory. Hooray! \o/
@@ -391,7 +408,7 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
     uintptr_t b = cur + 16;
     uintptr_t c = cur + 32;
     uintptr_t d = cur + 48;
-    const CharType* result = Check4x16Bytes<CharType>(needle, a, b, c, d);
+    const TValue* result = Check4x16Bytes<TValue>(needle, a, b, c, d);
     if (result) {
       return result;
     }
@@ -402,49 +419,53 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
   uintptr_t b = tailStartPtr + 16;
   uintptr_t c = tailStartPtr + 32;
   uintptr_t d = tailEndPtr;
-  return Check4x16Bytes<CharType>(needle, a, b, c, d);
+  return Check4x16Bytes<TValue>(needle, a, b, c, d);
 }
 
-template <typename CharType>
-const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
-                                 size_t length) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  static_assert(std::is_unsigned<CharType>::value);
+template <typename TValue>
+const TValue* FindInBufferAVX2(const TValue* ptr, TValue value, size_t length) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
+                sizeof(TValue) == 8);
+  static_assert(std::is_unsigned<TValue>::value);
 
   // Load our needle into a 32-byte register
   __m256i needle;
-  if (sizeof(CharType) == 1) {
+  if (sizeof(TValue) == 1) {
     needle = _mm256_set1_epi8(value);
-  } else {
+  } else if (sizeof(TValue) == 2) {
     needle = _mm256_set1_epi16(value);
+  } else {
+    needle = _mm256_set1_epi64x(value);
   }
 
-  size_t numBytes = length * sizeof(CharType);
+  size_t numBytes = length * sizeof(TValue);
   uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
   uintptr_t end = cur + numBytes;
 
-  if (numBytes < 8) {
+  if (numBytes < 8 || (sizeof(TValue) == 8 && numBytes < 32)) {
     while (cur < end) {
-      if (GetAs<CharType>(cur) == value) {
-        return reinterpret_cast<const CharType*>(cur);
+      if (GetAs<TValue>(cur) == value) {
+        return reinterpret_cast<const TValue*>(cur);
       }
-      cur += sizeof(CharType);
+      cur += sizeof(TValue);
     }
     return nullptr;
   }
 
-  if (numBytes < 32) {
-    __m128i needle_narrow;
-    if (sizeof(CharType) == 1) {
-      needle_narrow = _mm_set1_epi8(value);
-    } else {
-      needle_narrow = _mm_set1_epi16(value);
+  if constexpr (sizeof(TValue) != 8) {
+    if (numBytes < 32) {
+      __m128i needle_narrow;
+      if (sizeof(TValue) == 1) {
+        needle_narrow = _mm_set1_epi8(value);
+      } else {
+        needle_narrow = _mm_set1_epi16(value);
+      }
+      uintptr_t a = cur;
+      uintptr_t b = cur + ((numBytes & 16) >> 1);
+      uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
+      uintptr_t d = end - 8;
+      return Check4x8Bytes<TValue>(needle_narrow, a, b, c, d);
     }
-    uintptr_t a = cur;
-    uintptr_t b = cur + ((numBytes & 16) >> 1);
-    uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
-    uintptr_t d = end - 8;
-    return Check4x8Bytes<CharType>(needle_narrow, a, b, c, d);
   }
 
   if (numBytes < 128) {
@@ -454,17 +475,17 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
     uintptr_t b = cur + ((numBytes & 64) >> 1);
     uintptr_t c = end - 32 - ((numBytes & 64) >> 1);
     uintptr_t d = end - 32;
-    return Check4x32Bytes<CharType>(needle, a, b, c, d);
+    return Check4x32Bytes<TValue>(needle, a, b, c, d);
   }
 
   // Get the initial unaligned load out of the way. This will overlap with the
   // aligned stuff below, but the overlapped part should effectively be free
   // (relative to a mispredict from doing a byte-by-byte loop).
   __m256i haystack = _mm256_loadu_si256(Cast256(cur));
-  __m256i cmp = CmpEq256<CharType>(needle, haystack);
+  __m256i cmp = CmpEq256<TValue>(needle, haystack);
   int cmpMask = _mm256_movemask_epi8(cmp);
   if (cmpMask) {
-    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask));
+    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
   }
 
   // Now we're working with aligned memory. Hooray! \o/
@@ -478,7 +499,7 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
     uintptr_t b = cur + 32;
     uintptr_t c = cur + 64;
     uintptr_t d = cur + 96;
-    const CharType* result = Check4x32Bytes<CharType>(needle, a, b, c, d);
+    const TValue* result = Check4x32Bytes<TValue>(needle, a, b, c, d);
     if (result) {
       return result;
     }
@@ -489,12 +510,11 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
   uintptr_t b = tailStartPtr + 32;
   uintptr_t c = tailStartPtr + 64;
   uintptr_t d = tailEndPtr;
-  return Check4x32Bytes<CharType>(needle, a, b, c, d);
+  return Check4x32Bytes<TValue>(needle, a, b, c, d);
 }
 
-template <typename CharType>
-const CharType* TwoByteLoop(uintptr_t start, uintptr_t end, CharType v1,
-                            CharType v2);
+template <typename TValue>
+const TValue* TwoByteLoop(uintptr_t start, uintptr_t end, TValue v1, TValue v2);
 
 template <>
 const unsigned char* TwoByteLoop<unsigned char>(uintptr_t start, uintptr_t end,
@@ -533,13 +553,13 @@ const char16_t* TwoByteLoop<char16_t>(uintptr_t start, uintptr_t end,
   return nullptr;
 }
 
-template <typename CharType>
-const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
-                                size_t length) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  static_assert(std::is_unsigned<CharType>::value);
+template <typename TValue>
+const TValue* FindTwoInBuffer(const TValue* ptr, TValue v1, TValue v2,
+                              size_t length) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
+  static_assert(std::is_unsigned<TValue>::value);
   uint64_t splat64;
-  if (sizeof(CharType) == 1) {
+  if (sizeof(TValue) == 1) {
     splat64 = 0x0101010101010101llu;
   } else {
     splat64 = 0x0001000100010001llu;
@@ -553,33 +573,33 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
   int64_t i64_v2 = *reinterpret_cast<int64_t*>(&u64_v2);
   __m128i needle2 = _mm_set_epi64x(i64_v2, i64_v2);
 
-  size_t numBytes = length * sizeof(CharType);
+  size_t numBytes = length * sizeof(TValue);
   uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
   uintptr_t end = cur + numBytes;
 
   if (numBytes < 16) {
-    return TwoByteLoop<CharType>(cur, end, v1, v2);
+    return TwoByteLoop<TValue>(cur, end, v1, v2);
   }
 
   if (numBytes < 32) {
     uintptr_t a = cur;
     uintptr_t b = end - 16;
-    return Check2x2x16Bytes<CharType>(needle1, needle2, a, b, nullptr, nullptr,
-                                      HaystackOverlap::Overlapping);
+    return Check2x2x16Bytes<TValue>(needle1, needle2, a, b, nullptr, nullptr,
+                                    HaystackOverlap::Overlapping);
   }
 
   // Get the initial unaligned load out of the way. This will likely overlap
   // with the aligned stuff below, but the overlapped part should effectively
   // be free.
   __m128i haystack = _mm_loadu_si128(Cast128(cur));
-  __m128i cmp1 = CmpEq128<CharType>(needle1, haystack);
-  __m128i cmp2 = CmpEq128<CharType>(needle2, haystack);
+  __m128i cmp1 = CmpEq128<TValue>(needle1, haystack);
+  __m128i cmp2 = CmpEq128<TValue>(needle2, haystack);
   int cmpMask1 = _mm_movemask_epi8(cmp1);
   int cmpMask2 = _mm_movemask_epi8(cmp2);
-  int cmpMask = (cmpMask1 << sizeof(CharType)) & cmpMask2;
+  int cmpMask = (cmpMask1 << sizeof(TValue)) & cmpMask2;
   if (cmpMask) {
-    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask) -
-                                             sizeof(CharType));
+    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask) -
+                                           sizeof(TValue));
   }
 
   // Now we're working with aligned memory. Hooray! \o/
@@ -595,9 +615,9 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
   while (cur < tailStartPtr) {
     uintptr_t a = cur;
     uintptr_t b = cur + 16;
-    const CharType* result =
-        Check2x2x16Bytes<CharType>(needle1, needle2, a, b, &cmpMaskCarry,
-                                   &cmpMaskCarry, HaystackOverlap::Sequential);
+    const TValue* result =
+        Check2x2x16Bytes<TValue>(needle1, needle2, a, b, &cmpMaskCarry,
+                                 &cmpMaskCarry, HaystackOverlap::Sequential);
     if (result) {
       return result;
     }
@@ -609,8 +629,8 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
   cmpMaskCarry = _mm_and_si128(cmpMaskCarry, wideCarry);
   uintptr_t a = tailStartPtr;
   uintptr_t b = tailEndPtr;
-  return Check2x2x16Bytes<CharType>(needle1, needle2, a, b, &cmpMaskCarry,
-                                    nullptr, HaystackOverlap::Overlapping);
+  return Check2x2x16Bytes<TValue>(needle1, needle2, a, b, &cmpMaskCarry,
+                                  nullptr, HaystackOverlap::Overlapping);
 }
 
 const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
@@ -647,6 +667,14 @@ const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
   return memchr16SSE2(ptr, value, length);
 }
 
+const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value,
+                               size_t length) {
+  if (supports_avx2()) {
+    return FindInBufferAVX2<uint64_t>(ptr, value, length);
+  }
+  return FindInBufferNaive<uint64_t>(ptr, value, length);
+}
+
 const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
   // Signed chars are just really annoying to do bit logic with. Convert to
   // unsigned at the outermost scope so we don't have to worry about it.
@@ -679,14 +707,7 @@ const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
 
 const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
                                size_t length) {
-  const char16_t* end = ptr + length;
-  while (ptr < end) {
-    if (*ptr == value) {
-      return ptr;
-    }
-    ptr++;
-  }
-  return nullptr;
+  return FindInBufferNaive<char16_t>(ptr, value, length);
 }
 
 const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
@@ -694,6 +715,11 @@ const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
   return memchr16(ptr, value, length);
 }
 
+const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value,
+                               size_t length) {
+  return FindInBufferNaive<uint64_t>(ptr, value, length);
+}
+
 const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
   const char* end = ptr + length - 1;
   while (ptr < end) {
diff --git a/mfbt/SIMD.h b/mfbt/SIMD.h
index b3446b33e709..b42bd779c8c1 100644
--- a/mfbt/SIMD.h
+++ b/mfbt/SIMD.h
@@ -46,6 +46,11 @@ class SIMD {
   static MFBT_API const char16_t* memchr16SSE2(const char16_t* ptr,
                                                char16_t value, size_t length);
 
+  // Search through `ptr[0..length]` for the first occurrence of `value` and
+  // return the pointer to it, or nullptr if it cannot be found.
+  static MFBT_API const uint64_t* memchr64(const uint64_t* ptr, uint64_t value,
+                                           size_t length);
+
   // Search through `ptr[0..length]` for the first occurrence of `v1` which is
   // immediately followed by `v2` and return the pointer to the occurrence of
   // `v1`.
diff --git a/mfbt/tests/TestSIMD.cpp b/mfbt/tests/TestSIMD.cpp
index 5ea718c10ad9..23dc8b01174d 100644
--- a/mfbt/tests/TestSIMD.cpp
+++ b/mfbt/tests/TestSIMD.cpp
@@ -105,8 +105,8 @@ void TestLongString() {
     MOZ_RELEASE_ASSERT(
         SIMD::memchr8SSE2(test, static_cast<char>(i), count - 1) == test + i);
   }
-  MOZ_RELEASE_ASSERT(SIMD::memchr8(test, static_cast<char>(count - 1),
-                                       count - 1) == nullptr);
+  MOZ_RELEASE_ASSERT(
+      SIMD::memchr8(test, static_cast<char>(count - 1), count - 1) == nullptr);
 }
 
 void TestGauntlet() {
@@ -124,8 +124,8 @@ void TestGauntlet() {
           if (j >= k && j < i) {
             expected = test + j;
           }
-          MOZ_RELEASE_ASSERT(SIMD::memchr8(test + k, static_cast<char>(j),
-                                               i - k) == expected);
+          MOZ_RELEASE_ASSERT(
+              SIMD::memchr8(test + k, static_cast<char>(j), i - k) == expected);
           MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test + k, static_cast<char>(j),
                                                i - k) == expected);
         }
@@ -221,8 +221,8 @@ void TestLongString16() {
   }
 
   for (size_t i = 0; i < count - 1; ++i) {
-    MOZ_RELEASE_ASSERT(SIMD::memchr16(test, static_cast<char16_t>(i),
-                                          count - 1) == test + i);
+    MOZ_RELEASE_ASSERT(
+        SIMD::memchr16(test, static_cast<char16_t>(i), count - 1) == test + i);
     MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, static_cast<char16_t>(i),
                                           count - 1) == test + i);
   }
@@ -245,9 +245,8 @@ void TestGauntlet16() {
           if (j >= k && j < i) {
             expected = test + j;
           }
-          MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k,
-                                                static_cast<char16_t>(j),
-                                                i - k) == expected);
+          MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k, static_cast<char16_t>(j),
+                                            i - k) == expected);
           MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test + k,
                                                 static_cast<char16_t>(j),
                                                 i - k) == expected);
@@ -257,6 +256,113 @@ void TestGauntlet16() {
   }
 }
 
+void TestTinyString64() {
+  const uint64_t test[4] = {0, 1, 2, 3};
+
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 3) == test + 0x0);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 3) == test + 0x1);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 3) == test + 0x2);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 3) == nullptr);
+}
+
+void TestShortString64() {
+  const uint64_t test[16] = {0, 1, 2,  3,  4,  5,  6,  7,
+                             8, 9, 10, 11, 12, 13, 14, 15};
+
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 15) == test + 0);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 15) == test + 1);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 15) == test + 2);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 15) == test + 3);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 4, 15) == test + 4);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 5, 15) == test + 5);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 6, 15) == test + 6);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 7, 15) == test + 7);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 8, 15) == test + 8);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 15) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 15) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 10, 15) == test + 10);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 11, 15) == test + 11);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 12, 15) == test + 12);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 13, 15) == test + 13);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 14, 15) == test + 14);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 15, 15) == nullptr);
+}
+
+void TestMediumString64() {
+  const uint64_t test[32] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
+                             11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                             22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 31) == test + 0);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 31) == test + 1);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 31) == test + 2);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 31) == test + 3);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 4, 31) == test + 4);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 5, 31) == test + 5);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 6, 31) == test + 6);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 7, 31) == test + 7);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 8, 31) == test + 8);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 31) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 31) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 10, 31) == test + 10);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 11, 31) == test + 11);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 12, 31) == test + 12);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 13, 31) == test + 13);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 14, 31) == test + 14);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 15, 31) == test + 15);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 16, 31) == test + 16);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 17, 31) == test + 17);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 18, 31) == test + 18);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 19, 31) == test + 19);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 20, 31) == test + 20);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 21, 31) == test + 21);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 22, 31) == test + 22);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 23, 31) == test + 23);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 24, 31) == test + 24);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 25, 31) == test + 25);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 26, 31) == test + 26);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 27, 31) == test + 27);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 28, 31) == test + 28);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 29, 31) == test + 29);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 30, 31) == test + 30);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 31, 31) == nullptr);
+}
+
+void TestLongString64() {
+  const size_t count = 256;
+  uint64_t test[count];
+  for (size_t i = 0; i < count; ++i) {
+    test[i] = i;
+  }
+
+  for (uint64_t i = 0; i < count - 1; ++i) {
+    MOZ_RELEASE_ASSERT(SIMD::memchr64(test, i, count - 1) == test + i);
+  }
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, count - 1, count - 1) == nullptr);
+}
+
+void TestGauntlet64() {
+  const size_t count = 257;
+  uint64_t test[count];
+  for (size_t i = 0; i < count; ++i) {
+    test[i] = i;
+  }
+
+  for (uint64_t i = 0; i < count - 1; ++i) {
+    for (uint64_t j = 0; j < count - 1; ++j) {
+      for (uint64_t k = 0; k < count - 1; ++k) {
+        if (i >= k) {
+          const uint64_t* expected = nullptr;
+          if (j >= k && j < i) {
+            expected = test + j;
+          }
+          MOZ_RELEASE_ASSERT(SIMD::memchr64(test + k, j, i - k) == expected);
+        }
+      }
+    }
+  }
+}
+
 void TestTinyString2x8() {
   const char* test = "012\n";
 
@@ -498,6 +604,12 @@ int main(void) {
   TestLongString16();
   TestGauntlet16();
 
+  TestTinyString64();
+  TestShortString64();
+  TestMediumString64();
+  TestLongString64();
+  TestGauntlet64();
+
   TestTinyString2x8();
   TestShortString2x8();
   TestMediumString2x8();