Bug 1779807 - Implement memchr64 in AVX2 r=iain

This only makes sense for AVX2, because widening it from a 64-bit comparison
to a 128-bit comparison is hardly worth it, and there are gaps in the SSE2
instruction set (missing _mm_cmpeq_epi64, which is introduced in SSE4.1) that
would require us to compensate and probably take a sizeable perf hit.

Differential Revision: https://phabricator.services.mozilla.com/D152297
This commit is contained in:
Doug Thayer 2022-07-29 03:26:06 +00:00
parent 1f10d44d74
commit c2cde6897f
3 changed files with 282 additions and 139 deletions

View File

@ -14,6 +14,19 @@
namespace mozilla {
// Scalar fallback search. Walks the `length` elements starting at `ptr` from
// front to back and returns a pointer to the first element that compares equal
// to `value`. Returns nullptr when no element matches, which includes the case
// where `length` is zero.
template <typename TValue>
const TValue* FindInBufferNaive(const TValue* ptr, TValue value,
                                size_t length) {
  for (size_t index = 0; index < length; ++index) {
    if (ptr[index] == value) {
      return ptr + index;
    }
  }
  return nullptr;
}
#ifdef MOZILLA_PRESUME_SSE2
# include <immintrin.h>
@ -41,22 +54,27 @@ uintptr_t AlignDown32(uintptr_t ptr) { return ptr & ~0x1f; }
uintptr_t AlignUp32(uintptr_t ptr) { return AlignDown32(ptr + 0x1f); }
template <typename CharType>
template <typename TValue>
__m128i CmpEq128(__m128i a, __m128i b) {
static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
if (sizeof(CharType) == 1) {
static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
if (sizeof(TValue) == 1) {
return _mm_cmpeq_epi8(a, b);
}
return _mm_cmpeq_epi16(a, b);
}
template <typename CharType>
template <typename TValue>
__m256i CmpEq256(__m256i a, __m256i b) {
static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
if (sizeof(CharType) == 1) {
static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
sizeof(TValue) == 8);
if (sizeof(TValue) == 1) {
return _mm256_cmpeq_epi8(a, b);
}
return _mm256_cmpeq_epi16(a, b);
if (sizeof(TValue) == 2) {
return _mm256_cmpeq_epi16(a, b);
}
return _mm256_cmpeq_epi64(a, b);
}
# ifdef __GNUC__
@ -128,17 +146,17 @@ const char* Check4x4Chars(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c,
return nullptr;
}
template <typename CharType>
const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
template <typename TValue>
const TValue* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
uintptr_t c, uintptr_t d) {
__m128i haystackA = Load64BitsIntoXMM(a);
__m128i cmpA = CmpEq128<CharType>(needle, haystackA);
__m128i cmpA = CmpEq128<TValue>(needle, haystackA);
__m128i haystackB = Load64BitsIntoXMM(b);
__m128i cmpB = CmpEq128<CharType>(needle, haystackB);
__m128i cmpB = CmpEq128<TValue>(needle, haystackB);
__m128i haystackC = Load64BitsIntoXMM(c);
__m128i cmpC = CmpEq128<CharType>(needle, haystackC);
__m128i cmpC = CmpEq128<TValue>(needle, haystackC);
__m128i haystackD = Load64BitsIntoXMM(d);
__m128i cmpD = CmpEq128<CharType>(needle, haystackD);
__m128i cmpD = CmpEq128<TValue>(needle, haystackD);
__m128i or_ab = _mm_or_si128(cmpA, cmpB);
__m128i or_cd = _mm_or_si128(cmpC, cmpD);
__m128i or_abcd = _mm_or_si128(or_ab, or_cd);
@ -147,36 +165,36 @@ const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
int cmpMask;
cmpMask = _mm_movemask_epi8(cmpA);
if (cmpMask & 0xff) {
return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
}
cmpMask = _mm_movemask_epi8(cmpB);
if (cmpMask & 0xff) {
return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
}
cmpMask = _mm_movemask_epi8(cmpC);
if (cmpMask & 0xff) {
return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
}
cmpMask = _mm_movemask_epi8(cmpD);
if (cmpMask & 0xff) {
return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
}
}
return nullptr;
}
template <typename CharType>
const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
uintptr_t c, uintptr_t d) {
template <typename TValue>
const TValue* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
uintptr_t c, uintptr_t d) {
__m128i haystackA = _mm_loadu_si128(Cast128(a));
__m128i cmpA = CmpEq128<CharType>(needle, haystackA);
__m128i cmpA = CmpEq128<TValue>(needle, haystackA);
__m128i haystackB = _mm_loadu_si128(Cast128(b));
__m128i cmpB = CmpEq128<CharType>(needle, haystackB);
__m128i cmpB = CmpEq128<TValue>(needle, haystackB);
__m128i haystackC = _mm_loadu_si128(Cast128(c));
__m128i cmpC = CmpEq128<CharType>(needle, haystackC);
__m128i cmpC = CmpEq128<TValue>(needle, haystackC);
__m128i haystackD = _mm_loadu_si128(Cast128(d));
__m128i cmpD = CmpEq128<CharType>(needle, haystackD);
__m128i cmpD = CmpEq128<TValue>(needle, haystackD);
__m128i or_ab = _mm_or_si128(cmpA, cmpB);
__m128i or_cd = _mm_or_si128(cmpC, cmpD);
__m128i or_abcd = _mm_or_si128(or_ab, or_cd);
@ -185,36 +203,36 @@ const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
int cmpMask;
cmpMask = _mm_movemask_epi8(cmpA);
if (cmpMask) {
return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
}
cmpMask = _mm_movemask_epi8(cmpB);
if (cmpMask) {
return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
}
cmpMask = _mm_movemask_epi8(cmpC);
if (cmpMask) {
return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
}
cmpMask = _mm_movemask_epi8(cmpD);
if (cmpMask) {
return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
}
}
return nullptr;
}
template <typename CharType>
const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
uintptr_t c, uintptr_t d) {
template <typename TValue>
const TValue* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
uintptr_t c, uintptr_t d) {
__m256i haystackA = _mm256_loadu_si256(Cast256(a));
__m256i cmpA = CmpEq256<CharType>(needle, haystackA);
__m256i cmpA = CmpEq256<TValue>(needle, haystackA);
__m256i haystackB = _mm256_loadu_si256(Cast256(b));
__m256i cmpB = CmpEq256<CharType>(needle, haystackB);
__m256i cmpB = CmpEq256<TValue>(needle, haystackB);
__m256i haystackC = _mm256_loadu_si256(Cast256(c));
__m256i cmpC = CmpEq256<CharType>(needle, haystackC);
__m256i cmpC = CmpEq256<TValue>(needle, haystackC);
__m256i haystackD = _mm256_loadu_si256(Cast256(d));
__m256i cmpD = CmpEq256<CharType>(needle, haystackD);
__m256i cmpD = CmpEq256<TValue>(needle, haystackD);
__m256i or_ab = _mm256_or_si256(cmpA, cmpB);
__m256i or_cd = _mm256_or_si256(cmpC, cmpD);
__m256i or_abcd = _mm256_or_si256(or_ab, or_cd);
@ -223,19 +241,19 @@ const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
int cmpMask;
cmpMask = _mm256_movemask_epi8(cmpA);
if (cmpMask) {
return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
}
cmpMask = _mm256_movemask_epi8(cmpB);
if (cmpMask) {
return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
}
cmpMask = _mm256_movemask_epi8(cmpC);
if (cmpMask) {
return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
}
cmpMask = _mm256_movemask_epi8(cmpD);
if (cmpMask) {
return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
}
}
@ -255,15 +273,15 @@ enum class HaystackOverlap {
// the next a's 16-byte chunk is needle2. `overlap` and whether
// `carryIn`/`carryOut` are NULL should be knowable at compile time to avoid
// branching.
template <typename CharType>
const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
uintptr_t b, __m128i* carryIn,
__m128i* carryOut, HaystackOverlap overlap) {
const int shiftRightAmount = 16 - sizeof(CharType);
const int shiftLeftAmount = sizeof(CharType);
template <typename TValue>
const TValue* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
uintptr_t b, __m128i* carryIn, __m128i* carryOut,
HaystackOverlap overlap) {
const int shiftRightAmount = 16 - sizeof(TValue);
const int shiftLeftAmount = sizeof(TValue);
__m128i haystackA = _mm_loadu_si128(Cast128(a));
__m128i cmpA1 = CmpEq128<CharType>(needle1, haystackA);
__m128i cmpA2 = CmpEq128<CharType>(needle2, haystackA);
__m128i cmpA1 = CmpEq128<TValue>(needle1, haystackA);
__m128i cmpA2 = CmpEq128<TValue>(needle2, haystackA);
__m128i cmpA;
if (carryIn) {
cmpA = _mm_and_si128(
@ -272,8 +290,8 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
cmpA = _mm_and_si128(_mm_bslli_si128(cmpA1, shiftLeftAmount), cmpA2);
}
__m128i haystackB = _mm_loadu_si128(Cast128(b));
__m128i cmpB1 = CmpEq128<CharType>(needle1, haystackB);
__m128i cmpB2 = CmpEq128<CharType>(needle2, haystackB);
__m128i cmpB1 = CmpEq128<TValue>(needle1, haystackB);
__m128i cmpB2 = CmpEq128<TValue>(needle2, haystackB);
__m128i cmpB;
if (overlap == HaystackOverlap::Overlapping) {
cmpB = _mm_and_si128(_mm_bslli_si128(cmpB1, shiftLeftAmount), cmpB2);
@ -289,13 +307,13 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
int cmpMask;
cmpMask = _mm_movemask_epi8(cmpA);
if (cmpMask) {
return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask) -
shiftLeftAmount);
return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask) -
shiftLeftAmount);
}
cmpMask = _mm_movemask_epi8(cmpB);
if (cmpMask) {
return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask) -
shiftLeftAmount);
return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask) -
shiftLeftAmount);
}
}
@ -306,13 +324,12 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
return nullptr;
}
template <typename CharType>
const CharType* FindInBuffer(const CharType* ptr, CharType value,
size_t length) {
static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
static_assert(std::is_unsigned<CharType>::value);
template <typename TValue>
const TValue* FindInBuffer(const TValue* ptr, TValue value, size_t length) {
static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
static_assert(std::is_unsigned<TValue>::value);
uint64_t splat64;
if (sizeof(CharType) == 1) {
if (sizeof(TValue) == 1) {
splat64 = 0x0101010101010101llu;
} else {
splat64 = 0x0001000100010001llu;
@ -323,16 +340,16 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
int64_t i64_value = *reinterpret_cast<int64_t*>(&u64_value);
__m128i needle = _mm_set_epi64x(i64_value, i64_value);
size_t numBytes = length * sizeof(CharType);
size_t numBytes = length * sizeof(TValue);
uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
uintptr_t end = cur + numBytes;
if ((sizeof(CharType) > 1 && numBytes < 16) || numBytes < 4) {
if ((sizeof(TValue) > 1 && numBytes < 16) || numBytes < 4) {
while (cur < end) {
if (GetAs<CharType>(cur) == value) {
return reinterpret_cast<const CharType*>(cur);
if (GetAs<TValue>(cur) == value) {
return reinterpret_cast<const TValue*>(cur);
}
cur += sizeof(CharType);
cur += sizeof(TValue);
}
return nullptr;
}
@ -352,9 +369,9 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
uintptr_t c = end - 4 - ((numBytes & 8) >> 1);
uintptr_t d = end - 4;
const char* charResult = Check4x4Chars(needle, a, b, c, d);
// Note: we ensure above that sizeof(CharType) == 1 here, so this is
// Note: we ensure above that sizeof(TValue) == 1 here, so this is
// either char to char or char to something like a uint8_t.
return reinterpret_cast<const CharType*>(charResult);
return reinterpret_cast<const TValue*>(charResult);
}
if (numBytes < 64) {
@ -364,17 +381,17 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
uintptr_t b = cur + ((numBytes & 32) >> 1);
uintptr_t c = end - 16 - ((numBytes & 32) >> 1);
uintptr_t d = end - 16;
return Check4x16Bytes<CharType>(needle, a, b, c, d);
return Check4x16Bytes<TValue>(needle, a, b, c, d);
}
// Get the initial unaligned load out of the way. This will overlap with the
// aligned stuff below, but the overlapped part should effectively be free
// (relative to a mispredict from doing a byte-by-byte loop).
__m128i haystack = _mm_loadu_si128(Cast128(cur));
__m128i cmp = CmpEq128<CharType>(needle, haystack);
__m128i cmp = CmpEq128<TValue>(needle, haystack);
int cmpMask = _mm_movemask_epi8(cmp);
if (cmpMask) {
return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
}
// Now we're working with aligned memory. Hooray! \o/
@ -391,7 +408,7 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
uintptr_t b = cur + 16;
uintptr_t c = cur + 32;
uintptr_t d = cur + 48;
const CharType* result = Check4x16Bytes<CharType>(needle, a, b, c, d);
const TValue* result = Check4x16Bytes<TValue>(needle, a, b, c, d);
if (result) {
return result;
}
@ -402,49 +419,53 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
uintptr_t b = tailStartPtr + 16;
uintptr_t c = tailStartPtr + 32;
uintptr_t d = tailEndPtr;
return Check4x16Bytes<CharType>(needle, a, b, c, d);
return Check4x16Bytes<TValue>(needle, a, b, c, d);
}
template <typename CharType>
const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
size_t length) {
static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
static_assert(std::is_unsigned<CharType>::value);
template <typename TValue>
const TValue* FindInBufferAVX2(const TValue* ptr, TValue value, size_t length) {
static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
sizeof(TValue) == 8);
static_assert(std::is_unsigned<TValue>::value);
// Load our needle into a 32-byte register
__m256i needle;
if (sizeof(CharType) == 1) {
if (sizeof(TValue) == 1) {
needle = _mm256_set1_epi8(value);
} else {
} else if (sizeof(TValue) == 2) {
needle = _mm256_set1_epi16(value);
} else {
needle = _mm256_set1_epi64x(value);
}
size_t numBytes = length * sizeof(CharType);
size_t numBytes = length * sizeof(TValue);
uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
uintptr_t end = cur + numBytes;
if (numBytes < 8) {
if (numBytes < 8 || (sizeof(TValue) == 8 && numBytes < 32)) {
while (cur < end) {
if (GetAs<CharType>(cur) == value) {
return reinterpret_cast<const CharType*>(cur);
if (GetAs<TValue>(cur) == value) {
return reinterpret_cast<const TValue*>(cur);
}
cur += sizeof(CharType);
cur += sizeof(TValue);
}
return nullptr;
}
if (numBytes < 32) {
__m128i needle_narrow;
if (sizeof(CharType) == 1) {
needle_narrow = _mm_set1_epi8(value);
} else {
needle_narrow = _mm_set1_epi16(value);
if constexpr (sizeof(TValue) != 8) {
if (numBytes < 32) {
__m128i needle_narrow;
if (sizeof(TValue) == 1) {
needle_narrow = _mm_set1_epi8(value);
} else {
needle_narrow = _mm_set1_epi16(value);
}
uintptr_t a = cur;
uintptr_t b = cur + ((numBytes & 16) >> 1);
uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
uintptr_t d = end - 8;
return Check4x8Bytes<TValue>(needle_narrow, a, b, c, d);
}
uintptr_t a = cur;
uintptr_t b = cur + ((numBytes & 16) >> 1);
uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
uintptr_t d = end - 8;
return Check4x8Bytes<CharType>(needle_narrow, a, b, c, d);
}
if (numBytes < 128) {
@ -454,17 +475,17 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
uintptr_t b = cur + ((numBytes & 64) >> 1);
uintptr_t c = end - 32 - ((numBytes & 64) >> 1);
uintptr_t d = end - 32;
return Check4x32Bytes<CharType>(needle, a, b, c, d);
return Check4x32Bytes<TValue>(needle, a, b, c, d);
}
// Get the initial unaligned load out of the way. This will overlap with the
// aligned stuff below, but the overlapped part should effectively be free
// (relative to a mispredict from doing a byte-by-byte loop).
__m256i haystack = _mm256_loadu_si256(Cast256(cur));
__m256i cmp = CmpEq256<CharType>(needle, haystack);
__m256i cmp = CmpEq256<TValue>(needle, haystack);
int cmpMask = _mm256_movemask_epi8(cmp);
if (cmpMask) {
return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask));
return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
}
// Now we're working with aligned memory. Hooray! \o/
@ -478,7 +499,7 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
uintptr_t b = cur + 32;
uintptr_t c = cur + 64;
uintptr_t d = cur + 96;
const CharType* result = Check4x32Bytes<CharType>(needle, a, b, c, d);
const TValue* result = Check4x32Bytes<TValue>(needle, a, b, c, d);
if (result) {
return result;
}
@ -489,12 +510,11 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
uintptr_t b = tailStartPtr + 32;
uintptr_t c = tailStartPtr + 64;
uintptr_t d = tailEndPtr;
return Check4x32Bytes<CharType>(needle, a, b, c, d);
return Check4x32Bytes<TValue>(needle, a, b, c, d);
}
template <typename CharType>
const CharType* TwoByteLoop(uintptr_t start, uintptr_t end, CharType v1,
CharType v2);
template <typename TValue>
const TValue* TwoByteLoop(uintptr_t start, uintptr_t end, TValue v1, TValue v2);
template <>
const unsigned char* TwoByteLoop<unsigned char>(uintptr_t start, uintptr_t end,
@ -533,13 +553,13 @@ const char16_t* TwoByteLoop<char16_t>(uintptr_t start, uintptr_t end,
return nullptr;
}
template <typename CharType>
const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
size_t length) {
static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
static_assert(std::is_unsigned<CharType>::value);
template <typename TValue>
const TValue* FindTwoInBuffer(const TValue* ptr, TValue v1, TValue v2,
size_t length) {
static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
static_assert(std::is_unsigned<TValue>::value);
uint64_t splat64;
if (sizeof(CharType) == 1) {
if (sizeof(TValue) == 1) {
splat64 = 0x0101010101010101llu;
} else {
splat64 = 0x0001000100010001llu;
@ -553,33 +573,33 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
int64_t i64_v2 = *reinterpret_cast<int64_t*>(&u64_v2);
__m128i needle2 = _mm_set_epi64x(i64_v2, i64_v2);
size_t numBytes = length * sizeof(CharType);
size_t numBytes = length * sizeof(TValue);
uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
uintptr_t end = cur + numBytes;
if (numBytes < 16) {
return TwoByteLoop<CharType>(cur, end, v1, v2);
return TwoByteLoop<TValue>(cur, end, v1, v2);
}
if (numBytes < 32) {
uintptr_t a = cur;
uintptr_t b = end - 16;
return Check2x2x16Bytes<CharType>(needle1, needle2, a, b, nullptr, nullptr,
HaystackOverlap::Overlapping);
return Check2x2x16Bytes<TValue>(needle1, needle2, a, b, nullptr, nullptr,
HaystackOverlap::Overlapping);
}
// Get the initial unaligned load out of the way. This will likely overlap
// with the aligned stuff below, but the overlapped part should effectively
// be free.
__m128i haystack = _mm_loadu_si128(Cast128(cur));
__m128i cmp1 = CmpEq128<CharType>(needle1, haystack);
__m128i cmp2 = CmpEq128<CharType>(needle2, haystack);
__m128i cmp1 = CmpEq128<TValue>(needle1, haystack);
__m128i cmp2 = CmpEq128<TValue>(needle2, haystack);
int cmpMask1 = _mm_movemask_epi8(cmp1);
int cmpMask2 = _mm_movemask_epi8(cmp2);
int cmpMask = (cmpMask1 << sizeof(CharType)) & cmpMask2;
int cmpMask = (cmpMask1 << sizeof(TValue)) & cmpMask2;
if (cmpMask) {
return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask) -
sizeof(CharType));
return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask) -
sizeof(TValue));
}
// Now we're working with aligned memory. Hooray! \o/
@ -595,9 +615,9 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
while (cur < tailStartPtr) {
uintptr_t a = cur;
uintptr_t b = cur + 16;
const CharType* result =
Check2x2x16Bytes<CharType>(needle1, needle2, a, b, &cmpMaskCarry,
&cmpMaskCarry, HaystackOverlap::Sequential);
const TValue* result =
Check2x2x16Bytes<TValue>(needle1, needle2, a, b, &cmpMaskCarry,
&cmpMaskCarry, HaystackOverlap::Sequential);
if (result) {
return result;
}
@ -609,8 +629,8 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
cmpMaskCarry = _mm_and_si128(cmpMaskCarry, wideCarry);
uintptr_t a = tailStartPtr;
uintptr_t b = tailEndPtr;
return Check2x2x16Bytes<CharType>(needle1, needle2, a, b, &cmpMaskCarry,
nullptr, HaystackOverlap::Overlapping);
return Check2x2x16Bytes<TValue>(needle1, needle2, a, b, &cmpMaskCarry,
nullptr, HaystackOverlap::Overlapping);
}
const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
@ -647,6 +667,14 @@ const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
return memchr16SSE2(ptr, value, length);
}
// Returns a pointer to the first element of `ptr[0..length]` equal to `value`,
// or nullptr if there is none. Dispatches to the AVX2 implementation when
// supports_avx2() returns true and to the scalar FindInBufferNaive loop
// otherwise.
const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value,
                               size_t length) {
  return supports_avx2() ? FindInBufferAVX2<uint64_t>(ptr, value, length)
                         : FindInBufferNaive<uint64_t>(ptr, value, length);
}
const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
// Signed chars are just really annoying to do bit logic with. Convert to
// unsigned at the outermost scope so we don't have to worry about it.
@ -679,14 +707,7 @@ const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
size_t length) {
const char16_t* end = ptr + length;
while (ptr < end) {
if (*ptr == value) {
return ptr;
}
ptr++;
}
return nullptr;
return FindInBufferNaive<char16_t>(ptr, value, length);
}
const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
@ -694,6 +715,11 @@ const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
return memchr16(ptr, value, length);
}
// Returns a pointer to the first element of `ptr[0..length]` equal to `value`,
// or nullptr if there is none. This definition always uses the scalar
// FindInBufferNaive loop.
const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value,
                               size_t length) {
  // TValue is deduced as uint64_t from both `ptr` and `value`.
  const uint64_t* found = FindInBufferNaive(ptr, value, length);
  return found;
}
const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
const char* end = ptr + length - 1;
while (ptr < end) {

View File

@ -46,6 +46,11 @@ class SIMD {
static MFBT_API const char16_t* memchr16SSE2(const char16_t* ptr,
char16_t value, size_t length);
// Search through `ptr[0..length]` for the first occurrence of `value` and
// return the pointer to it, or nullptr if it cannot be found.
static MFBT_API const uint64_t* memchr64(const uint64_t* ptr, uint64_t value,
size_t length);
// Search through `ptr[0..length]` for the first occurrence of `v1` which is
// immediately followed by `v2` and return the pointer to the occurrence of
// `v1`.

View File

@ -105,8 +105,8 @@ void TestLongString() {
MOZ_RELEASE_ASSERT(
SIMD::memchr8SSE2(test, static_cast<char>(i), count - 1) == test + i);
}
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, static_cast<char>(count - 1),
count - 1) == nullptr);
MOZ_RELEASE_ASSERT(
SIMD::memchr8(test, static_cast<char>(count - 1), count - 1) == nullptr);
}
void TestGauntlet() {
@ -124,8 +124,8 @@ void TestGauntlet() {
if (j >= k && j < i) {
expected = test + j;
}
MOZ_RELEASE_ASSERT(SIMD::memchr8(test + k, static_cast<char>(j),
i - k) == expected);
MOZ_RELEASE_ASSERT(
SIMD::memchr8(test + k, static_cast<char>(j), i - k) == expected);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test + k, static_cast<char>(j),
i - k) == expected);
}
@ -221,8 +221,8 @@ void TestLongString16() {
}
for (size_t i = 0; i < count - 1; ++i) {
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, static_cast<char16_t>(i),
count - 1) == test + i);
MOZ_RELEASE_ASSERT(
SIMD::memchr16(test, static_cast<char16_t>(i), count - 1) == test + i);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, static_cast<char16_t>(i),
count - 1) == test + i);
}
@ -245,9 +245,8 @@ void TestGauntlet16() {
if (j >= k && j < i) {
expected = test + j;
}
MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k,
static_cast<char16_t>(j),
i - k) == expected);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k, static_cast<char16_t>(j),
i - k) == expected);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test + k,
static_cast<char16_t>(j),
i - k) == expected);
@ -257,6 +256,113 @@ void TestGauntlet16() {
}
}
// Four-element haystack searched with a length of three: each of the first
// three values must be found at its own index, and the fourth value lies
// outside the searched range so it must not be found.
void TestTinyString64() {
  const uint64_t test[4] = {0, 1, 2, 3};
  const size_t searchLen = 3;
  for (uint64_t needle = 0; needle < searchLen; ++needle) {
    MOZ_RELEASE_ASSERT(SIMD::memchr64(test, needle, searchLen) ==
                       test + needle);
  }
  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, searchLen) == nullptr);
}
// Sixteen-element haystack (test[i] == i) searched with a length of fifteen.
// Every value inside the searched range must be found at its own index; the
// final value sits just past the searched range and must not be found.
// Each needle is checked exactly once (the unrolled version asserted the
// needle 9 twice).
void TestShortString64() {
  const uint64_t test[16] = {0, 1, 2,  3,  4,  5,  6,  7,
                             8, 9, 10, 11, 12, 13, 14, 15};
  const size_t searchLen = 15;
  for (uint64_t needle = 0; needle < searchLen; ++needle) {
    MOZ_RELEASE_ASSERT(SIMD::memchr64(test, needle, searchLen) ==
                       test + needle);
  }
  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 15, searchLen) == nullptr);
}
// Thirty-two-element haystack (test[i] == i) searched with a length of
// thirty-one. Every value inside the searched range must be found at its own
// index; the final value sits just past the searched range and must not be
// found. Each needle is checked exactly once (the unrolled version asserted
// the needle 9 twice).
void TestMediumString64() {
  const uint64_t test[32] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
                             11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
                             22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
  const size_t searchLen = 31;
  for (uint64_t needle = 0; needle < searchLen; ++needle) {
    MOZ_RELEASE_ASSERT(SIMD::memchr64(test, needle, searchLen) ==
                       test + needle);
  }
  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 31, searchLen) == nullptr);
}
// 256-element haystack where element i holds the value i, searched with a
// length of 255. Each value below 255 must be found at its own index; the
// value 255 is stored only in the final element, outside the searched range,
// so it must not be found.
void TestLongString64() {
  constexpr size_t kCount = 256;
  constexpr size_t kSearchLen = kCount - 1;

  uint64_t haystack[kCount];
  for (size_t idx = 0; idx < kCount; ++idx) {
    haystack[idx] = idx;
  }

  uint64_t needle = 0;
  while (needle < kSearchLen) {
    MOZ_RELEASE_ASSERT(SIMD::memchr64(haystack, needle, kSearchLen) ==
                       haystack + needle);
    ++needle;
  }

  MOZ_RELEASE_ASSERT(SIMD::memchr64(haystack, kSearchLen, kSearchLen) ==
                     nullptr);
}
// Exhaustive window test over a 257-element haystack where element i holds
// the value i. For every window [start, stop) with start <= stop < 256 and
// every needle below 256, memchr64 must return the needle's own slot when the
// needle lies inside the window and nullptr otherwise.
void TestGauntlet64() {
  constexpr size_t kCount = 257;
  constexpr uint64_t kLimit = kCount - 1;

  uint64_t haystack[kCount];
  for (size_t idx = 0; idx < kCount; ++idx) {
    haystack[idx] = idx;
  }

  for (uint64_t stop = 0; stop < kLimit; ++stop) {
    for (uint64_t needle = 0; needle < kLimit; ++needle) {
      // Only windows whose start does not exceed `stop` are exercised.
      for (uint64_t start = 0; start <= stop; ++start) {
        const bool inWindow = needle >= start && needle < stop;
        const uint64_t* expected = inWindow ? haystack + needle : nullptr;
        MOZ_RELEASE_ASSERT(SIMD::memchr64(haystack + start, needle,
                                          stop - start) == expected);
      }
    }
  }
}
void TestTinyString2x8() {
const char* test = "012\n";
@ -498,6 +604,12 @@ int main(void) {
TestLongString16();
TestGauntlet16();
TestTinyString64();
TestShortString64();
TestMediumString64();
TestLongString64();
TestGauntlet64();
TestTinyString2x8();
TestShortString2x8();
TestMediumString2x8();