From c2cde6897f804081462a7c5f1f1d1242c373c4d5 Mon Sep 17 00:00:00 2001 From: Doug Thayer Date: Fri, 29 Jul 2022 03:26:06 +0000 Subject: [PATCH] Bug 1779807 - Implement memchr64 in AVX2 r=iain This only makes sense for AVX2, because widening it from a 64-bit comparison to a 128-bit comparison is hardly worth it, and there are gaps in the SSE2 instruction set (missing _mm_cmpeq_epi64, which is introduced in SSE4.1) that would require us to compensate and probably take a sizeable perf hit. Differential Revision: https://phabricator.services.mozilla.com/D152297 --- mfbt/SIMD.cpp | 286 ++++++++++++++++++++++------------------ mfbt/SIMD.h | 5 + mfbt/tests/TestSIMD.cpp | 130 ++++++++++++++++-- 3 files changed, 282 insertions(+), 139 deletions(-) diff --git a/mfbt/SIMD.cpp b/mfbt/SIMD.cpp index 9d2bdffb7b28..1f9c5217f0a8 100644 --- a/mfbt/SIMD.cpp +++ b/mfbt/SIMD.cpp @@ -14,6 +14,19 @@ namespace mozilla { +template +const TValue* FindInBufferNaive(const TValue* ptr, TValue value, + size_t length) { + const TValue* end = ptr + length; + while (ptr < end) { + if (*ptr == value) { + return ptr; + } + ptr++; + } + return nullptr; +} + #ifdef MOZILLA_PRESUME_SSE2 # include @@ -41,22 +54,27 @@ uintptr_t AlignDown32(uintptr_t ptr) { return ptr & ~0x1f; } uintptr_t AlignUp32(uintptr_t ptr) { return AlignDown32(ptr + 0x1f); } -template +template __m128i CmpEq128(__m128i a, __m128i b) { - static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2); - if (sizeof(CharType) == 1) { + static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2); + if (sizeof(TValue) == 1) { return _mm_cmpeq_epi8(a, b); } return _mm_cmpeq_epi16(a, b); } -template +template __m256i CmpEq256(__m256i a, __m256i b) { - static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2); - if (sizeof(CharType) == 1) { + static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 || + sizeof(TValue) == 8); + if (sizeof(TValue) == 1) { return _mm256_cmpeq_epi8(a, b); } - return _mm256_cmpeq_epi16(a, b); + if (sizeof(TValue) == 2) { + return _mm256_cmpeq_epi16(a, b); + } + + return _mm256_cmpeq_epi64(a, b); } # ifdef __GNUC__ @@ -128,17 +146,17 @@ const char* Check4x4Chars(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c, return nullptr; } -template -const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b, +template +const TValue* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c, uintptr_t d) { __m128i haystackA = Load64BitsIntoXMM(a); - __m128i cmpA = CmpEq128(needle, haystackA); + __m128i cmpA = CmpEq128(needle, haystackA); __m128i haystackB = Load64BitsIntoXMM(b); - __m128i cmpB = CmpEq128(needle, haystackB); + __m128i cmpB = CmpEq128(needle, haystackB); __m128i haystackC = Load64BitsIntoXMM(c); - __m128i cmpC = CmpEq128(needle, haystackC); + __m128i cmpC = CmpEq128(needle, haystackC); __m128i haystackD = Load64BitsIntoXMM(d); - __m128i cmpD = CmpEq128(needle, haystackD); + __m128i cmpD = CmpEq128(needle, haystackD); __m128i or_ab = _mm_or_si128(cmpA, cmpB); __m128i or_cd = _mm_or_si128(cmpC, cmpD); __m128i or_abcd = _mm_or_si128(or_ab, or_cd); @@ -147,36 +165,36 @@ const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b, int cmpMask; cmpMask = _mm_movemask_epi8(cmpA); if (cmpMask & 0xff) { - return reinterpret_cast(a + __builtin_ctz(cmpMask)); + return reinterpret_cast(a + __builtin_ctz(cmpMask)); } cmpMask = _mm_movemask_epi8(cmpB); if (cmpMask & 0xff) { - return reinterpret_cast(b + __builtin_ctz(cmpMask)); + return reinterpret_cast(b + __builtin_ctz(cmpMask)); } cmpMask = _mm_movemask_epi8(cmpC); if (cmpMask & 0xff) { - return reinterpret_cast(c + __builtin_ctz(cmpMask)); + return reinterpret_cast(c + __builtin_ctz(cmpMask)); } cmpMask = _mm_movemask_epi8(cmpD); if (cmpMask & 0xff) { - return reinterpret_cast(d + __builtin_ctz(cmpMask)); + return reinterpret_cast(d + __builtin_ctz(cmpMask)); } } return nullptr; } -template -const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b, - uintptr_t c, uintptr_t d) { +template +const TValue* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b, + uintptr_t c, uintptr_t d) { __m128i haystackA = _mm_loadu_si128(Cast128(a)); - __m128i cmpA = CmpEq128(needle, haystackA); + __m128i cmpA = CmpEq128(needle, haystackA); __m128i haystackB = _mm_loadu_si128(Cast128(b)); - __m128i cmpB = CmpEq128(needle, haystackB); + __m128i cmpB = CmpEq128(needle, haystackB); __m128i haystackC = _mm_loadu_si128(Cast128(c)); - __m128i cmpC = CmpEq128(needle, haystackC); + __m128i cmpC = CmpEq128(needle, haystackC); __m128i haystackD = _mm_loadu_si128(Cast128(d)); - __m128i cmpD = CmpEq128(needle, haystackD); + __m128i cmpD = CmpEq128(needle, haystackD); __m128i or_ab = _mm_or_si128(cmpA, cmpB); __m128i or_cd = _mm_or_si128(cmpC, cmpD); __m128i or_abcd = _mm_or_si128(or_ab, or_cd); @@ -185,36 +203,36 @@ const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b, int cmpMask; cmpMask = _mm_movemask_epi8(cmpA); if (cmpMask) { - return reinterpret_cast(a + __builtin_ctz(cmpMask)); + return reinterpret_cast(a + __builtin_ctz(cmpMask)); } cmpMask = _mm_movemask_epi8(cmpB); if (cmpMask) { - return reinterpret_cast(b + __builtin_ctz(cmpMask)); + return reinterpret_cast(b + __builtin_ctz(cmpMask)); } cmpMask = _mm_movemask_epi8(cmpC); if (cmpMask) { - return reinterpret_cast(c + __builtin_ctz(cmpMask)); + return reinterpret_cast(c + __builtin_ctz(cmpMask)); } cmpMask = _mm_movemask_epi8(cmpD); if (cmpMask) { - return reinterpret_cast(d + __builtin_ctz(cmpMask)); + return reinterpret_cast(d + __builtin_ctz(cmpMask)); } } return nullptr; } -template -const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b, - uintptr_t c, uintptr_t d) { +template +const TValue* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b, + uintptr_t c, uintptr_t d) { __m256i haystackA = _mm256_loadu_si256(Cast256(a)); - __m256i cmpA = CmpEq256(needle, haystackA); + __m256i cmpA = CmpEq256(needle, haystackA); __m256i haystackB = _mm256_loadu_si256(Cast256(b)); - __m256i cmpB = CmpEq256(needle, haystackB); + __m256i cmpB = CmpEq256(needle, haystackB); __m256i haystackC = _mm256_loadu_si256(Cast256(c)); - __m256i cmpC = CmpEq256(needle, haystackC); + __m256i cmpC = CmpEq256(needle, haystackC); __m256i haystackD = _mm256_loadu_si256(Cast256(d)); - __m256i cmpD = CmpEq256(needle, haystackD); + __m256i cmpD = CmpEq256(needle, haystackD); __m256i or_ab = _mm256_or_si256(cmpA, cmpB); __m256i or_cd = _mm256_or_si256(cmpC, cmpD); __m256i or_abcd = _mm256_or_si256(or_ab, or_cd); @@ -223,19 +241,19 @@ const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b, int cmpMask; cmpMask = _mm256_movemask_epi8(cmpA); if (cmpMask) { - return reinterpret_cast(a + __builtin_ctz(cmpMask)); + return reinterpret_cast(a + __builtin_ctz(cmpMask)); } cmpMask = _mm256_movemask_epi8(cmpB); if (cmpMask) { - return reinterpret_cast(b + __builtin_ctz(cmpMask)); + return reinterpret_cast(b + __builtin_ctz(cmpMask)); } cmpMask = _mm256_movemask_epi8(cmpC); if (cmpMask) { - return reinterpret_cast(c + __builtin_ctz(cmpMask)); + return reinterpret_cast(c + __builtin_ctz(cmpMask)); } cmpMask = _mm256_movemask_epi8(cmpD); if (cmpMask) { - return reinterpret_cast(d + __builtin_ctz(cmpMask)); + return reinterpret_cast(d + __builtin_ctz(cmpMask)); } } @@ -255,15 +273,15 @@ enum class HaystackOverlap { // the next a's 16-byte chunk is needle2. `overlap` and whether // `carryIn`/`carryOut` are NULL should be knowable at compile time to avoid // branching. -template -const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a, - uintptr_t b, __m128i* carryIn, - __m128i* carryOut, HaystackOverlap overlap) { - const int shiftRightAmount = 16 - sizeof(CharType); - const int shiftLeftAmount = sizeof(CharType); +template +const TValue* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a, + uintptr_t b, __m128i* carryIn, __m128i* carryOut, + HaystackOverlap overlap) { + const int shiftRightAmount = 16 - sizeof(TValue); + const int shiftLeftAmount = sizeof(TValue); __m128i haystackA = _mm_loadu_si128(Cast128(a)); - __m128i cmpA1 = CmpEq128(needle1, haystackA); - __m128i cmpA2 = CmpEq128(needle2, haystackA); + __m128i cmpA1 = CmpEq128(needle1, haystackA); + __m128i cmpA2 = CmpEq128(needle2, haystackA); __m128i cmpA; if (carryIn) { cmpA = _mm_and_si128( @@ -272,8 +290,8 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a, cmpA = _mm_and_si128(_mm_bslli_si128(cmpA1, shiftLeftAmount), cmpA2); } __m128i haystackB = _mm_loadu_si128(Cast128(b)); - __m128i cmpB1 = CmpEq128(needle1, haystackB); - __m128i cmpB2 = CmpEq128(needle2, haystackB); + __m128i cmpB1 = CmpEq128(needle1, haystackB); + __m128i cmpB2 = CmpEq128(needle2, haystackB); __m128i cmpB; if (overlap == HaystackOverlap::Overlapping) { cmpB = _mm_and_si128(_mm_bslli_si128(cmpB1, shiftLeftAmount), cmpB2); @@ -289,13 +307,13 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a, int cmpMask; cmpMask = _mm_movemask_epi8(cmpA); if (cmpMask) { - return reinterpret_cast(a + __builtin_ctz(cmpMask) - - shiftLeftAmount); + return reinterpret_cast(a + __builtin_ctz(cmpMask) - + shiftLeftAmount); } cmpMask = _mm_movemask_epi8(cmpB); if (cmpMask) { - return reinterpret_cast(b + __builtin_ctz(cmpMask) - - shiftLeftAmount); + return reinterpret_cast(b + __builtin_ctz(cmpMask) - + shiftLeftAmount); } } @@ -306,13 +324,12 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a, return nullptr; } -template -const CharType* FindInBuffer(const CharType* ptr, CharType value, - size_t length) { - static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2); - static_assert(std::is_unsigned::value); +template +const TValue* FindInBuffer(const TValue* ptr, TValue value, size_t length) { + static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2); + static_assert(std::is_unsigned::value); uint64_t splat64; - if (sizeof(CharType) == 1) { + if (sizeof(TValue) == 1) { splat64 = 0x0101010101010101llu; } else { splat64 = 0x0001000100010001llu; @@ -323,16 +340,16 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value, int64_t i64_value = *reinterpret_cast(&u64_value); __m128i needle = _mm_set_epi64x(i64_value, i64_value); - size_t numBytes = length * sizeof(CharType); + size_t numBytes = length * sizeof(TValue); uintptr_t cur = reinterpret_cast(ptr); uintptr_t end = cur + numBytes; - if ((sizeof(CharType) > 1 && numBytes < 16) || numBytes < 4) { + if ((sizeof(TValue) > 1 && numBytes < 16) || numBytes < 4) { while (cur < end) { - if (GetAs(cur) == value) { - return reinterpret_cast(cur); + if (GetAs(cur) == value) { + return reinterpret_cast(cur); } - cur += sizeof(CharType); + cur += sizeof(TValue); } return nullptr; } @@ -352,9 +369,9 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value, uintptr_t c = end - 4 - ((numBytes & 8) >> 1); uintptr_t d = end - 4; const char* charResult = Check4x4Chars(needle, a, b, c, d); - // Note: we ensure above that sizeof(CharType) == 1 here, so this is + // Note: we ensure above that sizeof(TValue) == 1 here, so this is // either char to char or char to something like a uint8_t. - return reinterpret_cast(charResult); + return reinterpret_cast(charResult); } if (numBytes < 64) { @@ -364,17 +381,17 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value, uintptr_t b = cur + ((numBytes & 32) >> 1); uintptr_t c = end - 16 - ((numBytes & 32) >> 1); uintptr_t d = end - 16; - return Check4x16Bytes(needle, a, b, c, d); + return Check4x16Bytes(needle, a, b, c, d); } // Get the initial unaligned load out of the way. This will overlap with the // aligned stuff below, but the overlapped part should effectively be free // (relative to a mispredict from doing a byte-by-byte loop). __m128i haystack = _mm_loadu_si128(Cast128(cur)); - __m128i cmp = CmpEq128(needle, haystack); + __m128i cmp = CmpEq128(needle, haystack); int cmpMask = _mm_movemask_epi8(cmp); if (cmpMask) { - return reinterpret_cast(cur + __builtin_ctz(cmpMask)); + return reinterpret_cast(cur + __builtin_ctz(cmpMask)); } // Now we're working with aligned memory. Hooray! \o/ @@ -391,7 +408,7 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value, uintptr_t b = cur + 16; uintptr_t c = cur + 32; uintptr_t d = cur + 48; - const CharType* result = Check4x16Bytes(needle, a, b, c, d); + const TValue* result = Check4x16Bytes(needle, a, b, c, d); if (result) { return result; } @@ -402,49 +419,53 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value, uintptr_t b = tailStartPtr + 16; uintptr_t c = tailStartPtr + 32; uintptr_t d = tailEndPtr; - return Check4x16Bytes(needle, a, b, c, d); + return Check4x16Bytes(needle, a, b, c, d); } -template -const CharType* FindInBufferAVX2(const CharType* ptr, CharType value, - size_t length) { - static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2); - static_assert(std::is_unsigned::value); +template +const TValue* FindInBufferAVX2(const TValue* ptr, TValue value, size_t length) { + static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 || + sizeof(TValue) == 8); + static_assert(std::is_unsigned::value); // Load our needle into a 32-byte register __m256i needle; - if (sizeof(CharType) == 1) { + if (sizeof(TValue) == 1) { needle = _mm256_set1_epi8(value); - } else { + } else if (sizeof(TValue) == 2) { needle = _mm256_set1_epi16(value); + } else { + needle = _mm256_set1_epi64x(value); } - size_t numBytes = length * sizeof(CharType); + size_t numBytes = length * sizeof(TValue); uintptr_t cur = reinterpret_cast(ptr); uintptr_t end = cur + numBytes; - if (numBytes < 8) { + if (numBytes < 8 || (sizeof(TValue) == 8 && numBytes < 32)) { while (cur < end) { - if (GetAs(cur) == value) { - return reinterpret_cast(cur); + if (GetAs(cur) == value) { + return reinterpret_cast(cur); } - cur += sizeof(CharType); + cur += sizeof(TValue); } return nullptr; } - if (numBytes < 32) { - __m128i needle_narrow; - if (sizeof(CharType) == 1) { - needle_narrow = _mm_set1_epi8(value); - } else { - needle_narrow = _mm_set1_epi16(value); + if constexpr (sizeof(TValue) != 8) { + if (numBytes < 32) { + __m128i needle_narrow; + if (sizeof(TValue) == 1) { + needle_narrow = _mm_set1_epi8(value); + } else { + needle_narrow = _mm_set1_epi16(value); + } + uintptr_t a = cur; + uintptr_t b = cur + ((numBytes & 16) >> 1); + uintptr_t c = end - 8 - ((numBytes & 16) >> 1); + uintptr_t d = end - 8; + return Check4x8Bytes(needle_narrow, a, b, c, d); } - uintptr_t a = cur; - uintptr_t b = cur + ((numBytes & 16) >> 1); - uintptr_t c = end - 8 - ((numBytes & 16) >> 1); - uintptr_t d = end - 8; - return Check4x8Bytes(needle_narrow, a, b, c, d); } if (numBytes < 128) { @@ -454,17 +475,17 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value, uintptr_t b = cur + ((numBytes & 64) >> 1); uintptr_t c = end - 32 - ((numBytes & 64) >> 1); uintptr_t d = end - 32; - return Check4x32Bytes(needle, a, b, c, d); + return Check4x32Bytes(needle, a, b, c, d); } // Get the initial unaligned load out of the way. This will overlap with the // aligned stuff below, but the overlapped part should effectively be free // (relative to a mispredict from doing a byte-by-byte loop). __m256i haystack = _mm256_loadu_si256(Cast256(cur)); - __m256i cmp = CmpEq256(needle, haystack); + __m256i cmp = CmpEq256(needle, haystack); int cmpMask = _mm256_movemask_epi8(cmp); if (cmpMask) { - return reinterpret_cast(cur + __builtin_ctz(cmpMask)); + return reinterpret_cast(cur + __builtin_ctz(cmpMask)); } // Now we're working with aligned memory. Hooray! \o/ @@ -478,7 +499,7 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value, uintptr_t b = cur + 32; uintptr_t c = cur + 64; uintptr_t d = cur + 96; - const CharType* result = Check4x32Bytes(needle, a, b, c, d); + const TValue* result = Check4x32Bytes(needle, a, b, c, d); if (result) { return result; } @@ -489,12 +510,11 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value, uintptr_t b = tailStartPtr + 32; uintptr_t c = tailStartPtr + 64; uintptr_t d = tailEndPtr; - return Check4x32Bytes(needle, a, b, c, d); + return Check4x32Bytes(needle, a, b, c, d); } -template -const CharType* TwoByteLoop(uintptr_t start, uintptr_t end, CharType v1, - CharType v2); +template +const TValue* TwoByteLoop(uintptr_t start, uintptr_t end, TValue v1, TValue v2); template <> const unsigned char* TwoByteLoop(uintptr_t start, uintptr_t end, @@ -533,13 +553,13 @@ const char16_t* TwoByteLoop(uintptr_t start, uintptr_t end, return nullptr; } -template -const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2, - size_t length) { - static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2); - static_assert(std::is_unsigned::value); +template +const TValue* FindTwoInBuffer(const TValue* ptr, TValue v1, TValue v2, + size_t length) { + static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2); + static_assert(std::is_unsigned::value); uint64_t splat64; - if (sizeof(CharType) == 1) { + if (sizeof(TValue) == 1) { splat64 = 0x0101010101010101llu; } else { splat64 = 0x0001000100010001llu; @@ -553,33 +573,33 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2, int64_t i64_v2 = *reinterpret_cast(&u64_v2); __m128i needle2 = _mm_set_epi64x(i64_v2, i64_v2); - size_t numBytes = length * sizeof(CharType); + size_t numBytes = length * sizeof(TValue); uintptr_t cur = reinterpret_cast(ptr); uintptr_t end = cur + numBytes; if (numBytes < 16) { - return TwoByteLoop(cur, end, v1, v2); + return TwoByteLoop(cur, end, v1, v2); } if (numBytes < 32) { uintptr_t a = cur; uintptr_t b = end - 16; - return Check2x2x16Bytes(needle1, needle2, a, b, nullptr, nullptr, - HaystackOverlap::Overlapping); + return Check2x2x16Bytes(needle1, needle2, a, b, nullptr, nullptr, + HaystackOverlap::Overlapping); } // Get the initial unaligned load out of the way. This will likely overlap // with the aligned stuff below, but the overlapped part should effectively // be free. __m128i haystack = _mm_loadu_si128(Cast128(cur)); - __m128i cmp1 = CmpEq128(needle1, haystack); - __m128i cmp2 = CmpEq128(needle2, haystack); + __m128i cmp1 = CmpEq128(needle1, haystack); + __m128i cmp2 = CmpEq128(needle2, haystack); int cmpMask1 = _mm_movemask_epi8(cmp1); int cmpMask2 = _mm_movemask_epi8(cmp2); - int cmpMask = (cmpMask1 << sizeof(CharType)) & cmpMask2; + int cmpMask = (cmpMask1 << sizeof(TValue)) & cmpMask2; if (cmpMask) { - return reinterpret_cast(cur + __builtin_ctz(cmpMask) - - sizeof(CharType)); + return reinterpret_cast(cur + __builtin_ctz(cmpMask) - + sizeof(TValue)); } // Now we're working with aligned memory. Hooray! \o/ @@ -595,9 +615,9 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2, while (cur < tailStartPtr) { uintptr_t a = cur; uintptr_t b = cur + 16; - const CharType* result = - Check2x2x16Bytes(needle1, needle2, a, b, &cmpMaskCarry, - &cmpMaskCarry, HaystackOverlap::Sequential); + const TValue* result = + Check2x2x16Bytes(needle1, needle2, a, b, &cmpMaskCarry, + &cmpMaskCarry, HaystackOverlap::Sequential); if (result) { return result; } @@ -609,8 +629,8 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2, cmpMaskCarry = _mm_and_si128(cmpMaskCarry, wideCarry); uintptr_t a = tailStartPtr; uintptr_t b = tailEndPtr; - return Check2x2x16Bytes(needle1, needle2, a, b, &cmpMaskCarry, - nullptr, HaystackOverlap::Overlapping); + return Check2x2x16Bytes(needle1, needle2, a, b, &cmpMaskCarry, + nullptr, HaystackOverlap::Overlapping); } const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) { @@ -647,6 +667,14 @@ const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value, return memchr16SSE2(ptr, value, length); } +const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value, + size_t length) { + if (supports_avx2()) { + return FindInBufferAVX2(ptr, value, length); + } + return FindInBufferNaive(ptr, value, length); +} + const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) { // Signed chars are just really annoying to do bit logic with. Convert to // unsigned at the outermost scope so we don't have to worry about it. @@ -679,14 +707,7 @@ const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) { const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value, size_t length) { - const char16_t* end = ptr + length; - while (ptr < end) { - if (*ptr == value) { - return ptr; - } - ptr++; - } - return nullptr; + return FindInBufferNaive(ptr, value, length); } const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value, @@ -694,6 +715,11 @@ const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value, return memchr16(ptr, value, length); } +const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value, + size_t length) { + return FindInBufferNaive(ptr, value, length); +} + const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) { const char* end = ptr + length - 1; while (ptr < end) { diff --git a/mfbt/SIMD.h b/mfbt/SIMD.h index b3446b33e709..b42bd779c8c1 100644 --- a/mfbt/SIMD.h +++ b/mfbt/SIMD.h @@ -46,6 +46,11 @@ class SIMD { static MFBT_API const char16_t* memchr16SSE2(const char16_t* ptr, char16_t value, size_t length); + // Search through `ptr[0..length]` for the first occurrence of `value` and + // return the pointer to it, or nullptr if it cannot be found. + static MFBT_API const uint64_t* memchr64(const uint64_t* ptr, uint64_t value, + size_t length); + // Search through `ptr[0..length]` for the first occurrence of `v1` which is // immediately followed by `v2` and return the pointer to the occurrence of // `v1`. diff --git a/mfbt/tests/TestSIMD.cpp b/mfbt/tests/TestSIMD.cpp index 5ea718c10ad9..23dc8b01174d 100644 --- a/mfbt/tests/TestSIMD.cpp +++ b/mfbt/tests/TestSIMD.cpp @@ -105,8 +105,8 @@ void TestLongString() { MOZ_RELEASE_ASSERT( SIMD::memchr8SSE2(test, static_cast(i), count - 1) == test + i); } - MOZ_RELEASE_ASSERT(SIMD::memchr8(test, static_cast(count - 1), - count - 1) == nullptr); + MOZ_RELEASE_ASSERT( + SIMD::memchr8(test, static_cast(count - 1), count - 1) == nullptr); } void TestGauntlet() { @@ -124,8 +124,8 @@ void TestGauntlet() { if (j >= k && j < i) { expected = test + j; } - MOZ_RELEASE_ASSERT(SIMD::memchr8(test + k, static_cast(j), - i - k) == expected); + MOZ_RELEASE_ASSERT( + SIMD::memchr8(test + k, static_cast(j), i - k) == expected); MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test + k, static_cast(j), i - k) == expected); } @@ -221,8 +221,8 @@ void TestLongString16() { } for (size_t i = 0; i < count - 1; ++i) { - MOZ_RELEASE_ASSERT(SIMD::memchr16(test, static_cast(i), - count - 1) == test + i); + MOZ_RELEASE_ASSERT( + SIMD::memchr16(test, static_cast(i), count - 1) == test + i); MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, static_cast(i), count - 1) == test + i); } @@ -245,9 +245,8 @@ void TestGauntlet16() { if (j >= k && j < i) { expected = test + j; } - MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k, - static_cast(j), - i - k) == expected); + MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k, static_cast(j), + i - k) == expected); MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test + k, static_cast(j), i - k) == expected); @@ -257,6 +256,113 @@ void TestGauntlet16() { } } +void TestTinyString64() { + const uint64_t test[4] = {0, 1, 2, 3}; + + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 3) == test + 0x0); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 3) == test + 0x1); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 3) == test + 0x2); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 3) == nullptr); +} + +void TestShortString64() { + const uint64_t test[16] = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 15) == test + 0); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 15) == test + 1); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 15) == test + 2); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 15) == test + 3); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 4, 15) == test + 4); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 5, 15) == test + 5); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 6, 15) == test + 6); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 7, 15) == test + 7); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 8, 15) == test + 8); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 15) == test + 9); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 15) == test + 9); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 10, 15) == test + 10); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 11, 15) == test + 11); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 12, 15) == test + 12); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 13, 15) == test + 13); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 14, 15) == test + 14); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 15, 15) == nullptr); +} + +void TestMediumString64() { + const uint64_t test[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 31) == test + 0); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 31) == test + 1); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 31) == test + 2); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 31) == test + 3); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 4, 31) == test + 4); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 5, 31) == test + 5); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 6, 31) == test + 6); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 7, 31) == test + 7); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 8, 31) == test + 8); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 31) == test + 9); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 31) == test + 9); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 10, 31) == test + 10); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 11, 31) == test + 11); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 12, 31) == test + 12); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 13, 31) == test + 13); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 14, 31) == test + 14); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 15, 31) == test + 15); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 16, 31) == test + 16); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 17, 31) == test + 17); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 18, 31) == test + 18); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 19, 31) == test + 19); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 20, 31) == test + 20); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 21, 31) == test + 21); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 22, 31) == test + 22); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 23, 31) == test + 23); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 24, 31) == test + 24); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 25, 31) == test + 25); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 26, 31) == test + 26); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 27, 31) == test + 27); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 28, 31) == test + 28); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 29, 31) == test + 29); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 30, 31) == test + 30); + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 31, 31) == nullptr); +} + +void TestLongString64() { + const size_t count = 256; + uint64_t test[count]; + for (size_t i = 0; i < count; ++i) { + test[i] = i; + } + + for (uint64_t i = 0; i < count - 1; ++i) { + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, i, count - 1) == test + i); + } + MOZ_RELEASE_ASSERT(SIMD::memchr64(test, count - 1, count - 1) == nullptr); +} + +void TestGauntlet64() { + const size_t count = 257; + uint64_t test[count]; + for (size_t i = 0; i < count; ++i) { + test[i] = i; + } + + for (uint64_t i = 0; i < count - 1; ++i) { + for (uint64_t j = 0; j < count - 1; ++j) { + for (uint64_t k = 0; k < count - 1; ++k) { + if (i >= k) { + const uint64_t* expected = nullptr; + if (j >= k && j < i) { + expected = test + j; + } + MOZ_RELEASE_ASSERT(SIMD::memchr64(test + k, j, i - k) == expected); + } + } + } + } +} + void TestTinyString2x8() { const char* test = "012\n"; @@ -498,6 +604,12 @@ int main(void) { TestLongString16(); TestGauntlet16(); + TestTinyString64(); + TestShortString64(); + TestMediumString64(); + TestLongString64(); + TestGauntlet64(); + TestTinyString2x8(); TestShortString2x8(); TestMediumString2x8();