mirror of
https://github.com/FEX-Emu/xxHash.git
synced 2025-02-03 21:22:43 +00:00
tested new 64bits accumulator
disguised as xxh128
This commit is contained in:
parent
095c978912
commit
3622b1496e
142
xxh3.h
142
xxh3.h
@ -376,9 +376,9 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
|
||||
__m256i const d = _mm256_loadu_si256 (xdata+i);
|
||||
__m256i const k = _mm256_loadu_si256 (xkey+i);
|
||||
__m256i const dk = _mm256_xor_si256 (d,k); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
|
||||
__m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31)); /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */
|
||||
__m256i const mul = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
|
||||
__m256i const add = _mm256_add_epi64(d, xacc[i]);
|
||||
xacc[i] = _mm256_add_epi64(res, add);
|
||||
xacc[i] = _mm256_add_epi64(mul, add);
|
||||
}
|
||||
}
|
||||
|
||||
@ -394,9 +394,9 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
|
||||
__m128i const d = _mm_loadu_si128 (xdata+i);
|
||||
__m128i const k = _mm_loadu_si128 (xkey+i);
|
||||
__m128i const dk = _mm_xor_si128 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */
|
||||
__m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31)); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
|
||||
__m128i const mul = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31)); /* uint64 mul[2] = {dk0*dk1,dk2*dk3} */
|
||||
__m128i const add = _mm_add_epi64(d, xacc[i]);
|
||||
xacc[i] = _mm_add_epi64(res, add);
|
||||
xacc[i] = _mm_add_epi64(mul, add);
|
||||
}
|
||||
}
|
||||
|
||||
@ -777,10 +777,136 @@ XXH3_len_0to16_128b(const void* data, size_t len, XXH64_hash_t seed)
|
||||
XXH_FORCE_INLINE void
|
||||
XXH3_accumulate128_512bits(void* acc, const void* restrict data, const void* restrict key)
|
||||
{
|
||||
#if 1 // (XXH_VECTOR == XXH_SSE2)
|
||||
|
||||
#if 0
|
||||
assert(((size_t)acc) & 15 == 0);
|
||||
{ ALIGN(16) __m128i* const xacc = (__m128i*) acc;
|
||||
const __m128i* const xdata = (const __m128i *) data;
|
||||
const __m128i* const xkey = (const __m128i *) key;
|
||||
|
||||
const __m128i k1 = _mm_set1_epi32((int)PRIME32_1);
|
||||
const __m128i k2 = _mm_set1_epi32((int)PRIME32_2);
|
||||
|
||||
size_t i;
|
||||
for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
|
||||
__m128i const d = _mm_loadu_si128 (xdata+i);
|
||||
__m128i const k = _mm_loadu_si128 (xkey+i);
|
||||
__m128i const dk = _mm_xor_si128 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */
|
||||
|
||||
__m128i Vacc = xacc[i];
|
||||
|
||||
Vacc = _mm_add_epi64(Vacc, dk);
|
||||
Vacc = _mm_xor_si128(Vacc, _mm_srli_epi64(Vacc, 43)); /* Vacc ^= (Vacc >> 43); */
|
||||
|
||||
{ __m128i const lok1 = _mm_mul_epu32 (Vacc, k1);
|
||||
__m128i const hi = _mm_shuffle_epi32 (Vacc, 0x31);
|
||||
__m128i const hik1 = _mm_mul_epu32 (hi, k1);
|
||||
__m128i const hi64 = _mm_slli_epi64(hik1, 32);
|
||||
Vacc = _mm_add_epi64(lok1, hi64);
|
||||
}
|
||||
|
||||
Vacc = _mm_add_epi64(Vacc, _mm_shuffle_epi32(dk, _MM_SHUFFLE(1,0,3,2)) );
|
||||
Vacc = _mm_xor_si128(Vacc, _mm_srli_epi64(Vacc, 37)); /* Vacc ^= (Vacc >> 37); */
|
||||
|
||||
{ __m128i const lok2 = _mm_mul_epu32 (Vacc, k2);
|
||||
__m128i const hi = _mm_shuffle_epi32 (Vacc, 0x31);
|
||||
__m128i const hik2 = _mm_mul_epu32 (hi, k2);
|
||||
__m128i const hi64 = _mm_slli_epi64(hik2, 32);
|
||||
Vacc = _mm_add_epi64(lok2, hi64);
|
||||
}
|
||||
|
||||
xacc[i] = Vacc;
|
||||
}
|
||||
}
|
||||
|
||||
#elif 1 // SSE2
|
||||
|
||||
// merged, then mix
|
||||
// note : actually a 64-bits mixer
|
||||
|
||||
assert(((size_t)acc) & 15 == 0);
|
||||
{ ALIGN(16) __m128i* const xacc = (__m128i *) acc;
|
||||
const __m128i* const xdata = (const __m128i *) data;
|
||||
const __m128i* const xkey = (const __m128i *) key;
|
||||
|
||||
const __m128i k1 = _mm_set1_epi32((int)(PRIME32_1-1));
|
||||
|
||||
size_t i;
|
||||
for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
|
||||
__m128i const d = _mm_loadu_si128 (xdata+i);
|
||||
__m128i const k = _mm_loadu_si128 (xkey+i);
|
||||
__m128i const dk = _mm_xor_si128 (d,k);
|
||||
|
||||
__m128i const Vacc = _mm_add_epi64(xacc[i], dk);
|
||||
|
||||
__m128i const shifted = _mm_srli_epi64(Vacc, 32);
|
||||
__m128i const xored = _mm_xor_si128 (Vacc, shifted);
|
||||
|
||||
__m128i const mul = _mm_mul_epu32 (xored, k1);
|
||||
__m128i const muladd = _mm_add_epi64(xored, mul);
|
||||
|
||||
xacc[i] = muladd;
|
||||
}
|
||||
}
|
||||
|
||||
#elif 1 // SSE2
|
||||
|
||||
// independent mixing, then merge
|
||||
|
||||
assert(((size_t)acc) & 15 == 0);
|
||||
{ ALIGN(16) __m128i* const xacc = (__m128i *) acc;
|
||||
const __m128i* const xdata = (const __m128i *) data;
|
||||
const __m128i* const xkey = (const __m128i *) key;
|
||||
|
||||
const __m128i k1 = _mm_set1_epi32((int)(PRIME32_1-1));
|
||||
|
||||
size_t i;
|
||||
for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
|
||||
__m128i const d = _mm_loadu_si128 (xdata+i);
|
||||
__m128i const k = _mm_loadu_si128 (xkey+i);
|
||||
__m128i const dk = _mm_xor_si128 (d,k);
|
||||
|
||||
__m128i const shifted = _mm_srli_epi64(dk, 32);
|
||||
__m128i const xored = _mm_xor_si128 (dk,shifted);
|
||||
|
||||
__m128i const mul = _mm_mul_epu32 (xored, k1); /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
|
||||
__m128i const muladd = _mm_add_epi64(xored, mul);
|
||||
|
||||
xacc[i] = _mm_add_epi64(xacc[i], muladd);
|
||||
}
|
||||
}
|
||||
|
||||
#else // SSE2
|
||||
|
||||
assert(((size_t)acc) & 15 == 0);
|
||||
{ ALIGN(16) __m128i* const xacc = (__m128i *) acc;
|
||||
const __m128i* const xdata = (const __m128i *) data;
|
||||
const __m128i* const xkey = (const __m128i *) key;
|
||||
|
||||
size_t i;
|
||||
for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
|
||||
__m128i const d = _mm_loadu_si128 (xdata+i);
|
||||
__m128i const k = _mm_loadu_si128 (xkey+i);
|
||||
__m128i const dk = _mm_xor_si128 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */
|
||||
__m128i const mul = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31)); /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
|
||||
__m128i const add = _mm_add_epi64(d, xacc[i]);
|
||||
__m128i const h1 = _mm_srli_epi64(mul, 32);
|
||||
__m128i const h3 = _mm_shuffle_epi32(mul, _MM_SHUFFLE(0,1,0,3));
|
||||
__m128i const r64 = _mm_add_epi64(mul, add);
|
||||
__m128i const mask= _mm_mul_epu32(h1, h3);
|
||||
xacc[i] = _mm_xor_si128(r64, mask);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
/* scalar variant of Accumulator - universal */
|
||||
U64* const xacc = (U64*) acc; /* presumed aligned */
|
||||
const U64* const xdata = (const U64*) data; /* not necessarily aligned */
|
||||
const U64* const xkey = (const U64*) key; /* presumed aligned */
|
||||
U64* const xacc = (U64*) acc; /* presumed aligned */
|
||||
const U64* const xdata = (const U64*) data; /* not necessarily aligned */
|
||||
const U64* const xkey = (const U64*) key; /* presumed aligned */
|
||||
|
||||
int i;
|
||||
for (i=0; i < (int)ACC_NB; i+=2) {
|
||||
@ -805,6 +931,8 @@ XXH3_accumulate128_512bits(void* acc, const void* restrict data, const void* res
|
||||
xacc[left] *= PRIME32_2;
|
||||
xacc[right] *= PRIME32_2;
|
||||
}
|
||||
|
||||
#endif /* vect arch */
|
||||
}
|
||||
|
||||
static void XXH3_accumulate128(U64* acc, const void* restrict data, const U32* restrict key, size_t nbStripes)
|
||||
|
12
xxhsum.c
12
xxhsum.c
@ -340,6 +340,8 @@ static U32 localXXH64(const void* buffer, size_t bufferSize, U32 seed) { return
|
||||
|
||||
static U32 localXXH3_64b(const void* buffer, size_t bufferSize, U32 seed) { (void)seed; return (U32)XXH3_64bits(buffer, bufferSize); }
|
||||
|
||||
static U32 localXXH128(const void* buffer, size_t bufferSize, U32 seed) { return (U32)(XXH128(buffer, bufferSize, seed).low64); }
|
||||
|
||||
static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, size_t bufferSize)
|
||||
{
|
||||
U32 nbh_perIteration = (U32)((300 MB) / (bufferSize+1)) + 1; /* first loop conservatively aims for 300 MB/s */
|
||||
@ -422,7 +424,15 @@ static int BMK_benchMem(const void* buffer, size_t bufferSize, U32 specificTest)
|
||||
if ((specificTest==0) | (specificTest==6))
|
||||
BMK_benchHash(localXXH3_64b, "XXH3_64b unaligned", ((const char*)buffer)+3, bufferSize);
|
||||
|
||||
if (specificTest > 6) {
|
||||
/* Bench XXH3 */
|
||||
if ((specificTest==0) | (specificTest==7))
|
||||
BMK_benchHash(localXXH128, "XXH128", buffer, bufferSize);
|
||||
|
||||
/* Bench XXH3 on Unaligned input */
|
||||
if ((specificTest==0) | (specificTest==8))
|
||||
BMK_benchHash(localXXH128, "XXH128 unaligned", ((const char*)buffer)+3, bufferSize);
|
||||
|
||||
if (specificTest > 8) {
|
||||
DISPLAY("Benchmark mode invalid.\n");
|
||||
return 1;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user