Mirror of https://github.com/FEX-Emu/xxHash.git
Improve aarch64 code. There is no longer a need to disable NEON on GCC 6
This new code is faster and vectorizes properly on GCC 6. Apparently, aarch64 really hates shuffling.
commit 8a08cbc10c
parent d034ce8269

1 changed file: xxh3.h (35 lines changed)
--- a/xxh3.h
+++ b/xxh3.h
@@ -49,11 +49,6 @@
 #  define XXH_VECTOR XXH_AVX2
 # elif defined(__SSE2__)
 #  define XXH_VECTOR XXH_SSE2
-/* GCC < 7 for aarch64 generates unreasonably slow code for the NEON
- * implementation. We fall back to the scalar version and emit a warning. */
-# elif defined(__aarch64__) && !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 7
-#  warning Your GCC version has broken NEON support. Please use GCC 7+ or Clang.
-#  define XXH_VECTOR XXH_SCALAR
 /* msvc support maybe later */
 # elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
 #  define XXH_VECTOR XXH_NEON
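To see which path this ladder selects on a given toolchain, a build-time probe is enough. A minimal sketch, assuming xxh3.h's numeric XXH_SCALAR/XXH_SSE2/XXH_AVX2/XXH_NEON constants (the file name probe.c is illustrative):

    /* probe.c -- sketch: report which XXH_VECTOR path xxh3.h picked.
     * Assumes XXH_SCALAR/XXH_SSE2/XXH_AVX2/XXH_NEON are the numeric
     * macros defined earlier in xxh3.h. */
    #include <stdio.h>
    #include "xxh3.h"

    int main(void)
    {
    #if XXH_VECTOR == XXH_NEON
        puts("XXH_VECTOR = NEON");
    #elif XXH_VECTOR == XXH_AVX2
        puts("XXH_VECTOR = AVX2");
    #elif XXH_VECTOR == XXH_SSE2
        puts("XXH_VECTOR = SSE2");
    #else
        puts("XXH_VECTOR = scalar");
    #endif
        return 0;
    }

With GCC 6 on aarch64, the removed branch above would previously have forced the scalar answer here; after this commit the NEON path is selected instead.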
@@ -320,23 +315,26 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict key)
      * | v8.2s (.val[0]) | <zero>          | v9.2s (.val[1]) | <zero>          |
      * '-----------------'-----------------'-----------------'-----------------'
      * On aarch64, ld2 will put it into v8.2s and v9.2s. Reinterpreting
-     * is not going to help us here, as half of it will end up being zero. */
+     * is not going to help us here, as half of it will end up being zero.
+     *
+     * Even if it did, aarch64 apparently does really bad with shuffling, so
+     * we use a different method. */

             uint32x2x2_t d = vld2_u32(xdata + i * 4);     /* load and swap */
             uint32x2x2_t k = vld2_u32(xkey + i * 4);
             /* Not sorry about breaking the strict aliasing rule.
              * Using a union causes GCC to spit out nonsense, but an alias cast
              * does not. */
-            uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k);
-            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));
+            uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k); /* dk = d + k */
+            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));   /* xacc[i] += (U64)dkLo * (U64)dkHi; */
 #else
-            /* Portable, but slightly slower version */
-            uint32x2x2_t const d = vld2_u32(xdata + i * 4);
-            uint32x2x2_t const k = vld2_u32(xkey + i * 4);
-            uint32x2_t const dkL = vadd_u32(d.val[0], k.val[0]);
-            uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]);  /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */
-            /* xacc must be aligned on 16 bytes boundaries */
-            xacc[i] = vmlal_u32(xacc[i], dkL, dkH);               /* uint64 res[2] = {dk0*dk1, dk2*dk3} */
+            /* A portable and aarch64-friendly version. It is slower on ARMv7a, though. */
+            uint32x4_t d = vld1q_u32(xdata + i * 4);
+            uint32x4_t k = vld1q_u32(xkey + i * 4);
+            /* Add d and k, then reinterpret to a uint64x2_t. This is not a long add. */
+            uint64x2_t dk = vreinterpretq_u64_u32(vaddq_u32(d, k));           /* dk = ((U64)(d[1] + k[1]) << 32) | (d[0] + k[0]); */
+            /* Long multiply high and low bits. */
+            xacc[i] = vmlal_u32(xacc[i], vmovn_u64(dk), vshrn_n_u64(dk, 32)); /* xacc[i] += (dk & 0xFFFFFFFF) * (dk >> 32); */
 #endif
         }
     }
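For readers without the intrinsics memorized, here is a scalar sketch of what the new path computes per 64-bit accumulator lane; the function and parameter names are illustrative, not from xxh3.h:

    #include <stdint.h>

    /* Sketch of one 64-bit lane of the new accumulate step:
     * vaddq_u32 adds data and key in wrapping 32-bit halves (not a long add),
     * vmovn_u64/vshrn_n_u64 split each 64-bit lane back into those halves,
     * and vmlal_u32 accumulates their widening 32x32->64 product. */
    static uint64_t accumulate_lane(uint64_t acc, uint64_t data, uint64_t key)
    {
        uint32_t dkLo = (uint32_t)data + (uint32_t)key;                  /* low half  */
        uint32_t dkHi = (uint32_t)(data >> 32) + (uint32_t)(key >> 32);  /* high half */
        return acc + (uint64_t)dkLo * (uint64_t)dkHi;                    /* vmlal_u32 */
    }

Both the ARMv7a and aarch64 variants compute this same per-pair result; they differ only in how the 32-bit halves are arranged in registers, which is what lets the aarch64 version skip the deinterleaving shuffle.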
@@ -424,6 +422,12 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
             data = veorq_u64(data, xor_p5);

             {
+#ifdef __aarch64__
+                /* aarch64 prefers this method, ARMv7a prefers the other. */
+                uint64x2_t k = *(uint64x2_t *)(xkey + i * 4);
+                uint64x2_t const dk  = vmull_u32(vmovn_u64(data), vmovn_u64(k));
+                uint64x2_t const dk2 = vmull_u32(vshrn_n_u64(data, 32), vshrn_n_u64(k, 32));
+#else
                 /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */
                 uint32x2x2_t const d =
                     vzip_u32(
@@ -433,6 +437,7 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
                 uint32x2x2_t const k = vld2_u32 (xkey+i*4);           /* load and swap */
                 uint64x2_t const dk  = vmull_u32(d.val[0], k.val[0]); /* U64 dk[2]  = {d0 * k0, d2 * k2} */
                 uint64x2_t const dk2 = vmull_u32(d.val[1], k.val[1]); /* U64 dk2[2] = {d1 * k1, d3 * k3} */
+#endif
                 xacc[i] = veorq_u64(dk, dk2); /* xacc[i] = dk ^ dk2; */
         }   }
     }
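And a matching scalar sketch of the scramble step, per 64-bit lane, where data stands for the lane value after the veorq_u64 XOR above (names are again illustrative):

    #include <stdint.h>

    /* Sketch of one 64-bit lane of the scramble step: both NEON variants
     * take full 32x32->64 products of the matching halves of data and key
     * (vmull_u32) and XOR the two products together (veorq_u64). */
    static uint64_t scramble_lane(uint64_t data, uint64_t key)
    {
        uint64_t dk  = (data & 0xFFFFFFFFULL) * (key & 0xFFFFFFFFULL); /* low halves  */
        uint64_t dk2 = (data >> 32)           * (key >> 32);           /* high halves */
        return dk ^ dk2;                                               /* veorq_u64   */
    }

On aarch64 the halves come straight from vmovn_u64/vshrn_n_u64; the ARMv7a path assembles the same pairs through the vzip_u32 shuffle, which is exactly the kind of shuffle this commit avoids on aarch64.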