Improve aarch64 code. There is no longer any need to disable NEON on GCC 6

This new code is faster and vectorizes properly on GCC 6.

Apparently, aarch64 really hates shuffling.
easyaspi314 (Devin) 2019-03-02 19:25:31 -05:00
parent d034ce8269
commit 8a08cbc10c

xxh3.h (35 changed lines)

@@ -49,11 +49,6 @@
 # define XXH_VECTOR XXH_AVX2
 # elif defined(__SSE2__)
 # define XXH_VECTOR XXH_SSE2
-/* GCC < 7 for aarch64 generates unreasonably slow code for the NEON
- * implementation. We fall back to the scalar version and emit a warning. */
-# elif defined(__aarch64__) && !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 7
-# warning Your GCC version has broken NEON support. Please use GCC 7+ or Clang.
-# define XXH_VECTOR XXH_SCALAR
 /* msvc support maybe later */
 # elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
 # define XXH_VECTOR XXH_NEON
@@ -320,23 +315,26 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
  * | v8.2s (.val[0]) | <zero> | v9.2s (.val[1]) | <zero> |
  * '-----------------'----------------'-----------------'-----------------'
  * On aarch64, ld2 will put it into v8.2s and v9.2s. Reinterpreting
- * is not going to help us here, as half of it will end up being zero. */
+ * is not going to help us here, as half of it will end up being zero.
+ *
+ * Even if it did, aarch64 apparently does really bad with shuffling, so
+ * we use a different method. */
 uint32x2x2_t d = vld2_u32(xdata + i * 4); /* load and swap */
 uint32x2x2_t k = vld2_u32(xkey + i * 4);
 /* Not sorry about breaking the strict aliasing rule.
  * Using a union causes GCC to spit out nonsense, but an alias cast
  * does not. */
-uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k);
-xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));
+uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k); /* dk = d + k */
+xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk)); /* xacc[i] += (U64)dkLo * (U64)dkHi; */
 #else
-/* Portable, but slightly slower version */
-uint32x2x2_t const d = vld2_u32(xdata + i * 4);
-uint32x2x2_t const k = vld2_u32(xkey + i * 4);
-uint32x2_t const dkL = vadd_u32(d.val[0], k.val[0]);
-uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */
-/* xacc must be aligned on 16 bytes boundaries */
-xacc[i] = vmlal_u32(xacc[i], dkL, dkH); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
+/* A portable and aarch64-friendly version. It is slower on ARMv7a, though. */
+uint32x4_t d = vld1q_u32(xdata + i * 4);
+uint32x4_t k = vld1q_u32(xkey + i * 4);
+/* Add d and k, then reinterpret to a uint64x2_t. This is not a long add. */
+uint64x2_t dk = vreinterpretq_u64_u32(vaddq_u32(d, k)); /* dk = ((U64)(d[1] + k[1]) << 32) | (d[0] + k[0]); */
+/* Long multiply high and low bits. */
+xacc[i] = vmlal_u32(xacc[i], vmovn_u64(dk), vshrn_n_u64(dk, 32)); /* xacc[i] += (dk & 0xFFFFFFFF) * (dk >> 32); */
 #endif
 }
 }
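
For reference, here is a minimal standalone sketch of the shuffle-free accumulate step from the hunk above, written as plain NEON C. The wrapper function and its parameter names are illustrative and not part of xxh3.h; it only shows how vmovn_u64/vshrn_n_u64 stand in for the lane shuffle:

    #include <arm_neon.h>
    #include <stdint.h>

    /* One lane of the accumulate step: load four 32-bit words of data and key,
     * add them, and fold the result into a 64-bit accumulator without any
     * cross-lane shuffle. (Sketch only; names are not from xxh3.h.) */
    static void accumulate_lane_sketch(uint64x2_t *acc,
                                       const uint32_t *data,
                                       const uint32_t *key)
    {
        uint32x4_t const d  = vld1q_u32(data);   /* four data words */
        uint32x4_t const k  = vld1q_u32(key);    /* four key words  */
        /* 32-bit add, then view the result as two 64-bit lanes (not a long add). */
        uint64x2_t const dk = vreinterpretq_u64_u32(vaddq_u32(d, k));
        /* vmovn_u64 keeps the low 32 bits of each lane, vshrn_n_u64(.., 32) the
         * high 32 bits, so the widening multiply-accumulate needs no shuffle. */
        *acc = vmlal_u32(*acc, vmovn_u64(dk), vshrn_n_u64(dk, 32));
    }
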
@@ -424,6 +422,12 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
 data = veorq_u64(data, xor_p5);
 {
+#ifdef __aarch64__
+/* aarch64 prefers this method, ARMv7a prefers the other. */
+uint64x2_t k = *(uint64x2_t *)(xkey + i * 4);
+uint64x2_t const dk = vmull_u32(vmovn_u64(data), vmovn_u64(k));
+uint64x2_t const dk2 = vmull_u32(vshrn_n_u64(data, 32), vshrn_n_u64(k, 32));
+#else
 /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */
 uint32x2x2_t const d =
 vzip_u32(
@@ -433,6 +437,7 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
 uint32x2x2_t const k = vld2_u32 (xkey+i*4); /* load and swap */
 uint64x2_t const dk = vmull_u32(d.val[0],k.val[0]); /* U64 dk[2] = {d0 * k0, d2 * k2} */
 uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]); /* U64 dk2[2] = {d1 * k1, d3 * k3} */
+#endif
 xacc[i] = veorq_u64(dk, dk2); /* xacc[i] = dk ^ dk2; */
 } }
 }
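
Likewise, a minimal sketch of the new aarch64 scramble multiply added in the two hunks above. The wrapper name is made up, and the key is loaded with vld1q_u32 plus a reinterpret rather than the alias cast used in the diff:

    #include <arm_neon.h>
    #include <stdint.h>

    /* One lane of the scramble step: multiply the low 32-bit halves and the
     * high 32-bit halves of each 64-bit lane, then XOR the two products.
     * (Sketch only; names are not from xxh3.h.) */
    static uint64x2_t scramble_lane_sketch(uint64x2_t data, const uint32_t *key)
    {
        uint64x2_t const k   = vreinterpretq_u64_u32(vld1q_u32(key));
        uint64x2_t const dk  = vmull_u32(vmovn_u64(data), vmovn_u64(k));             /* lo(data) * lo(key) */
        uint64x2_t const dk2 = vmull_u32(vshrn_n_u64(data, 32), vshrn_n_u64(k, 32)); /* hi(data) * hi(key) */
        return veorq_u64(dk, dk2);                                                   /* dk ^ dk2 */
    }

Here too the halves are extracted with narrowing instructions (vmovn/vshrn) instead of a vzip shuffle, which is the change the commit message credits for the aarch64 speedup.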