/*
   xxHash - Extremely Fast Hash algorithm
   Development source file for `xxh3`
   Copyright (C) 2019-present, Yann Collet.

   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:

       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   You can contact the author at:
   - xxHash source repository: https://github.com/Cyan4973/xxHash
*/
/* Note :
   This file is separated for development purposes.
   It will be integrated into `xxhash.c` when development phase is complete.
*/
# ifndef XXH3_H
# define XXH3_H
/* === Dependencies === */
# undef XXH_INLINE_ALL /* in case it's already defined */
# define XXH_INLINE_ALL
# include "xxhash.h"
# undef NDEBUG /* avoid redefinition */
# define NDEBUG
# include <assert.h>
/* === Compiler versions === */
# if !(defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) /* C99+ */
# define restrict /* disable */
# endif
# if defined(__GNUC__)
# if defined(__SSE2__)
# include <x86intrin.h>
# elif defined(__ARM_NEON__) || defined(__ARM_NEON)
# define inline __inline__ /* clang bug */
# include <arm_neon.h>
# undef inline
# endif
# define ALIGN(n) __attribute__ ((aligned(n)))
# elif defined(_MSC_VER)
# include <intrin.h>
# define ALIGN(n) __declspec(align(n))
# else
# define ALIGN(n) /* disabled */
# endif
/* ==========================================
 * Vectorization detection
 * ========================================== */
# define XXH_SCALAR 0
# define XXH_SSE2 1
# define XXH_AVX2 2
# define XXH_NEON 3
# ifndef XXH_VECTOR /* can be defined on command line */
# if defined(__AVX2__)
# define XXH_VECTOR XXH_AVX2
# elif defined(__SSE2__)
# define XXH_VECTOR XXH_SSE2
/* msvc support maybe later */
# elif defined(__GNUC__) \
    && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
    && defined(__LITTLE_ENDIAN__) /* ARM big endian is a thing */
# define XXH_VECTOR XXH_NEON
# else
# define XXH_VECTOR XXH_SCALAR
# endif
# endif
/* U64 XXH_mult32to64(U32 a, U64 b) { return (U64)a * (U64)b; } */
# ifdef _MSC_VER
# include <intrin.h>
/* MSVC doesn't do a good job with the mull detection. */
# define XXH_mult32to64 __emulu
# else
# define XXH_mult32to64(x, y) ((U64)((x) & 0xFFFFFFFF) * (U64)((y) & 0xFFFFFFFF))
# endif
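/* Illustration : XXH_mult32to64() is a full 32x32->64 multiply. For example,
 * XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001ULL,
 * a product which a plain 32-bit multiplication would truncate. */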
/* ==========================================
 * XXH3 default settings
 * ========================================== */
# define KEYSET_DEFAULT_SIZE 48 /* minimum 32 */
ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = {
    0xb8fe6c39, 0x23a44bbe, 0x7c01812c, 0xf721ad1c,
    0xded46de9, 0x839097db, 0x7240a4a4, 0xb7b3671f,
    0xcb79e64e, 0xccc0e578, 0x825ad07d, 0xccff7221,
    0xb8084674, 0xf743248e, 0xe03590e6, 0x813a264c,
    0x3c2852bb, 0x91c300cb, 0x88d0658b, 0x1b532ea3,
    0x71644897, 0xa20df94e, 0x3819ef46, 0xa9deacd8,
    0xa8fa763f, 0xe39c343f, 0xf9dcbbc7, 0xc70b4f1d,
    0x8a51e04b, 0xcdb45931, 0xc89f7ec9, 0xd9787364,

    0xeac5ac83, 0x34d3ebc3, 0xc581a0ff, 0xfa1363eb,
    0x170ddd51, 0xb7f0da49, 0xd3165526, 0x29d4689e,
    0x2b16be58, 0x7d47a1fc, 0x8ff8b8d1, 0x7ad031ce,
    0x45cb3a8f, 0x95160428, 0xafd7fbca, 0xbb4b407e,
};
# if defined(__GNUC__) && defined(__i386__)
/* GCC is stupid and tries to vectorize this.
 * This tells GCC that it is wrong. */
__attribute__((__target__("no-sse")))
# endif
static U64
XXH3_mul128_fold64(U64 ll1, U64 ll2)
{
# if defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
    __uint128_t lll = (__uint128_t)ll1 * ll2;
    return (U64)lll ^ (U64)(lll >> 64);
# elif defined(_M_X64) || defined(_M_IA64)
# ifndef _MSC_VER
# pragma intrinsic(_umul128)
# endif
    U64 llhigh;
    U64 const lllow = _umul128(ll1, ll2, &llhigh);
    return lllow ^ llhigh;
# elif defined(__aarch64__) && defined(__GNUC__)
    U64 llow;
    U64 llhigh;
    __asm__("umulh %0, %1, %2" : "=r" (llhigh) : "r" (ll1), "r" (ll2));
    __asm__("madd  %0, %1, %2, %3" : "=r" (llow) : "r" (ll1), "r" (ll2), "r" (llhigh));  /* <=================== to be modified => xor instead of add */
    return llow;
    /* Do it out manually on 32-bit.
     * This is a modified, unrolled, widened, and optimized version of the
     * mulqdu routine from Hacker's Delight.
     *
     *   https://www.hackersdelight.org/hdcodetxt/mulqdu.c.txt
     *
     * This was modified to use U32->U64 multiplication instead
     * of U16->U32, to add the high and low values in the end,
     * be endian-independent, and I added a partial assembly
     * implementation for ARM. */
    /* An easy 128-bit folding multiply on ARMv6T2 and ARMv7-A/R can be done with
     * the mighty umaal (Unsigned Multiply Accumulate Accumulate Long), which takes 4 cycles
     * or less, doing a long multiply and adding two 32-bit integers:
     *
     *     void umaal(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm)
     *     {
     *         U64 prodAcc = (U64)Rn * (U64)Rm;
     *         prodAcc += *RdLo;
     *         prodAcc += *RdHi;
     *         *RdLo = prodAcc & 0xFFFFFFFF;
     *         *RdHi = prodAcc >> 32;
     *     }
     *
     * This is compared to umlal, which adds to a single 64-bit integer:
     *
     *     void umlal(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm)
     *     {
     *         U64 prodAcc = (U64)Rn * (U64)Rm;
     *         prodAcc += (*RdLo | ((U64)*RdHi << 32));
     *         *RdLo = prodAcc & 0xFFFFFFFF;
     *         *RdHi = prodAcc >> 32;
     *     }
     *
     * Getting the compiler to emit them is like pulling teeth, and checking
     * for it is annoying because ARMv7-M lacks this instruction. However, it
     * is worth it, because this is an otherwise expensive operation. */

    /* GCC-compatible, ARMv6t2 or ARMv7+, non-M variant, and 32-bit */
# elif defined(__GNUC__) /* GCC-compatible */ \
    && defined(__ARM_ARCH) && !defined(__aarch64__) && !defined(__arm64__) /* 32-bit ARM */ \
    && !defined(__ARM_ARCH_7M__) /* <- Not ARMv7-M */ \
    && !(defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM == 0 && __TARGET_ARCH_THUMB == 4) \
    && (defined(__ARM_ARCH_6T2__) || __ARM_ARCH > 6) /* ARMv6T2 or later */
    U32 w[4] = { 0 };
    U32 u[2] = { (U32)(ll1 >> 32), (U32)ll1 };
    U32 v[2] = { (U32)(ll2 >> 32), (U32)ll2 };
    U32 k;

    /* U64 t = (U64)u[1] * (U64)v[1];
     * w[3] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umull %0, %1, %2, %3"
            : "=r" (w[3]), "=r" (k)
            : "r" (u[1]), "r" (v[1]));

    /* t = (U64)u[0] * (U64)v[1] + w[2] + k;
     * w[2] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umaal %0, %1, %2, %3"
            : "+r" (w[2]), "+r" (k)
            : "r" (u[0]), "r" (v[1]));
    w[1] = k;
    k = 0;

    /* t = (U64)u[1] * (U64)v[0] + w[2] + k;
     * w[2] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umaal %0, %1, %2, %3"
            : "+r" (w[2]), "+r" (k)
            : "r" (u[1]), "r" (v[0]));

    /* t = (U64)u[0] * (U64)v[0] + w[1] + k;
     * w[1] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umaal %0, %1, %2, %3"
            : "+r" (w[1]), "+r" (k)
            : "r" (u[0]), "r" (v[0]));
    w[0] = k;
    return (w[1] | ((U64)w[0] << 32)) ^ (w[3] | ((U64)w[2] << 32));
# else /* Portable scalar version */
/* emulate 64x64->128b multiplication, using four 32x32->64 */
    U32 const h1 = (U32)(ll1 >> 32);
    U32 const h2 = (U32)(ll2 >> 32);
    U32 const l1 = (U32)ll1;
    U32 const l2 = (U32)ll2;

    U64 const llh  = XXH_mult32to64(h1, h2);
    U64 const llm1 = XXH_mult32to64(l1, h2);
    U64 const llm2 = XXH_mult32to64(h1, l2);
    U64 const lll  = XXH_mult32to64(l1, l2);

    U64 const t = lll + (llm1 << 32);
    U64 const carry1 = t < lll;

    U64 const lllow  = t + (llm2 << 32);
    U64 const carry2 = lllow < t;
    U64 const llhigh = llh + (llm1 >> 32) + (llm2 >> 32) + carry1 + carry2;

    return llhigh ^ lllow;
# endif
}
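/* Illustration : apart from the aarch64 inline-assembly path above (which, as flagged
 * in its comment, still adds the high half instead of xoring it), each branch is meant
 * to be equivalent to the following sketch, assuming __uint128_t support :
 *
 *     __uint128_t const product = (__uint128_t)ll1 * ll2;
 *     return (U64)product ^ (U64)(product >> 64);
 */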
static XXH64_hash_t XXH3_avalanche(U64 h64)
{
    h64 ^= h64 >> 37;
    h64 *= PRIME64_3;
    h64 ^= h64 >> 32;
    return h64;
}
/* ==========================================
* Short keys
 * ========================================== */
//#undef XXH_FORCE_INLINE // disable for debug (print target line nb)
//#define XXH_FORCE_INLINE
XXH_FORCE_INLINE U64
XXH3_readKey64(const void* ptr)
{
    assert(((size_t)ptr & 7) == 0);   /* aligned on 8-bytes boundaries */
    if (XXH_CPU_LITTLE_ENDIAN) {
        return *(const U64*)ptr;
    } else {
        const U32* const ptr32 = (const U32*)ptr;
        return (U64)ptr32[0] + (((U64)ptr32[1]) << 32);
    }
}
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    assert(data != NULL);
    assert(len > 0 && len <= 3);
    assert(keyPtr != NULL);
    {   BYTE const c1 = ((const BYTE*)data)[0];
        BYTE const c2 = ((const BYTE*)data)[len >> 1];
        BYTE const c3 = ((const BYTE*)data)[len - 1];
        U32 const combined = ((U32)c1) + (((U32)c2) << 8) + (((U32)c3) << 16) + (((U32)len) << 24);
        U64 const keyed = (U64)combined ^ (XXH3_readKey64(keyPtr) + seed);
        U64 const mixed = keyed * PRIME64_1;
        return XXH3_avalanche(mixed);
}
}
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    assert(data != NULL);
    assert(keyPtr != NULL);
    assert(len >= 4 && len <= 8);
    {   U32 const in1 = XXH_readLE32(data);
        U32 const in2 = XXH_readLE32((const BYTE*)data + len - 4);
        U64 const in64 = in1 + ((U64)in2 << 32);
        U64 const keyed = in64 ^ (XXH3_readKey64(keyPtr) + seed);
        U64 const mix64 = len + XXH3_mul128_fold64(keyed, PRIME64_1);
        return XXH3_avalanche(mix64);
    }
}
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    assert(data != NULL);
    assert(keyPtr != NULL);
    assert(len >= 9 && len <= 16);
    {   const U64* const key64 = (const U64*)keyPtr;
        U64 const ll1 = XXH_readLE64(data) ^ (XXH3_readKey64(key64) + seed);
        U64 const ll2 = XXH_readLE64((const BYTE*)data + len - 8) ^ (XXH3_readKey64(key64 + 1) - seed);
        U64 const acc = len + (ll1 + ll2) + XXH3_mul128_fold64(ll1, ll2);
        return XXH3_avalanche(acc);
}
}
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_0to16_64b(const void* data, size_t len, XXH64_hash_t seed)
{
    assert(data != NULL);
    assert(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_64b(data, len, kKey, seed);
        if (len >= 4) return XXH3_len_4to8_64b(data, len, kKey, seed);
        if (len) return XXH3_len_1to3_64b(data, len, kKey, seed);
        return seed;
}
}
/* === Long Keys === */
# define STRIPE_LEN 64
# define STRIPE_ELTS (STRIPE_LEN / sizeof(U32))
# define ACC_NB (STRIPE_LEN / sizeof(U64))
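/* With STRIPE_LEN == 64 : STRIPE_ELTS == 16 U32 lanes and ACC_NB == 8 U64 accumulators,
 * i.e. each 512-bit input stripe feeds a 512-bit accumulator block. */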
XXH_FORCE_INLINE void
XXH3_accumulate_512(void* acc, const void* restrict data, const void* restrict key)
{
# if (XXH_VECTOR == XXH_AVX2)
    assert((((size_t)acc) & 31) == 0);
    {   ALIGN(32) __m256i* const xacc = (__m256i*)acc;
        const __m256i* const xdata = (const __m256i*)data;
        const __m256i* const xkey  = (const __m256i*)key;

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m256i); i++) {
            __m256i const d   = _mm256_loadu_si256(xdata + i);
            __m256i const k   = _mm256_loadu_si256(xkey + i);
            __m256i const dk  = _mm256_xor_si256(d, k);                                /* uint32 dk[8]  = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
            __m256i const mul = _mm256_mul_epu32(dk, _mm256_shuffle_epi32(dk, 0x31));  /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
            __m256i const add = _mm256_add_epi64(d, xacc[i]);
            xacc[i] = _mm256_add_epi64(mul, add);
}
}
# elif (XXH_VECTOR == XXH_SSE2)
    assert((((size_t)acc) & 15) == 0);
    {   ALIGN(16) __m128i* const xacc = (__m128i*)acc;
        const __m128i* const xdata = (const __m128i*)data;
        const __m128i* const xkey  = (const __m128i*)key;

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m128i); i++) {
            __m128i const d   = _mm_loadu_si128(xdata + i);
            __m128i const k   = _mm_loadu_si128(xkey + i);
            __m128i const dk  = _mm_xor_si128(d, k);                             /* uint32 dk[4]  = {d0^k0, d1^k1, d2^k2, d3^k3} */
            __m128i const mul = _mm_mul_epu32(dk, _mm_shuffle_epi32(dk, 0x31));  /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
            __m128i const add = _mm_add_epi64(d, xacc[i]);
            xacc[i] = _mm_add_epi64(mul, add);
}
}
# elif (XXH_VECTOR == XXH_NEON) /* to be updated, no longer with latest sse/avx updates */
    assert((((size_t)acc) & 15) == 0);
    {   uint64x2_t* const xacc = (uint64x2_t*)acc;
        const uint32_t* const xdata = (const uint32_t*)data;
        const uint32_t* const xkey  = (const uint32_t*)key;

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
            uint32x4_t const d = vld1q_u32(xdata + i * 4);   /* U32 d[4] = xdata[i]; */
            uint32x4_t const k = vld1q_u32(xkey + i * 4);    /* U32 k[4] = xkey[i];  */
            uint32x4_t dk = veorq_u32(d, k);                 /* U32 dk[4] = {d0^k0, d1^k1, d2^k2, d3^k3} */
#if !defined(__aarch64__) && !defined(__arm64__) /* ARM32-specific hack */
            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
             * a lot.
             * aarch64 not only uses an entirely different syntax, but it requires three
             * instructions...
             *     ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
             *     zip1   v3.2s, v0.2s, v1.2s   // first zip
             *     zip2   v2.2s, v0.2s, v1.2s   // second zip
             * ...to do what ARM does in one:
             *     vzip.32 d0, d1               // Interleave high and low bits and overwrite. */
            __asm__("vzip.32 %e0, %f0" : "+w" (dk));                              /* dk = { dk0, dk2, dk1, dk3 }; */
            xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u32(d));               /* xacc[i] += (U64x2)d; */
            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));    /* xacc[i] += { (U64)dk0*dk1, (U64)dk2*dk3 }; */
#else
            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */
            uint32x2_t dkL = vmovn_u64(vreinterpretq_u64_u32(dk));                /* U32 dkL[2] = dk & 0xFFFFFFFF; */
            uint32x2_t dkH = vshrn_n_u64(vreinterpretq_u64_u32(dk), 32);          /* U32 dkH[2] = dk >> 32;        */
            xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u32(d));               /* xacc[i] += (U64x2)d;          */
            xacc[i] = vmlal_u32(xacc[i], dkL, dkH);                               /* xacc[i] += (U64x2)dkL*(U64x2)dkH; */
# endif
}
}
# else /* scalar variant of Accumulator - universal */
    U64* const xacc = (U64*)acc;    /* presumed aligned */
    const U32* const xdata = (const U32*)data;
    const U32* const xkey  = (const U32*)key;
    int i;
    for (i = 0; i < (int)ACC_NB; i++) {
        int const left  = 2 * i;
        int const right = 2 * i + 1;
        U32 const dataLeft  = XXH_readLE32(xdata + left);
        U32 const dataRight = XXH_readLE32(xdata + right);
        xacc[i] += XXH_mult32to64(dataLeft ^ xkey[left], dataRight ^ xkey[right]);
        xacc[i] += dataLeft + ((U64)dataRight << 32);
}
# endif
}
static void XXH3_scrambleAcc(void* acc, const void* key)
{
# if (XXH_VECTOR == XXH_AVX2)
    assert((((size_t)acc) & 31) == 0);
    {   ALIGN(32) __m256i* const xacc = (__m256i*)acc;
        const __m256i* const xkey = (const __m256i*)key;
        const __m256i k1 = _mm256_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m256i); i++) {
            __m256i data = xacc[i];
            __m256i const shifted = _mm256_srli_epi64(data, 47);
            data = _mm256_xor_si256(data, shifted);

            {   __m256i const k   = _mm256_loadu_si256(xkey + i);
                __m256i const dk  = _mm256_xor_si256(data, k);    /* U32 dk[8] = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */

                __m256i const dk1 = _mm256_mul_epu32(dk, k1);

                __m256i const d2   = _mm256_shuffle_epi32(dk, 0x31);
                __m256i const dk2  = _mm256_mul_epu32(d2, k1);
                __m256i const dk2h = _mm256_slli_epi64(dk2, 32);

                xacc[i] = _mm256_add_epi64(dk1, dk2h);
} }
}
# elif (XXH_VECTOR == XXH_SSE2)
    {   ALIGN(16) __m128i* const xacc = (__m128i*)acc;
        const __m128i* const xkey = (const __m128i*)key;
        const __m128i k1 = _mm_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m128i); i++) {
            __m128i data = xacc[i];
            __m128i const shifted = _mm_srli_epi64(data, 47);
            data = _mm_xor_si128(data, shifted);

            {   __m128i const k   = _mm_loadu_si128(xkey + i);
                __m128i const dk  = _mm_xor_si128(data, k);
                __m128i const dk1 = _mm_mul_epu32(dk, k1);

                __m128i const d2   = _mm_shuffle_epi32(dk, 0x31);
                __m128i const dk2  = _mm_mul_epu32(d2, k1);
                __m128i const dk2h = _mm_slli_epi64(dk2, 32);

                xacc[i] = _mm_add_epi64(dk1, dk2h);
} }
}
# elif 0 && (XXH_VECTOR == XXH_NEON) /* <============================================ Disabled : Needs update !!!!!!!!!!! */
    assert((((size_t)acc) & 15) == 0);
    {   uint64x2_t* const xacc = (uint64x2_t*)acc;
        const uint32_t* const xkey = (const uint32_t*)key;
        size_t i;
        uint32x2_t const k1 = vdup_n_u32(PRIME32_1);
        uint32x2_t const k2 = vdup_n_u32(PRIME32_2);

        for (i = 0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
            uint64x2_t data = xacc[i];
            uint64x2_t const shifted = vshrq_n_u64(data, 47);     /* uint64 shifted[2] = data >> 47; */
            data = veorq_u64(data, shifted);                      /* data ^= shifted; */
            {
                uint32x4_t const k  = vld1q_u32(xkey + i * 4);                    /* load */
                uint32x4_t const dk = veorq_u32(vreinterpretq_u32_u64(data), k);  /* dk = data ^ key */
                /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */
                uint32x2x2_t const split = vzip_u32(vget_low_u32(dk), vget_high_u32(dk));
                uint64x2_t const dk1 = vmull_u32(split.val[0], k1);    /* U64 dk[2]  = {(U64)d0*k0, (U64)d2*k2} */
                uint64x2_t const dk2 = vmull_u32(split.val[1], k2);    /* U64 dk2[2] = {(U64)d1*k1, (U64)d3*k3} */
                xacc[i] = veorq_u64(dk1, dk2);                         /* xacc[i] = dk ^ dk2; */
} }
}
# else /* scalar variant of Scrambler - universal */
    U64* const xacc = (U64*)acc;
    const U64* const xkey = (const U64*)key;
    int i;
    assert((((size_t)acc) & 7) == 0);
    for (i = 0; i < (int)ACC_NB; i++) {
        U64 const key64 = XXH3_readKey64(xkey + i);
        U64 acc64 = xacc[i];
        acc64 ^= acc64 >> 47;
        acc64 ^= key64;
        acc64 *= PRIME32_1;
        xacc[i] = acc64;
    }
# endif
}
static void XXH3_accumulate(U64* restrict acc,
                            const void* restrict data, size_t nbStripes,
                            const U32* restrict key)
{
    size_t n;
    /* Clang doesn't unroll this loop without the pragma. Unrolling can be up to 1.4x faster. */
# if defined(__clang__) && !defined(__OPTIMIZE_SIZE__)
#   pragma clang loop unroll(enable)
# endif
    for (n = 0; n < nbStripes; n++) {
        XXH3_accumulate_512(acc,
                            (const BYTE*)data + n * STRIPE_LEN,
                            key + n * 2);
}
}
static void
XXH3_hashLong(U64* restrict acc,
              const void* restrict data, size_t len)
{
# define NB_STRIPES_PER_ROUND ((KEYSET_DEFAULT_SIZE - STRIPE_ELTS) / 2)
    size_t const block_len = STRIPE_LEN * NB_STRIPES_PER_ROUND;
    size_t const nb_blocks = len / block_len;

    size_t n;
    for (n = 0; n < nb_blocks; n++) {
        XXH3_accumulate(acc, (const BYTE*)data + n * block_len, NB_STRIPES_PER_ROUND, kKey);
        XXH3_scrambleAcc(acc, kKey + (KEYSET_DEFAULT_SIZE - STRIPE_ELTS));
    }

    /* last partial block */
    assert(len > STRIPE_LEN);
    {   size_t const nbStripes = (len % block_len) / STRIPE_LEN;
        assert(nbStripes < NB_STRIPES_PER_ROUND);
        XXH3_accumulate(acc, (const BYTE*)data + nb_blocks * block_len, nbStripes, kKey);

        /* last stripe */
        if (len & (STRIPE_LEN - 1)) {
            const BYTE* const p = (const BYTE*)data + len - STRIPE_LEN;
            XXH3_accumulate_512(acc, p, kKey + nbStripes * 2);
} }
}
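/* Sketch of the layout produced by XXH3_hashLong() with the default settings above :
 * NB_STRIPES_PER_ROUND = (48 - 16) / 2 = 16 stripes, hence block_len = 16 * 64 = 1024 bytes.
 * Each full 1024-byte block is accumulated then scrambled; remaining whole stripes are
 * accumulated as-is, and, when len is not a multiple of 64, one last stripe re-reads the
 * final 64 bytes of input. */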
XXH_FORCE_INLINE U64 XXH3_mix2Accs(const U64* restrict acc, const void* restrict key)
{
    const U64* const key64 = (const U64*)key;
    return XXH3_mul128_fold64(
               acc[0] ^ XXH3_readKey64(key64),
               acc[1] ^ XXH3_readKey64(key64 + 1));
}
static XXH64_hash_t XXH3_mergeAccs(const U64* restrict acc, const U32* restrict key, U64 start)
{
    U64 result64 = start;
    result64 += XXH3_mix2Accs(acc + 0, key + 0);
    result64 += XXH3_mix2Accs(acc + 2, key + 4);
    result64 += XXH3_mix2Accs(acc + 4, key + 8);
    result64 += XXH3_mix2Accs(acc + 6, key + 12);
    return XXH3_avalanche(result64);
}
XXH_FORCE_INLINE void XXH3_initKeySeed(U32* key, U64 seed64)
{
    U32 const seed1 = (U32)seed64;
    U32 const seed2 = (U32)(seed64 >> 32);
    int i;
    assert((KEYSET_DEFAULT_SIZE & 3) == 0);
    for (i = 0; i < KEYSET_DEFAULT_SIZE; i += 4) {
        key[i + 0] = kKey[i + 0] + seed1;
        key[i + 1] = kKey[i + 1] - seed2;
        key[i + 2] = kKey[i + 2] + seed2;
        key[i + 3] = kKey[i + 3] - seed1;
    }
}
XXH_FORCE_INLINE void XXH3_initMultipliers(U32* multipliers, size_t nbMult)
{
    size_t s;
    U32 m = PRIME32_1;
    for (s = 0; s < nbMult; s++) {
        multipliers[s] = m - 1;
        m *= PRIME32_2;
}
}
XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_64b(const void* data, size_t len, XXH64_hash_t seed)
{
    ALIGN(64) U64 acc[ACC_NB] = { seed, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5, (U64)0 - seed, 0 };
    ALIGN(64) U32 key[KEYSET_DEFAULT_SIZE];
    XXH3_initKeySeed(key, seed);

    XXH3_hashLong(acc, data, len);

    /* converge into final hash */
    assert(sizeof(acc) == 64);
    return XXH3_mergeAccs(acc, key, (U64)len * PRIME64_1);
}
XXH_FORCE_INLINE U64 XXH3_mix16B(const void* data, const void* key, U64 seed64)
{
    const U64* const key64 = (const U64*)key;
    U64 const ll1 = XXH_readLE64(data);
    U64 const ll2 = XXH_readLE64((const BYTE*)data + 8);
    return XXH3_mul128_fold64(
               ll1 ^ (XXH3_readKey64(key64)     + seed64),
               ll2 ^ (XXH3_readKey64(key64 + 1) - seed64));
}
/* === Public entry point === */
XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
{
    const BYTE* const p = (const BYTE*)data;
    const char* const key = (const char*)kKey;

    if (len <= 16) return XXH3_len_0to16_64b(data, len, seed);

    {   U64 acc = len * PRIME64_1;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    if (len > 128) return XXH3_hashLong_64b(data, len, seed);

                    acc += XXH3_mix16B(p + 48, key + 96, seed);
                    acc += XXH3_mix16B(p + len - 64, key + 112, seed);
                }

                acc += XXH3_mix16B(p + 32, key + 64, seed);
                acc += XXH3_mix16B(p + len - 48, key + 80, seed);
            }

            acc += XXH3_mix16B(p + 16, key + 32, seed);
            acc += XXH3_mix16B(p + len - 32, key + 48, seed);
        }

        acc += XXH3_mix16B(p + 0, key + 0, seed);
        acc += XXH3_mix16B(p + len - 16, key + 16, seed);

        return XXH3_avalanche(acc);
}
}
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len)
{
    return XXH3_64bits_withSeed(data, len, 0);
}
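/* Usage sketch (`buf` and `bufSize` are caller-side placeholders) :
 *
 *     XXH64_hash_t const h1 = XXH3_64bits(buf, bufSize);               // unseeded
 *     XXH64_hash_t const h2 = XXH3_64bits_withSeed(buf, bufSize, 42);  // seeded variant
 */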
/* ==========================================================
 * XXH3 128 bits (=> XXH128)
 * ========================================================== */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    assert(data != NULL);
    assert(len > 0 && len <= 3);
    assert(keyPtr != NULL);
    {   BYTE const c1 = ((const BYTE*)data)[0];
        BYTE const c2 = ((const BYTE*)data)[len >> 1];
        BYTE const c3 = ((const BYTE*)data)[len - 1];
        U32 const combined = ((U32)c1) + (((U32)c2) << 8) + (((U32)c3) << 16) + (((U32)len) << 24);
        U64 const keyed  = (U64)combined ^ (XXH3_readKey64(keyPtr) + seed);
        U64 const low64  = keyed * PRIME64_1;
        U64 const high64 = keyed * PRIME64_2;
        XXH128_hash_t const h128 = { XXH3_avalanche(low64), XXH3_avalanche(high64) };
        return h128;
}
}
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    assert(data != NULL);
    assert(keyPtr != NULL);
    assert(len >= 4 && len <= 8);
    {   U32 const in1 = XXH_readLE32(data);
        U32 const in2 = XXH_readLE32((const BYTE*)data + len - 4);
        U64 const in64 = in1 + (((U64)in2) << 32);
        U64 const keyed = in64 ^ (XXH3_readKey64(keyPtr) + seed);
        U64 const low64  = XXH3_mul128_fold64(keyed, PRIME64_1) + len;
        U64 const high64 = XXH3_mul128_fold64(keyed, PRIME64_2) - len;
        {   XXH128_hash_t const h128 = { XXH3_avalanche(low64), XXH3_avalanche(high64) };
            return h128;
}
}
}
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_9to16_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    assert(data != NULL);
    assert(keyPtr != NULL);
    assert(len >= 9 && len <= 16);
    {   const U64* const key64 = (const U64*)keyPtr;
        U64 const ll1 = XXH_readLE64(data) ^ (XXH3_readKey64(key64) + seed);
        U64 const ll2 = XXH_readLE64((const BYTE*)data + len - 8) ^ (XXH3_readKey64(key64 + 1) - seed);
        U64 const mix1 = len + (ll1 + ll2) + XXH3_mul128_fold64(ll1, ll2);
        U64 const mix2 = XXH3_mul128_fold64(ll1, PRIME64_1)
                       + XXH3_mul128_fold64(ll2, PRIME64_2)
                       - len;
        {   XXH128_hash_t const h128 = { XXH3_avalanche(mix1), XXH3_avalanche(mix2) };
            return h128;
}
}
}
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_0to16_128b(const void* data, size_t len, XXH64_hash_t seed)
{
    assert(data != NULL);
    assert(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_128b(data, len, kKey, seed);
        if (len >= 4) return XXH3_len_4to8_128b(data, len, kKey, seed);
        if (len) return XXH3_len_1to3_128b(data, len, kKey, seed);
        {   XXH128_hash_t const h128 = { seed, (XXH64_hash_t)0 - seed };
            return h128;
}
}
}
XXH_FORCE_INLINE void
XXH3_accumulate128_512bits(void* restrict acc,
                           const void* restrict data,
                           const void* restrict key,
                           U32 mul1, U32 mul2)
{
    (void)mul1; (void)mul2;
2019-04-30 00:44:45 +00:00
# if (XXH_VECTOR == XXH_AVX2)
2019-04-30 00:44:45 +00:00
// merge, then mix, then interleave
2019-04-30 00:44:45 +00:00
    assert((((size_t)acc) & 31) == 0);
    assert((mul1 & 1) == 0);
    assert((mul2 & 1) == 0);
    {   ALIGN(32) __m256i* const xacc = (__m256i*)acc;
        const __m256i* const xdata = (const __m256i*)data;
        const __m256i* const xkey  = (const __m256i*)key;

        const __m256i k1 = _mm256_set1_epi32((int)(mul1));
        const __m256i k2 = _mm256_set1_epi32((int)(mul2));

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m256i); i++) {
            // ingest and merge
            __m256i const d  = _mm256_loadu_si256(xdata + i);
            __m256i const k  = _mm256_loadu_si256(xkey + i);
            __m256i const dk = _mm256_xor_si256(d, k);
            __m256i Vacc = _mm256_add_epi64(xacc[i], dk);
            // mix 1
            {   __m256i const shifted = _mm256_srli_epi64(Vacc, 32);
                __m256i const xored   = _mm256_xor_si256(Vacc, shifted);
                __m256i const mul     = _mm256_mul_epu32(xored, k1);
                Vacc = _mm256_add_epi64(xored, mul);
}
// shuffle, merge, and mix 2 // ~22.2 GB/s
            {   __m256i const shuffle = _mm256_shuffle_epi32(dk, _MM_SHUFFLE(0, 1, 2, 3));
                __m256i const xored   = _mm256_xor_si256(Vacc, shuffle);
                __m256i const mul     = _mm256_mul_epu32(shuffle, k2);
                Vacc = _mm256_add_epi64(xored, mul);
}
            xacc[i] = Vacc;
}
}
# elif (XXH_VECTOR == XXH_SSE2)
2019-04-30 00:44:45 +00:00
// merge, then mix, then interleave
2019-04-30 00:44:45 +00:00
    assert((((size_t)acc) & 15) == 0);
    {   ALIGN(16) __m128i* const xacc = (__m128i*)acc;
        const __m128i* const xdata = (const __m128i*)data;
        const __m128i* const xkey  = (const __m128i*)key;

        const __m128i k1 = _mm_set1_epi32((int)(mul1));
        const __m128i k2 = _mm_set1_epi32((int)(mul2));

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m128i); i++) {
            // ingest and merge
            __m128i const d  = _mm_loadu_si128(xdata + i);
            __m128i const k  = _mm_loadu_si128(xkey + i);
            __m128i const dk = _mm_xor_si128(d, k);
            __m128i Vacc = _mm_add_epi64(xacc[i], dk);
            // mix 1
            {   __m128i const shifted = _mm_srli_epi64(Vacc, 32);
                __m128i const xored   = _mm_xor_si128(Vacc, shifted);
                __m128i const mul     = _mm_mul_epu32(xored, k1);
                Vacc = _mm_add_epi64(xored, mul);
}
2019-04-30 00:44:45 +00:00
// shuffle, merge and mix 2
            {   __m128i const shuffle = _mm_shuffle_epi32(dk, _MM_SHUFFLE(0, 1, 2, 3));
                __m128i const xored   = _mm_xor_si128(Vacc, shuffle);
                __m128i const mul     = _mm_mul_epu32(shuffle, k2);
                Vacc = _mm_add_epi64(xored, mul);
}
2019-04-30 00:44:45 +00:00
            xacc[i] = Vacc;
}
}
# else
/* scalar variant of Accumulator - universal */
/* merge, then mix, then interleave */
    U64* const xacc = (U64*)acc;                  /* presumed aligned */
    const U64* const xdata = (const U64*)data;    /* not necessarily aligned */
    const U64* const xkey  = (const U64*)key;     /* presumed aligned */
    int i;
    for (i = 0; i < (int)ACC_NB; i += 2) {
        int const left  = i;
        int const right = i + 1;
        U64 const dataLeft  = XXH_readLE64(xdata + left);
        U64 const dataRight = XXH_readLE64(xdata + right);
        U64 const leftKeyed  = dataLeft  ^ xkey[left];
        U64 const rightKeyed = dataRight ^ xkey[right];
        /* merge */
        U64 accLeft  = xacc[left]  + leftKeyed;
        U64 accRight = xacc[right] + rightKeyed;
        /* mix1 */
        accLeft  ^= (accLeft  >> 32);
        accRight ^= (accRight >> 32);
        accLeft  += (U64)(mul1) * (U32)accLeft;
        accRight += (U64)(mul1) * (U32)accRight;
        /* interleave */
        /* note : this operation seems to disable/confuse clang's auto-vectorizer */
        {   U64 const shuffleLeft  = (rightKeyed >> 32) + (rightKeyed << 32);
            U64 const shuffleRight = (leftKeyed  >> 32) + (leftKeyed  << 32);
            accLeft  ^= shuffleLeft;
            accRight ^= shuffleRight;
            accLeft  += (U64)(mul2) * (U32)shuffleLeft;
            accRight += (U64)(mul2) * (U32)shuffleRight;
        }
        xacc[left]  = accLeft;
        xacc[right] = accRight;
}
# endif /* vect arch */
}
static void XXH3_accumulate128(U64* restrict acc,
                               const void* restrict data, size_t nbStripes,
                               const U32* restrict key,
                               const U32* restrict mul)
{
    size_t n;
    /* Clang doesn't unroll this loop without the pragma. Unrolling can be up to 1.4x faster. */
# if defined(__clang__) && !defined(__OPTIMIZE_SIZE__)
#   pragma clang loop unroll(enable)
# endif
    for (n = 0; n < nbStripes; n++) {
        XXH3_accumulate128_512bits(acc,
                                   (const BYTE*)data + n * STRIPE_LEN,
                                   key + n * 2,
                                   mul[2 * n], mul[2 * n + 1]);
}
}
static void
XXH3_hashLong128(U64* restrict acc,
                 const void* restrict data, size_t len,
                 const U32* restrict mul)
{
# define NB_KEYS ((KEYSET_DEFAULT_SIZE - STRIPE_ELTS) / 2)
    size_t const block_len = STRIPE_LEN * NB_KEYS;
    size_t const nb_blocks = len / block_len;
    size_t n;
    for (n = 0; n < nb_blocks; n++) {
        XXH3_accumulate128(acc,
                           (const BYTE*)data + n * block_len, NB_KEYS,
                           kKey,
                           mul);
        XXH3_scrambleAcc(acc, kKey + (KEYSET_DEFAULT_SIZE - STRIPE_ELTS));
    }
    /* last partial block */
    assert(len > STRIPE_LEN);
    {   size_t const nbStripes = (len % block_len) / STRIPE_LEN;
        assert(nbStripes < NB_KEYS);
        XXH3_accumulate128(acc,
                           (const BYTE*)data + nb_blocks * block_len, nbStripes,
                           kKey,
                           mul);
        /* last stripe */
        if (len & (STRIPE_LEN - 1)) {
            const BYTE* const p = (const BYTE*)data + len - STRIPE_LEN;
            XXH3_accumulate128_512bits(acc,
                                       p,
                                       kKey + nbStripes * 2,
                                       PRIME32_4 - 1, PRIME32_5 - 1);
} }
}
XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_128b(const void* data, size_t len, XXH64_hash_t seed)
{
    ALIGN(64) U64 acc[ACC_NB] = { seed, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5, (U64)0 - seed, 0 };
    U32 key[KEYSET_DEFAULT_SIZE];
    U32 mult[NB_STRIPES_PER_ROUND * 2];
    XXH3_initKeySeed(key, seed);
    XXH3_initMultipliers(mult, NB_STRIPES_PER_ROUND * 2);

    assert(len > 128);

    XXH3_hashLong128(acc, data, len, mult);

    /* converge into final hash */
    assert(sizeof(acc) == 64);
    {   U64 const low64  = XXH3_mergeAccs(acc, kKey, (U64)len * PRIME64_1);
        U64 const high64 = XXH3_mergeAccs(acc, kKey + 16, ((U64)len + 1) * PRIME64_2);
        XXH128_hash_t const h128 = { low64, high64 };
        return h128;
}
}
XXH_FORCE_INLINE void XXH3_add16B(XXH128_hash_t* xxh128, const void* data, const void* key, U64 seed64)
{
    const U64* const key64 = (const U64*)key;
    U64 const ll1 = XXH_readLE64(data) ^ (XXH3_readKey64(key64) + seed64);
    U64 const ll2 = XXH_readLE64((const BYTE*)data + 8) ^ (XXH3_readKey64(key64 + 1) - seed64);
    U64 const mix1 = XXH3_mul128_fold64(ll1, PRIME64_1) + XXH3_mul128_fold64(ll2, PRIME64_3);
    U64 const mix2 = XXH3_mul128_fold64(ll1, PRIME64_4) + XXH3_mul128_fold64(ll2, PRIME64_2);
    xxh128->low64  ^= mix1;
    xxh128->high64 ^= mix2;
}
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3_len_0to16_128b(data, len, seed);

    {   XXH128_hash_t xxh128 = { PRIME64_2 * len, - PRIME64_1 * len };
        const BYTE* const p = (const BYTE*)data;
        const char* const key = (const char*)kKey;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    if (len > 128) return XXH3_hashLong_128b(data, len, seed);

                    XXH3_add16B(&xxh128, p + 48, key + 96, seed);
                    XXH3_add16B(&xxh128, p + len - 64, key + 112, seed);
                }

                XXH3_add16B(&xxh128, p + 32, key + 64, seed);
                XXH3_add16B(&xxh128, p + len - 48, key + 80, seed);
            }

            XXH3_add16B(&xxh128, p + 16, key + 32, seed);
            XXH3_add16B(&xxh128, p + len - 32, key + 48, seed);
        }

        XXH3_add16B(&xxh128, p + 0, key + 0, seed);
        XXH3_add16B(&xxh128, p + len - 16, key + 16, seed);

        xxh128.low64  = XXH3_avalanche(xxh128.low64);
        xxh128.high64 = XXH3_avalanche(xxh128.high64);
        return xxh128;
}
}
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len)
{
    return XXH3_128bits_withSeed(data, len, 0);
}
XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_withSeed(data, len, seed);
}
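/* Usage sketch for the 128-bit variant (`buf` and `bufSize` are caller-side placeholders) :
 *
 *     XXH128_hash_t const h = XXH128(buf, bufSize, 0);
 *     // h.low64 and h.high64 together form the 128-bit digest
 */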
# endif /* XXH3_H */