/*
xxHash - Extremely Fast Hash algorithm
Development source file for `xxh3`
Copyright (C) 2019-present, Yann Collet.

BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

    * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

You can contact the author at:
- xxHash source repository: https://github.com/Cyan4973/xxHash
*/
/* Note:
   This file is separated for development purposes.
   It will be integrated into `xxhash.c` when the development phase is complete.
*/
# ifndef XXH3_H
# define XXH3_H
/* === Dependencies === */
# undef XXH_INLINE_ALL /* in case it's already defined */
# define XXH_INLINE_ALL
# include "xxhash.h"
/* === Compiler specifics === */
# if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
# define XXH_RESTRICT restrict
# else
/* note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
# define XXH_RESTRICT /* disable */
# endif
# if defined(__GNUC__)
# if defined(__AVX2__)
# include <immintrin.h>
# elif defined(__SSE2__)
# include <emmintrin.h>
# elif defined(__ARM_NEON__) || defined(__ARM_NEON)
# define inline __inline__ /* clang bug */
# include <arm_neon.h>
# undef inline
# endif
# elif defined(_MSC_VER)
# include <intrin.h>
# endif
/*
 * Sanity check.
 *
 * XXH3 only requires these features to be efficient:
 *
 *  - Usable unaligned access
 *  - A 32-bit or 64-bit ALU
 *      - If 32-bit, a decent ADC instruction
 *  - A 32 or 64-bit multiply with a 64-bit result
 *
 * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
 * classic 16-bit only subset of ARM's instruction set.
 *
 * First of all, Thumb-1 lacks support for the UMULL instruction which
 * performs the important long multiply. This means numerous __aeabi_lmul
 * calls.
 *
 * Second of all, the 8 functional registers are just not enough.
 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
 * Lo registers, and this shuffling results in thousands more MOVs than A32.
 *
 * A32 and T32 don't have this limitation. They can access all 14 registers,
 * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
 *
 * If compiling Thumb-1 for a target which supports ARM instructions, we
 * will give a warning.
 *
 * Usually, if this happens, it is because of an accident and you probably
 * need to specify -march, as you probably meant to compile for a newer
 * architecture.
 */
# if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
# warning "XXH3 is highly inefficient without ARM or Thumb-2."
# endif
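/*
 * Illustrative sketch only (not compiled, assumed encoding for reference):
 * on A32/T32, the 32->64 multiply used throughout XXH3 maps to a single
 * long-multiply instruction, roughly:
 *
 *     UMULL RdLo, RdHi, Rn, Rm    @ {RdHi:RdLo} = Rn * Rm
 *
 * whereas Thumb-1 has to fall back to an __aeabi_lmul helper call for
 * every XXH_mult32to64, which is what the warning above is about.
 */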
/* ==========================================
 * Vectorization detection
 * ========================================== */
# define XXH_SCALAR 0
# define XXH_SSE2 1
# define XXH_AVX2 2
# define XXH_NEON 3
# define XXH_VSX 4
# ifndef XXH_VECTOR /* can be defined on command line */
# if defined(__AVX2__)
# define XXH_VECTOR XXH_AVX2
# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
# define XXH_VECTOR XXH_SSE2
# elif defined(__GNUC__) /* msvc support maybe later */ \
    && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
    && defined(__LITTLE_ENDIAN__) /* ARM big endian is a thing */
# define XXH_VECTOR XXH_NEON
# elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
# define XXH_VECTOR XXH_VSX
# else
# define XXH_VECTOR XXH_SCALAR
# endif
# endif
/* control alignment of accumulator,
* for compatibility with fast vector loads */
# ifndef XXH_ACC_ALIGN
# if XXH_VECTOR == 0 /* scalar */
# define XXH_ACC_ALIGN 8
# elif XXH_VECTOR == 1 /* sse2 */
# define XXH_ACC_ALIGN 16
# elif XXH_VECTOR == 2 /* avx2 */
# define XXH_ACC_ALIGN 32
# elif XXH_VECTOR == 3 /* neon */
# define XXH_ACC_ALIGN 16
# elif XXH_VECTOR == 4 /* vsx */
# define XXH_ACC_ALIGN 16
# endif
# endif
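/*
 * Illustration (assumption, not part of this file): the accumulator that the
 * accumulation routines below operate on is expected to be declared with this
 * alignment so the vector paths can use fast loads, e.g.
 *
 *     XXH_ALIGN(XXH_ACC_ALIGN) U64 acc[8];   /- one 64-byte stripe worth of 64-bit lanes -/
 */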
/* U64 XXH_mult32to64(U32 a, U64 b) { return (U64)a * (U64)b; } */
# if defined(_MSC_VER) && defined(_M_IX86)
# include <intrin.h>
# define XXH_mult32to64(x, y) __emulu(x, y)
# else
# define XXH_mult32to64(x, y) ((U64)((x) & 0xFFFFFFFF) * (U64)((y) & 0xFFFFFFFF))
# endif
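/*
 * Example of the intended behavior (hypothetical values, illustration only):
 * both operands are treated as 32-bit values and widened, so the result is a
 * full 64-bit product:
 *
 *     XXH_mult32to64(0xFFFFFFFFU, 2U) == 0x1FFFFFFFEULL
 */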
/* VSX stuff. It's a lot because VSX support is mediocre across compilers and
 * there is a lot of mischief with endianness. */
# if XXH_VECTOR == XXH_VSX
# include <altivec.h>
# undef vector
typedef __vector unsigned long long U64x2;
typedef __vector unsigned char U8x16;
typedef __vector unsigned U32x4;
# ifndef XXH_VSX_BE
# ifdef __BIG_ENDIAN__
# define XXH_VSX_BE 1
# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
# warning "-maltivec=be is not recommended. Please use native endianness."
# define XXH_VSX_BE 1
# else
# define XXH_VSX_BE 0
# endif
# endif
/* We need some helpers for big endian mode. */
# if XXH_VSX_BE
/* A wrapper for POWER9's vec_revb. */
# ifdef __POWER9_VECTOR__
# define XXH_vec_revb vec_revb
# else
XXH_FORCE_INLINE U64x2 XXH_vec_revb(U64x2 val)
{
    U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
                              0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
    return vec_perm(val, val, vByteSwap);
}
# endif
/* Power8 Crypto gives us vpermxor which is very handy for
 * PPC64EB.
 *
 * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
 * {
 *     U8x16 ret;
 *     for (int i = 0; i < 16; i++) {
 *         ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
 *     }
 *     return ret;
 * }
 *
 * Because both of the main loops load the key, swap, and xor it with input,
 * we can combine the key swap into this instruction.
 */
# ifdef vec_permxor
# define XXH_vec_permxor vec_permxor
# else
# define XXH_vec_permxor __builtin_crypto_vpermxor
# endif
# endif
/*
 * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
 * vec_mule.
 *
 * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
 * Clang has an easy way to control this: we can just use the builtin, which doesn't swap.
 * GCC needs inline assembly. */
# if __has_builtin(__builtin_altivec_vmuleuw)
# define XXH_vec_mulo __builtin_altivec_vmulouw
# define XXH_vec_mule __builtin_altivec_vmuleuw
# else
/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
XXH_FORCE_INLINE U64x2 XXH_vec_mulo(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
XXH_FORCE_INLINE U64x2 XXH_vec_mule(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
# endif
# endif
/* ==========================================
 * XXH3 default settings
 * ========================================== */
# define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
# if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
# error "default keyset is not large enough"
# endif
XXH_ALIGN(64) static const BYTE kSecret[XXH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};
/*
 * GCC for x86 has a tendency to use SSE in this loop. While it
 * successfully avoids swapping (as MUL overwrites EAX and EDX), it
 * slows it down because instead of free register swap shifts, it
 * must use pshufd and punpckl/hd.
 *
 * To prevent this, we use this attribute to shut off SSE.
 */
# if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
# endif
static XXH128_hash_t
XXH_mult64to128(U64 lhs, U64 rhs)
{
/*
 * GCC/Clang __uint128_t method.
 *
 * On most 64-bit targets, GCC and Clang define a __uint128_t type.
 * This is usually the best way as it usually uses a native long 64-bit
 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
 *
 * Usually.
 *
 * Despite being a 32-bit platform, Clang (and Emscripten) define this
 * type even though they lack the arithmetic for it. This results in a
 * laggy compiler builtin call which calculates a full 128-bit multiply.
 * In that case it is best to use the portable one.
 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
 */
# if defined(__GNUC__) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
    __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
    XXH128_hash_t const r128 = { (U64)(product), (U64)(product >> 64) };
    return r128;
/*
 * MSVC for x64's _umul128 method.
 *
 * U64 _umul128(U64 Multiplier, U64 Multiplicand, U64* HighProduct);
 *
 * This compiles to single operand MUL on x64.
 */
# elif defined(_M_X64) || defined(_M_IA64)
# ifndef _MSC_VER
# pragma intrinsic(_umul128)
# endif
    U64 product_high;
    U64 const product_low = _umul128(lhs, rhs, &product_high);
    XXH128_hash_t const r128 = { product_low, product_high };
    return r128;
# else
/*
 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
 *
 * This is a fast and simple grade school multiply, which is shown
 * below with base 10 arithmetic instead of base 0x100000000.
 *
 *           9 3 // D2 lhs = 93
 *         x 7 5 // D2 rhs = 75
 *     ----------
 *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
 *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
 *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
 *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
 *     ---------
 *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21
 *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63
 *     ---------
 *       6 9 7 5
 *
 * The reasons for adding the products like this are:
 *  1. It avoids manual carry tracking. Just like how
 *     (9 * 9) + 9 + 9 = 99, the same applies with this for
 *     UINT64_MAX. This avoids a lot of complexity.
 *
 *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
 *     instruction available in ARMv6+ A32/T32, which is shown below:
 *
 *         void UMAAL(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm)
 *         {
 *             U64 product = (U64)*RdLo * (U64)*RdHi + Rn + Rm;
 *             *RdLo = (U32)(product & 0xFFFFFFFF);
 *             *RdHi = (U32)(product >> 32);
 *         }
 *
 *     This instruction was designed for efficient long multiplication,
 *     and allows this to be calculated in only 4 instructions which
 *     is comparable to some 64-bit ALUs.
 *
 *  3. It isn't terrible on other platforms. Usually this will be
 *     a couple of 32-bit ADD/ADCs.
 */
    /* First calculate all of the cross products. */
    U64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
    U64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
    U64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
    U64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
    /* Now add the products together. These will never overflow. */
    U64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    U64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
    U64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
    XXH128_hash_t r128 = { lower, upper };
    return r128;
# endif
}
/*
 * We want to keep the attribute here because a target switch
 * disables inlining.
 *
 * Does a 64-bit to 128-bit multiply, then XOR folds it.
 * The reason for the separate function is to prevent passing
 * too many structs around by value. This will hopefully inline
 * the multiply, but we don't force it.
 */
# if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
# endif
static U64
XXH3_mul128_fold64(U64 lhs, U64 rhs)
{
    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
    return product.low64 ^ product.high64;
}
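/*
 * Worked example (illustration only, not part of the algorithm's spec):
 * for lhs = rhs = 1ULL << 32, the full product is 1 << 64, so low64 == 0
 * and high64 == 1, and the folded result is 0 ^ 1 == 1.
 */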
static XXH64_hash_t XXH3_avalanche(U64 h64)
{
    h64 ^= h64 >> 37;
    h64 *= PRIME64_3;
    h64 ^= h64 >> 32;
    return h64;
}
/* ==========================================
 * Short keys
 * ========================================== */
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_1to3_64b(const BYTE* input, size_t len, const BYTE* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    {   BYTE const c1 = input[0];
        BYTE const c2 = input[len >> 1];
        BYTE const c3 = input[len - 1];
        U32 const combined = ((U32)c1) | (((U32)c2) << 8) | (((U32)c3) << 16) | (((U32)len) << 24);
        U64 const keyed = (U64)combined ^ (XXH_readLE32(secret) + seed);
        U64 const mixed = keyed * PRIME64_1;
        return XXH3_avalanche(mixed);
    }
}
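/*
 * Example of the packing above (illustration only): for input = "abc"
 * (len = 3), c1 = 'a', c2 = input[1] = 'b', c3 = 'c', so
 * combined == 0x03636261 before it is keyed and mixed.
 */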
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_4to8_64b(const BYTE* input, size_t len, const BYTE* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    {   U32 const input_lo = XXH_readLE32(input);
        U32 const input_hi = XXH_readLE32(input + len - 4);
        U64 const input_64 = input_lo | ((U64)input_hi << 32);
        U64 const keyed = input_64 ^ (XXH_readLE64(secret) + seed);
        U64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
        return XXH3_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
    }
}
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_9to16_64b(const BYTE* input, size_t len, const BYTE* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   U64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed);
        U64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret + 8) - seed);
        U64 const acc = len + (input_lo + input_hi) + XXH3_mul128_fold64(input_lo, input_hi);
        return XXH3_avalanche(acc);
    }
}
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_0to16_64b(const BYTE* input, size_t len, const BYTE* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_64b(input, len, secret, seed);
        if (len >= 4) return XXH3_len_4to8_64b(input, len, secret, seed);
        if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
        return 0;
    }
}
/* === Long Keys === */
# define STRIPE_LEN 64
# define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
# define ACC_NB (STRIPE_LEN / sizeof(U64))

typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;
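/*
 * Geometry recap (derived from the constants above, stated here for
 * orientation only): each call to XXH3_accumulate_512() consumes one
 * 64-byte stripe of input into ACC_NB = STRIPE_LEN / sizeof(U64) = 8
 * 64-bit accumulator lanes, and, per the XXH_SECRET_CONSUME_RATE comment,
 * 8 fresh secret bytes are brought in at each accumulation.
 */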
2019-02-26 23:24:59 +00:00
XXH_FORCE_INLINE void
XXH3_accumulate_512(void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret,
                    XXH3_accWidth_e accWidth)
{
# if (XXH_VECTOR == XXH_AVX2)
    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*)acc;
        const __m256i* const xinput  = (const __m256i*)input;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
        const __m256i* const xsecret = (const __m256i*)secret;  /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m256i); i++) {
            __m256i const data_vec = _mm256_loadu_si256(xinput + i);
            __m256i const key_vec  = _mm256_loadu_si256(xsecret + i);
            __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);   /* uint32 dk[8] = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
            __m256i const product  = _mm256_mul_epu32(data_key, _mm256_shuffle_epi32(data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
            if (accWidth == XXH3_acc_128bits) {
                __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
                __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
                xacc[i] = _mm256_add_epi64(product, sum);
            } else {  /* XXH3_acc_64bits */
                __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
                xacc[i] = _mm256_add_epi64(product, sum);
            }
    }   }
# elif (XXH_VECTOR == XXH_SSE2)
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*)acc;
        const __m128i* const xinput  = (const __m128i*)input;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
        const __m128i* const xsecret = (const __m128i*)secret;  /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m128i); i++) {
            __m128i const data_vec = _mm_loadu_si128(xinput + i);
            __m128i const key_vec  = _mm_loadu_si128(xsecret + i);
            __m128i const data_key = _mm_xor_si128(data_vec, key_vec);   /* uint32 dk[4] = {d0^k0, d1^k1, d2^k2, d3^k3} */
            __m128i const product  = _mm_mul_epu32(data_key, _mm_shuffle_epi32(data_key, 0x31)); /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
            if (accWidth == XXH3_acc_128bits) {
                __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
                __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
                xacc[i] = _mm_add_epi64(product, sum);
            } else {  /* XXH3_acc_64bits */
                __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
                xacc[i] = _mm_add_epi64(product, sum);
            }
    }   }
# elif (XXH_VECTOR == XXH_NEON)
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t*)acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint8_t const* const xinput  = (const uint8_t*)input;
        uint8_t const* const xsecret = (const uint8_t*)secret;

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
# if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
             * a lot.
             * aarch64 not only uses an entirely different syntax, but it requires three
             * instructions...
             *     ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
             *     zip1   v3.2s, v0.2s, v1.2s   // first zip
             *     zip2   v2.2s, v0.2s, v1.2s   // second zip
             * ...to do what ARM does in one:
             *     vzip.32 d0, d1               // Interleave high and low bits and overwrite. */
            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
            /* key_vec = xsecret[i]; */
            uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t data_key;
            if (accWidth == XXH3_acc_64bits) {
                /* Add first to prevent register swaps */
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u8(data_vec));
            } else {  /* XXH3_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                /* can probably be optimized better */
                uint64x2_t const data64  = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped = vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64(xacc[i], swapped);
            }
            data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));
            /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
            __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));
# else
/* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */
            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
            /* key_vec = xsecret[i]; */
            uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
            uint32x2_t const data_key_lo = vmovn_u64(data_key);
            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
            uint32x2_t const data_key_hi = vshrn_n_u64(data_key, 32);
            if (accWidth == XXH3_acc_64bits) {
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u8(data_vec));
            } else {  /* XXH3_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                uint64x2_t const data64  = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped = vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64(xacc[i], swapped);
            }
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32(xacc[i], data_key_lo, data_key_hi);
# endif
2019-03-01 01:28:29 +00:00
}
}
# elif (XXH_VECTOR == XXH_VSX)
          U64x2* const xacc    = (U64x2*)acc;            /* presumed aligned */
    U64x2 const* const xinput  = (U64x2 const*)input;    /* no alignment restriction */
    U64x2 const* const xsecret = (U64x2 const*)secret;   /* no alignment restriction */
    U64x2 const v32 = { 32, 32 };
# if XXH_VSX_BE
    U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                             0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
# endif
    size_t i;
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        /* data_vec = xinput[i]; */
        /* key_vec = xsecret[i]; */
# if XXH_VSX_BE
        /* byteswap */
        U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
        U64x2 const key_raw  = vec_vsx_ld(0, xsecret + i);
        /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
# else
        U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
        U64x2 const key_vec  = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
# endif
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
        U64x2 const product = XXH_vec_mulo((U32x4)data_key, shuffled);
        xacc[i] += product;
        if (accWidth == XXH3_acc_64bits) {
            xacc[i] += data_vec;
        } else {  /* XXH3_acc_128bits */
            /* swap high and low halves */
            U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
            xacc[i] += data_swapped;
        }
}
# else /* scalar variant of Accumulator - universal */
    XXH_ALIGN(XXH_ACC_ALIGN) U64* const xacc = (U64*)acc;   /* presumed aligned on 32-byte boundaries, little hint for the auto-vectorizer */
    const BYTE* const xinput  = (const BYTE*)input;    /* no alignment restriction */
    const BYTE* const xsecret = (const BYTE*)secret;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN - 1)) == 0);
    for (i = 0; i < ACC_NB; i += 2) {
        U64 const in1  = XXH_readLE64(xinput + 8 * i);
        U64 const in2  = XXH_readLE64(xinput + 8 * (i + 1));
        U64 const key1 = XXH_readLE64(xsecret + 8 * i);
        U64 const key2 = XXH_readLE64(xsecret + 8 * (i + 1));
        U64 const data_key1 = key1 ^ in1;
        U64 const data_key2 = key2 ^ in2;
        xacc[i]     += XXH_mult32to64(data_key1 & 0xFFFFFFFF, data_key1 >> 32);
        xacc[i + 1] += XXH_mult32to64(data_key2 & 0xFFFFFFFF, data_key2 >> 32);
        if (accWidth == XXH3_acc_128bits) {
            xacc[i]     += in2;
            xacc[i + 1] += in1;
        } else {  /* XXH3_acc_64bits */
            xacc[i]     += in1;
            xacc[i + 1] += in2;
        }
}
# endif
}
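/*
 * Summary of one accumulation step, per 64-bit lane (a scalar model of the
 * branches above, shown for the 64-bit accWidth; illustration only):
 *
 *     data_key = input64 ^ secret64;
 *     acc[i]  += (U64)(U32)data_key * (data_key >> 32);   // 32x32->64 multiply
 *     acc[i]  += input64;   // 128-bit mode adds the swapped neighbor lane instead
 */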
XXH_FORCE_INLINE void
XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
# if (XXH_VECTOR == XXH_AVX2)
    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*)acc;
        const __m256i* const xsecret = (const __m256i*)secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);
        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m256i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m256i const acc_vec  = xacc[i];
            __m256i const shifted  = _mm256_srli_epi64(acc_vec, 47);
            __m256i const data_vec = _mm256_xor_si256(acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m256i const key_vec  = _mm256_loadu_si256(xsecret + i);
            __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
            /* xacc[i] *= PRIME32_1; */
            __m256i const data_key_hi = _mm256_shuffle_epi32(data_key, 0x31);
            __m256i const prod_lo = _mm256_mul_epu32(data_key, prime32);
            __m256i const prod_hi = _mm256_mul_epu32(data_key_hi, prime32);
            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
        }
}
#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
        const __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
        const __m128i prime32 = _mm_set1_epi32((int) PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m128i const acc_vec     = xacc[i];
            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
            /* xacc[i] *= PRIME32_1; */
            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
        }
    }
#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   uint64x2_t* const xacc       = (uint64x2_t*) acc;
        uint8_t const* const xsecret = (uint8_t const*) secret;
        uint32x2_t const prime       = vdup_n_u32 (PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
            uint64x2_t const acc_vec  = xacc[i];
            uint64x2_t const shifted  = vshrq_n_u64 (acc_vec, 47);
            uint64x2_t const data_vec = veorq_u64   (acc_vec, shifted);

            /* key_vec  = xsecret[i]; */
            uint32x4_t const key_vec  = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t const data_key = veorq_u32   (vreinterpretq_u32_u64(data_vec), key_vec);
            /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
            uint32x2x2_t const shuffled = vzip_u32  (vget_low_u32(data_key), vget_high_u32(data_key));

            /* data_key *= PRIME32_1 */
            /* prod_hi = (data_key >> 32) * PRIME32_1; */
            uint64x2_t const prod_hi = vmull_u32    (shuffled.val[1], prime);
            /* xacc[i] = prod_hi << 32; */
            xacc[i] = vshlq_n_u64(prod_hi, 32);
            /* xacc[i] += (data_key & 0xFFFFFFFF) * PRIME32_1; */
            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
    }   }
#elif (XXH_VECTOR == XXH_VSX)

    U64x2* const xacc = (U64x2*) acc;
    const U64x2* const xsecret = (const U64x2*) secret;
    /* constants */
    U64x2 const v32 = { 32, 32 };
    U64x2 const v47 = { 47, 47 };
    U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
    size_t i;
#if XXH_VSX_BE
    /* endian swap */
    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
#endif
    for (i = 0; i < STRIPE_LEN/sizeof(U64x2); i++) {
        U64x2 const acc_vec  = xacc[i];
        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
        /* key_vec = xsecret[i]; */
#if XXH_VSX_BE
        /* byteswap the secret words while xoring */
        U64x2 const key_raw  = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
#else
        U64x2 const key_vec  = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif

        /* data_key *= PRIME32_1 */
        /* prod_lo  = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */
        U64x2 const prod_even = XXH_vec_mule((U32x4)data_key, prime);
        /* prod_hi  = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32); */
        U64x2 const prod_odd  = XXH_vec_mulo((U32x4)data_key, prime);
        xacc[i] = prod_odd + (prod_even << v32);
    }
#else   /* scalar variant of Scrambler - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) U64* const xacc = (U64*) acc;   /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
    const BYTE* const xsecret = (const BYTE*) secret;   /* no alignment restriction */
    size_t i;

    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);

    for (i=0; i < ACC_NB; i++) {
        U64 const key64 = XXH_readLE64(xsecret + 8*i);
        U64 acc64 = xacc[i];
        acc64 ^= acc64 >> 47;
        acc64 ^= key64;
        acc64 *= PRIME32_1;
        xacc[i] = acc64;
    }

#endif
}

/* assumption : nbStripes will not overflow secret size */
XXH_FORCE_INLINE void
XXH3_accumulate(     U64* XXH_RESTRICT acc,
                const BYTE* XXH_RESTRICT input,
                const BYTE* XXH_RESTRICT secret,
                      size_t nbStripes,
                      XXH3_accWidth_e accWidth)
{
    size_t n;
    for (n = 0; n < nbStripes; n++) {
        XXH3_accumulate_512(acc,
                            input  + n*STRIPE_LEN,
                            secret + n*XXH_SECRET_CONSUME_RATE,
                            accWidth);
    }
}

/* note : clang auto-vectorizes well in SSE2 mode _if_ this function is `static`,
 *        and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
 *        However, it auto-vectorizes better in AVX2 mode if it is `FORCE_INLINE`.
 *        Pretty much every other mode and compiler prefers `FORCE_INLINE`.
 */
#if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__)
static void
#else
XXH_FORCE_INLINE void
#endif
XXH3_hashLong_internal_loop( U64* XXH_RESTRICT acc,
                      const BYTE* XXH_RESTRICT input, size_t len,
                      const BYTE* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_accWidth_e accWidth)
{
    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
    size_t const block_len = STRIPE_LEN * nb_rounds;
    size_t const nb_blocks = len / block_len;

    size_t n;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);

    for (n = 0; n < nb_blocks; n++) {
        XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
        XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
    }

    /* last partial block */
    XXH_ASSERT(len > STRIPE_LEN);
    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
        XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);

        /* last stripe */
        if (len & (STRIPE_LEN - 1)) {
            const BYTE* const p = input + len - STRIPE_LEN;
#define XXH_SECRET_LASTACC_START 7   /* do not align on 8, so that secret is different from scrambler */
            XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
    }   }
}
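
/* Worked example of the block geometry above (a sketch, assuming the defaults
 * defined earlier in this file: STRIPE_LEN == 64, XXH_SECRET_CONSUME_RATE == 8,
 * and the 192-byte default kSecret) :
 *
 *     nb_rounds = (192 - 64) / 8 = 16 stripes per block
 *     block_len = 64 * 16       = 1024 bytes per block
 *
 * so each 1024-byte block consumes the secret at 8 bytes per stripe, and
 * XXH3_scrambleAcc() then refreshes the accumulators before the next block.
 */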

XXH_FORCE_INLINE U64
XXH3_mix2Accs(const U64* XXH_RESTRICT acc, const BYTE* XXH_RESTRICT secret)
{
    return XXH3_mul128_fold64(
               acc[0] ^ XXH_readLE64(secret),
               acc[1] ^ XXH_readLE64(secret+8) );
}

static XXH64_hash_t
XXH3_mergeAccs(const U64* XXH_RESTRICT acc, const BYTE* XXH_RESTRICT secret, U64 start)
{
    U64 result64 = start;

    result64 += XXH3_mix2Accs(acc+0, secret +  0);
    result64 += XXH3_mix2Accs(acc+2, secret + 16);
    result64 += XXH3_mix2Accs(acc+4, secret + 32);
    result64 += XXH3_mix2Accs(acc+6, secret + 48);

    return XXH3_avalanche(result64);
}
2019-07-20 00:28:09 +00:00
# define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
PRIME64_4 , PRIME32_2 , PRIME64_5 , PRIME32_1 } ;
2019-06-13 01:09:04 +00:00
XXH_FORCE_INLINE XXH64_hash_t
2019-10-02 16:28:01 +00:00
XXH3_hashLong_internal ( const BYTE * XXH_RESTRICT input , size_t len ,
2019-10-01 22:52:21 +00:00
const BYTE * XXH_RESTRICT secret , size_t secretSize )
2019-03-17 20:38:04 +00:00
{
2019-07-20 00:28:09 +00:00
XXH_ALIGN ( XXH_ACC_ALIGN ) U64 acc [ ACC_NB ] = XXH3_INIT_ACC ;
2019-06-11 23:05:23 +00:00
2019-10-02 16:28:01 +00:00
XXH3_hashLong_internal_loop ( acc , input , len , secret , secretSize , XXH3_acc_64bits ) ;
2019-06-11 23:05:23 +00:00
/* converge into final hash */
XXH_STATIC_ASSERT ( sizeof ( acc ) = = 64 ) ;
2019-06-13 21:33:21 +00:00
# define XXH_SECRET_MERGEACCS_START 11 /* do not align on 8, so that secret is different from accumulator */
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( secretSize > = sizeof ( acc ) + XXH_SECRET_MERGEACCS_START ) ;
2019-10-01 22:52:21 +00:00
return XXH3_mergeAccs ( acc , secret + XXH_SECRET_MERGEACCS_START , ( U64 ) len * PRIME64_1 ) ;
2019-03-17 20:38:04 +00:00
}
2019-06-13 01:09:04 +00:00
XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
2019-10-02 16:28:01 +00:00
XXH3_hashLong_64b_defaultSecret ( const BYTE * XXH_RESTRICT input , size_t len )
2019-04-30 00:44:45 +00:00
{
2019-10-02 16:28:01 +00:00
return XXH3_hashLong_internal ( input , len , kSecret , sizeof ( kSecret ) ) ;
2019-04-30 00:44:45 +00:00
}
2019-03-16 13:59:46 +00:00
XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
2019-10-02 16:28:01 +00:00
XXH3_hashLong_64b_withSecret ( const BYTE * XXH_RESTRICT input , size_t len ,
2019-10-01 22:52:21 +00:00
const BYTE * XXH_RESTRICT secret , size_t secretSize )
2019-03-08 20:37:06 +00:00
{
2019-10-02 16:28:01 +00:00
return XXH3_hashLong_internal ( input , len , secret , secretSize ) ;
2019-06-13 01:09:04 +00:00
}
2019-03-17 20:38:04 +00:00
2019-03-08 20:37:06 +00:00
2019-06-14 18:34:24 +00:00
XXH_FORCE_INLINE void XXH_writeLE64 ( void * dst , U64 v64 )
{
if ( ! XXH_CPU_LITTLE_ENDIAN ) v64 = XXH_swap64 ( v64 ) ;
memcpy ( dst , & v64 , sizeof ( v64 ) ) ;
}
2019-10-02 16:28:01 +00:00
/* XXH3_initCustomSecret() :
2019-06-14 18:34:24 +00:00
* destination ` customSecret ` is presumed allocated and same size as ` kSecret ` .
2019-06-12 00:26:30 +00:00
*/
2019-10-02 16:28:01 +00:00
XXH_FORCE_INLINE void XXH3_initCustomSecret ( BYTE * customSecret , U64 seed64 )
2019-03-17 20:38:04 +00:00
{
2019-06-14 18:34:24 +00:00
int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16 ;
2019-03-17 20:38:04 +00:00
int i ;
2019-02-26 20:36:23 +00:00
2019-06-12 20:33:31 +00:00
XXH_STATIC_ASSERT ( ( XXH_SECRET_DEFAULT_SIZE & 15 ) = = 0 ) ;
2019-06-12 18:33:40 +00:00
2019-06-14 18:34:24 +00:00
for ( i = 0 ; i < nbRounds ; i + + ) {
2019-10-01 22:52:21 +00:00
XXH_writeLE64 ( customSecret + 16 * i , XXH_readLE64 ( kSecret + 16 * i ) + seed64 ) ;
XXH_writeLE64 ( customSecret + 16 * i + 8 , XXH_readLE64 ( kSecret + 16 * i + 8 ) - seed64 ) ;
2019-03-17 20:38:04 +00:00
}
}
2019-06-14 18:34:24 +00:00
2019-06-11 23:29:57 +00:00
/* XXH3_hashLong_64b_withSeed() :
* Generate a custom key ,
2019-06-12 20:33:31 +00:00
* based on alteration of default kSecret with the seed ,
2019-06-11 23:29:57 +00:00
* and then use this key for long mode hashing .
* This operation is decently fast but nonetheless costs a little bit of time .
* Try to avoid it whenever possible ( typically when seed = = 0 ) .
*/
2019-03-16 13:59:46 +00:00
XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
2019-10-02 16:28:01 +00:00
XXH3_hashLong_64b_withSeed ( const BYTE * input , size_t len , XXH64_hash_t seed )
2019-03-08 20:37:06 +00:00
{
2019-10-01 22:52:21 +00:00
XXH_ALIGN ( 8 ) BYTE secret [ XXH_SECRET_DEFAULT_SIZE ] ;
2019-10-02 16:28:01 +00:00
if ( seed = = 0 ) return XXH3_hashLong_64b_defaultSecret ( input , len ) ;
XXH3_initCustomSecret ( secret , seed ) ;
return XXH3_hashLong_internal ( input , len , secret , sizeof ( secret ) ) ;
2019-02-26 20:36:23 +00:00
}
2019-10-02 16:28:01 +00:00
XXH_FORCE_INLINE U64 XXH3_mix16B ( const BYTE * XXH_RESTRICT input ,
const BYTE * XXH_RESTRICT secret , U64 seed64 )
2019-03-18 02:05:14 +00:00
{
2019-10-02 16:28:01 +00:00
U64 const input_lo = XXH_readLE64 ( input ) ;
U64 const input_hi = XXH_readLE64 ( input + 8 ) ;
2019-03-18 02:05:14 +00:00
return XXH3_mul128_fold64 (
2019-10-02 16:28:01 +00:00
input_lo ^ ( XXH_readLE64 ( secret ) + seed64 ) ,
input_hi ^ ( XXH_readLE64 ( secret + 8 ) - seed64 ) ) ;
2019-03-18 02:05:14 +00:00
}
2019-06-11 22:38:34 +00:00
XXH_FORCE_INLINE XXH64_hash_t
2019-10-02 16:28:01 +00:00
XXH3_len_17to128_64b ( const BYTE * XXH_RESTRICT input , size_t len ,
2019-10-01 22:52:21 +00:00
const BYTE * XXH_RESTRICT secret , size_t secretSize ,
2019-07-18 00:53:10 +00:00
XXH64_hash_t seed )
2019-02-26 20:36:23 +00:00
{
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( secretSize > = XXH3_SECRET_SIZE_MIN ) ; ( void ) secretSize ;
XXH_ASSERT ( 16 < len & & len < = 128 ) ;
2019-03-06 16:55:48 +00:00
2019-03-25 02:08:54 +00:00
{ U64 acc = len * PRIME64_1 ;
2019-03-06 16:55:48 +00:00
if ( len > 32 ) {
if ( len > 64 ) {
if ( len > 96 ) {
2019-10-02 16:28:01 +00:00
acc + = XXH3_mix16B ( input + 48 , secret + 96 , seed ) ;
acc + = XXH3_mix16B ( input + len - 64 , secret + 112 , seed ) ;
2019-03-06 16:55:48 +00:00
}
2019-10-02 16:28:01 +00:00
acc + = XXH3_mix16B ( input + 32 , secret + 64 , seed ) ;
acc + = XXH3_mix16B ( input + len - 48 , secret + 80 , seed ) ;
2019-03-06 16:55:48 +00:00
}
2019-10-02 16:28:01 +00:00
acc + = XXH3_mix16B ( input + 16 , secret + 32 , seed ) ;
acc + = XXH3_mix16B ( input + len - 32 , secret + 48 , seed ) ;
2019-03-06 16:55:48 +00:00
}
2019-10-02 16:28:01 +00:00
acc + = XXH3_mix16B ( input + 0 , secret + 0 , seed ) ;
acc + = XXH3_mix16B ( input + len - 16 , secret + 16 , seed ) ;
2019-03-06 16:55:48 +00:00
2019-03-12 21:21:24 +00:00
return XXH3_avalanche ( acc ) ;
2019-02-26 20:36:23 +00:00
}
}
2019-07-19 23:44:53 +00:00
# define XXH3_MIDSIZE_MAX 240
2019-06-17 19:38:49 +00:00
XXH_NO_INLINE XXH64_hash_t
2019-10-02 16:28:01 +00:00
XXH3_len_129to240_64b ( const BYTE * XXH_RESTRICT input , size_t len ,
2019-10-01 22:52:21 +00:00
const BYTE * XXH_RESTRICT secret , size_t secretSize ,
2019-07-18 00:53:10 +00:00
XXH64_hash_t seed )
2019-06-17 19:38:49 +00:00
{
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( secretSize > = XXH3_SECRET_SIZE_MIN ) ; ( void ) secretSize ;
XXH_ASSERT ( 128 < len & & len < = XXH3_MIDSIZE_MAX ) ;
2019-06-17 19:38:49 +00:00
# define XXH3_MIDSIZE_STARTOFFSET 3
2019-07-17 22:46:16 +00:00
# define XXH3_MIDSIZE_LASTOFFSET 17
2019-06-17 19:38:49 +00:00
{ U64 acc = len * PRIME64_1 ;
int const nbRounds = ( int ) len / 16 ;
int i ;
for ( i = 0 ; i < 8 ; i + + ) {
2019-10-02 16:28:01 +00:00
acc + = XXH3_mix16B ( input + ( 16 * i ) , secret + ( 16 * i ) , seed ) ;
2019-06-17 19:38:49 +00:00
}
acc = XXH3_avalanche ( acc ) ;
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( nbRounds > = 8 ) ;
2019-06-17 19:38:49 +00:00
for ( i = 8 ; i < nbRounds ; i + + ) {
2019-10-02 16:28:01 +00:00
acc + = XXH3_mix16B ( input + ( 16 * i ) , secret + ( 16 * ( i - 8 ) ) + XXH3_MIDSIZE_STARTOFFSET , seed ) ;
2019-06-17 19:38:49 +00:00
}
/* last bytes */
2019-10-02 16:28:01 +00:00
acc + = XXH3_mix16B ( input + len - 16 , secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET , seed ) ;
2019-06-17 19:38:49 +00:00
return XXH3_avalanche ( acc ) ;
}
}

/* === Public entry point === */

XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
{
    if (len <= 16) return XXH3_len_0to16_64b((const BYTE*)input, len, kSecret, 0);
    if (len <= 128) return XXH3_len_17to128_64b((const BYTE*)input, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const BYTE*)input, len, kSecret, sizeof(kSecret), 0);
    return XXH3_hashLong_64b_defaultSecret((const BYTE*)input, len);
}

XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
    /* if an action must be taken should `secret` conditions not be respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash */
    if (len <= 16) return XXH3_len_0to16_64b((const BYTE*)input, len, (const BYTE*)secret, 0);
    if (len <= 128) return XXH3_len_17to128_64b((const BYTE*)input, len, (const BYTE*)secret, secretSize, 0);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const BYTE*)input, len, (const BYTE*)secret, secretSize, 0);
    return XXH3_hashLong_64b_withSecret((const BYTE*)input, len, (const BYTE*)secret, secretSize);
}
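
/* Caller-side sketch of the contract above (illustrative only) :
 * XXH3_64bits_withSecret() does not validate its arguments in release builds,
 * so a cautious caller can enforce the pre-condition itself. `data`, `dataSize`,
 * `mySecret` and `mySecretSize` are hypothetical application-provided values.
 *
 *     XXH64_hash_t h;
 *     if (mySecretSize >= XXH3_SECRET_SIZE_MIN) {
 *         h = XXH3_64bits_withSecret(data, dataSize, mySecret, mySecretSize);
 *     } else {
 *         h = XXH3_64bits(data, dataSize);   // secret too small : use default secret
 *     }
 */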

XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3_len_0to16_64b((const BYTE*)input, len, kSecret, seed);
    if (len <= 128) return XXH3_len_17to128_64b((const BYTE*)input, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const BYTE*)input, len, kSecret, sizeof(kSecret), seed);
    return XXH3_hashLong_64b_withSeed((const BYTE*)input, len, seed);
}
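
/* One-shot usage sketch (illustrative only, not part of the library).
 * Assumes the caller includes <string.h>; `msg` is a hypothetical buffer.
 *
 *     const char* msg = "hello, xxh3";
 *     XXH64_hash_t const h0 = XXH3_64bits(msg, strlen(msg));               // seed == 0
 *     XXH64_hash_t const h1 = XXH3_64bits_withSeed(msg, strlen(msg), 42);  // custom seed
 *     // h0 and h1 are (with overwhelming probability) different values,
 *     // but each is stable for the same input and seed.
 */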
2019-03-06 22:46:42 +00:00
2019-05-05 03:10:52 +00:00
/* === XXH3 streaming === */
2019-07-20 00:28:09 +00:00
XXH_PUBLIC_API XXH3_state_t * XXH3_createState ( void )
2019-05-05 03:10:52 +00:00
{
return ( XXH3_state_t * ) XXH_malloc ( sizeof ( XXH3_state_t ) ) ;
}
2019-07-20 00:28:09 +00:00
XXH_PUBLIC_API XXH_errorcode XXH3_freeState ( XXH3_state_t * statePtr )
2019-05-05 03:10:52 +00:00
{
XXH_free ( statePtr ) ;
return XXH_OK ;
}
XXH_PUBLIC_API void
2019-07-20 00:28:09 +00:00
XXH3_copyState ( XXH3_state_t * dst_state , const XXH3_state_t * src_state )
2019-05-05 03:10:52 +00:00
{
memcpy ( dst_state , src_state , sizeof ( * dst_state ) ) ;
}
2019-06-14 01:22:46 +00:00
static void
2019-06-14 18:34:24 +00:00
XXH3_64bits_reset_internal ( XXH3_state_t * statePtr ,
XXH64_hash_t seed ,
2019-10-01 22:52:21 +00:00
const BYTE * secret , size_t secretSize )
2019-06-14 01:22:46 +00:00
{
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( statePtr ! = NULL ) ;
2019-06-14 18:34:24 +00:00
memset ( statePtr , 0 , sizeof ( * statePtr ) ) ;
2019-06-14 01:22:46 +00:00
statePtr - > acc [ 0 ] = PRIME32_3 ;
statePtr - > acc [ 1 ] = PRIME64_1 ;
statePtr - > acc [ 2 ] = PRIME64_2 ;
statePtr - > acc [ 3 ] = PRIME64_3 ;
statePtr - > acc [ 4 ] = PRIME64_4 ;
statePtr - > acc [ 5 ] = PRIME32_2 ;
statePtr - > acc [ 6 ] = PRIME64_5 ;
statePtr - > acc [ 7 ] = PRIME32_1 ;
2019-06-14 18:34:24 +00:00
statePtr - > seed = seed ;
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( secret ! = NULL ) ;
2019-06-14 18:34:24 +00:00
statePtr - > secret = secret ;
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( secretSize > = XXH3_SECRET_SIZE_MIN ) ;
2019-06-14 18:34:24 +00:00
statePtr - > secretLimit = ( XXH32_hash_t ) ( secretSize - STRIPE_LEN ) ;
statePtr - > nbStripesPerBlock = statePtr - > secretLimit / XXH_SECRET_CONSUME_RATE ;
2019-06-14 01:22:46 +00:00
}
2019-05-05 03:10:52 +00:00
XXH_PUBLIC_API XXH_errorcode
2019-06-14 01:22:46 +00:00
XXH3_64bits_reset ( XXH3_state_t * statePtr )
2019-05-05 03:10:52 +00:00
{
2019-06-14 01:22:46 +00:00
if ( statePtr = = NULL ) return XXH_ERROR ;
2019-06-14 20:08:17 +00:00
XXH3_64bits_reset_internal ( statePtr , 0 , kSecret , XXH_SECRET_DEFAULT_SIZE ) ;
2019-06-14 01:22:46 +00:00
return XXH_OK ;
}
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecret ( XXH3_state_t * statePtr , const void * secret , size_t secretSize )
{
if ( statePtr = = NULL ) return XXH_ERROR ;
2019-10-01 22:52:21 +00:00
XXH3_64bits_reset_internal ( statePtr , 0 , ( const BYTE * ) secret , secretSize ) ;
2019-06-14 01:22:46 +00:00
if ( secret = = NULL ) return XXH_ERROR ;
2019-06-17 21:16:52 +00:00
if ( secretSize < XXH3_SECRET_SIZE_MIN ) return XXH_ERROR ;
2019-06-14 01:22:46 +00:00
return XXH_OK ;
}
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSeed ( XXH3_state_t * statePtr , XXH64_hash_t seed )
{
if ( statePtr = = NULL ) return XXH_ERROR ;
2019-06-14 18:34:24 +00:00
XXH3_64bits_reset_internal ( statePtr , seed , kSecret , XXH_SECRET_DEFAULT_SIZE ) ;
2019-10-02 16:28:01 +00:00
XXH3_initCustomSecret ( statePtr - > customSecret , seed ) ;
2019-06-14 18:34:24 +00:00
statePtr - > secret = statePtr - > customSecret ;
2019-05-05 03:10:52 +00:00
return XXH_OK ;
}
2019-07-20 00:28:09 +00:00
XXH_FORCE_INLINE void
XXH3_consumeStripes ( U64 * acc ,
2019-10-01 22:52:21 +00:00
XXH32_hash_t * nbStripesSoFarPtr , XXH32_hash_t nbStripesPerBlock ,
2019-10-02 16:28:01 +00:00
const BYTE * input , size_t totalStripes ,
2019-10-01 22:52:21 +00:00
const BYTE * secret , size_t secretLimit ,
XXH3_accWidth_e accWidth )
2019-06-14 01:22:46 +00:00
{
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( * nbStripesSoFarPtr < nbStripesPerBlock ) ;
2019-06-14 01:22:46 +00:00
if ( nbStripesPerBlock - * nbStripesSoFarPtr < = totalStripes ) {
/* need a scrambling operation */
size_t const nbStripes = nbStripesPerBlock - * nbStripesSoFarPtr ;
2019-10-02 16:28:01 +00:00
XXH3_accumulate ( acc , input , secret + nbStripesSoFarPtr [ 0 ] * XXH_SECRET_CONSUME_RATE , nbStripes , accWidth ) ;
2019-10-01 22:52:21 +00:00
XXH3_scrambleAcc ( acc , secret + secretLimit ) ;
2019-10-02 16:28:01 +00:00
XXH3_accumulate ( acc , input + nbStripes * STRIPE_LEN , secret , totalStripes - nbStripes , accWidth ) ;
2019-06-14 01:22:46 +00:00
* nbStripesSoFarPtr = ( XXH32_hash_t ) ( totalStripes - nbStripes ) ;
} else {
2019-10-02 16:28:01 +00:00
XXH3_accumulate ( acc , input , secret + nbStripesSoFarPtr [ 0 ] * XXH_SECRET_CONSUME_RATE , totalStripes , accWidth ) ;
2019-06-14 18:34:24 +00:00
* nbStripesSoFarPtr + = ( XXH32_hash_t ) totalStripes ;
2019-06-14 01:22:46 +00:00
}
}
2019-07-20 00:28:09 +00:00
XXH_FORCE_INLINE XXH_errorcode
2019-10-01 22:52:21 +00:00
XXH3_update ( XXH3_state_t * state , const BYTE * input , size_t len , XXH3_accWidth_e accWidth )
2019-05-05 03:10:52 +00:00
{
if ( input = = NULL )
# if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
return XXH_OK ;
# else
return XXH_ERROR ;
# endif
2019-10-01 22:52:21 +00:00
{ const BYTE * const bEnd = input + len ;
2019-05-05 03:10:52 +00:00
2019-06-14 01:22:46 +00:00
state - > totalLen + = len ;
2019-05-05 03:10:52 +00:00
2019-06-13 23:19:51 +00:00
if ( state - > bufferedSize + len < = XXH3_INTERNALBUFFER_SIZE ) { /* fill in tmp buffer */
2019-06-14 01:22:46 +00:00
XXH_memcpy ( state - > buffer + state - > bufferedSize , input , len ) ;
2019-05-05 03:10:52 +00:00
state - > bufferedSize + = ( XXH32_hash_t ) len ;
return XXH_OK ;
}
2019-06-14 01:22:46 +00:00
/* input now > XXH3_INTERNALBUFFER_SIZE */
2019-06-13 23:19:51 +00:00
2019-06-14 01:22:46 +00:00
# define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN)
2019-06-13 23:19:51 +00:00
XXH_STATIC_ASSERT ( XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN = = 0 ) ; /* clean multiple */
2019-10-02 16:28:01 +00:00
if ( state - > bufferedSize ) { /* some input within internal buffer: fill then consume it */
2019-06-13 23:19:51 +00:00
size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state - > bufferedSize ;
XXH_memcpy ( state - > buffer + state - > bufferedSize , input , loadSize ) ;
2019-10-01 22:52:21 +00:00
input + = loadSize ;
2019-07-20 00:28:09 +00:00
XXH3_consumeStripes ( state - > acc ,
& state - > nbStripesSoFar , state - > nbStripesPerBlock ,
state - > buffer , XXH3_INTERNALBUFFER_STRIPES ,
state - > secret , state - > secretLimit ,
accWidth ) ;
2019-07-03 21:23:43 +00:00
state - > bufferedSize = 0 ;
2019-06-14 01:22:46 +00:00
}
2019-05-05 03:10:52 +00:00
2019-06-13 23:19:51 +00:00
/* consume input by full buffer quantities */
2019-10-01 22:52:21 +00:00
if ( input + XXH3_INTERNALBUFFER_SIZE < = bEnd ) {
2019-06-13 23:19:51 +00:00
const BYTE * const limit = bEnd - XXH3_INTERNALBUFFER_SIZE ;
2019-05-05 03:10:52 +00:00
do {
2019-07-20 00:28:09 +00:00
XXH3_consumeStripes ( state - > acc ,
& state - > nbStripesSoFar , state - > nbStripesPerBlock ,
2019-10-01 22:52:21 +00:00
input , XXH3_INTERNALBUFFER_STRIPES ,
2019-07-20 00:28:09 +00:00
state - > secret , state - > secretLimit ,
accWidth ) ;
2019-10-01 22:52:21 +00:00
input + = XXH3_INTERNALBUFFER_SIZE ;
} while ( input < = limit ) ;
2019-05-05 03:10:52 +00:00
}
2019-10-02 16:28:01 +00:00
        if (input < bEnd) {   /* some remaining input: buffer it */
2019-10-01 22:52:21 +00:00
XXH_memcpy ( state - > buffer , input , ( size_t ) ( bEnd - input ) ) ;
state - > bufferedSize = ( XXH32_hash_t ) ( bEnd - input ) ;
2019-05-05 03:10:52 +00:00
}
}
return XXH_OK ;
}
2019-07-20 00:28:09 +00:00
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_update ( XXH3_state_t * state , const void * input , size_t len )
{
2019-10-01 22:52:21 +00:00
return XXH3_update ( state , ( const BYTE * ) input , len , XXH3_acc_64bits ) ;
2019-07-20 00:28:09 +00:00
}
XXH_FORCE_INLINE void
XXH3_digest_long ( XXH64_hash_t * acc , const XXH3_state_t * state , XXH3_accWidth_e accWidth )
{
2019-10-02 16:28:01 +00:00
memcpy ( acc , state - > acc , sizeof ( state - > acc ) ) ; /* digest locally, state remains unaltered, and can continue ingesting more input afterwards */
2019-07-20 00:28:09 +00:00
if ( state - > bufferedSize > = STRIPE_LEN ) {
size_t const totalNbStripes = state - > bufferedSize / STRIPE_LEN ;
XXH32_hash_t nbStripesSoFar = state - > nbStripesSoFar ;
XXH3_consumeStripes ( acc ,
& nbStripesSoFar , state - > nbStripesPerBlock ,
state - > buffer , totalNbStripes ,
state - > secret , state - > secretLimit ,
accWidth ) ;
if ( state - > bufferedSize % STRIPE_LEN ) { /* one last partial stripe */
XXH3_accumulate_512 ( acc ,
state - > buffer + state - > bufferedSize - STRIPE_LEN ,
2019-10-01 22:52:21 +00:00
state - > secret + state - > secretLimit - XXH_SECRET_LASTACC_START ,
2019-07-20 00:28:09 +00:00
accWidth ) ;
}
} else { /* bufferedSize < STRIPE_LEN */
if ( state - > bufferedSize ) { /* one last stripe */
2019-10-01 22:52:21 +00:00
BYTE lastStripe [ STRIPE_LEN ] ;
2019-07-20 00:28:09 +00:00
size_t const catchupSize = STRIPE_LEN - state - > bufferedSize ;
2019-10-01 22:52:21 +00:00
memcpy ( lastStripe , state - > buffer + sizeof ( state - > buffer ) - catchupSize , catchupSize ) ;
2019-07-20 00:28:09 +00:00
memcpy ( lastStripe + catchupSize , state - > buffer , state - > bufferedSize ) ;
XXH3_accumulate_512 ( acc ,
lastStripe ,
2019-10-01 22:52:21 +00:00
state - > secret + state - > secretLimit - XXH_SECRET_LASTACC_START ,
2019-07-20 00:28:09 +00:00
accWidth ) ;
} }
}
2019-05-05 03:10:52 +00:00
2019-06-13 23:19:51 +00:00
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest ( const XXH3_state_t * state )
2019-05-05 03:10:52 +00:00
{
2019-06-17 21:16:52 +00:00
if ( state - > totalLen > XXH3_MIDSIZE_MAX ) {
2019-06-13 23:19:51 +00:00
XXH_ALIGN ( XXH_ACC_ALIGN ) XXH64_hash_t acc [ ACC_NB ] ;
2019-07-20 00:28:09 +00:00
XXH3_digest_long ( acc , state , XXH3_acc_64bits ) ;
2019-10-01 22:52:21 +00:00
return XXH3_mergeAccs ( acc , state - > secret + XXH_SECRET_MERGEACCS_START , ( U64 ) state - > totalLen * PRIME64_1 ) ;
2019-05-05 03:10:52 +00:00
}
2019-06-17 21:16:52 +00:00
/* len <= XXH3_MIDSIZE_MAX : short code */
2019-06-14 19:26:33 +00:00
if ( state - > seed )
return XXH3_64bits_withSeed ( state - > buffer , ( size_t ) state - > totalLen , state - > seed ) ;
return XXH3_64bits_withSecret ( state - > buffer , ( size_t ) ( state - > totalLen ) , state - > secret , state - > secretLimit + STRIPE_LEN ) ;
2019-05-05 03:10:52 +00:00
}
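
/* Streaming usage sketch (illustrative only, not part of the library).
 * It relies only on the state functions defined above; error checks are
 * abbreviated. Note that XXH3_64bits_digest() does not modify the state,
 * so it can be called mid-stream and ingestion can continue afterwards.
 * `chunk1`, `chunk1Size`, `chunk2`, `chunk2Size` are hypothetical buffers.
 *
 *     XXH3_state_t* const st = XXH3_createState();
 *     if (st != NULL && XXH3_64bits_reset_withSeed(st, 42) == XXH_OK) {
 *         XXH3_64bits_update(st, chunk1, chunk1Size);
 *         XXH64_hash_t const partial = XXH3_64bits_digest(st);   // hash of chunk1 only
 *         XXH3_64bits_update(st, chunk2, chunk2Size);
 *         XXH64_hash_t const full = XXH3_64bits_digest(st);      // hash of chunk1 + chunk2
 *         (void)partial; (void)full;
 *     }
 *     XXH3_freeState(st);
 */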
2019-03-07 04:42:04 +00:00
/* ==========================================
2019-03-08 20:37:06 +00:00
* XXH3 128 bits ( = > XXH128 )
2019-03-07 04:42:04 +00:00
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = */
XXH_FORCE_INLINE XXH128_hash_t
2019-10-02 16:28:01 +00:00
XXH3_len_1to3_128b ( const BYTE * input , size_t len , const BYTE * secret , XXH64_hash_t seed )
2019-03-07 04:42:04 +00:00
{
2019-10-02 16:28:01 +00:00
XXH_ASSERT ( input ! = NULL ) ;
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( 1 < = len & & len < = 3 ) ;
2019-10-02 16:28:01 +00:00
XXH_ASSERT ( secret ! = NULL ) ;
{ BYTE const c1 = input [ 0 ] ;
BYTE const c2 = input [ len > > 1 ] ;
BYTE const c3 = input [ len - 1 ] ;
2019-07-18 22:14:44 +00:00
U32 const combinedl = ( ( U32 ) c1 ) + ( ( ( U32 ) c2 ) < < 8 ) + ( ( ( U32 ) c3 ) < < 16 ) + ( ( ( U32 ) len ) < < 24 ) ;
U32 const combinedh = XXH_swap32 ( combinedl ) ;
2019-10-02 16:28:01 +00:00
U64 const keyed_lo = ( U64 ) combinedl ^ ( XXH_readLE32 ( secret ) + seed ) ;
U64 const keyed_hi = ( U64 ) combinedh ^ ( XXH_readLE32 ( secret + 4 ) - seed ) ;
U64 const mixedl = keyed_lo * PRIME64_1 ;
U64 const mixedh = keyed_hi * PRIME64_5 ;
2019-07-18 22:14:44 +00:00
XXH128_hash_t const h128 = { XXH3_avalanche ( mixedl ) /*low64*/ , XXH3_avalanche ( mixedh ) /*high64*/ } ;
2019-03-16 13:59:46 +00:00
return h128 ;
2019-03-07 04:42:04 +00:00
}
}
XXH_FORCE_INLINE XXH128_hash_t
2019-10-02 16:28:01 +00:00
XXH3_len_4to8_128b ( const BYTE * input , size_t len , const BYTE * secret , XXH64_hash_t seed )
2019-03-07 04:42:04 +00:00
{
2019-10-02 16:28:01 +00:00
XXH_ASSERT ( input ! = NULL ) ;
XXH_ASSERT ( secret ! = NULL ) ;
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( 4 < = len & & len < = 8 ) ;
2019-10-02 16:28:01 +00:00
{ U32 const input_lo = XXH_readLE32 ( input ) ;
U32 const input_hi = XXH_readLE32 ( input + len - 4 ) ;
U64 const input_64_lo = input_lo + ( ( U64 ) input_hi < < 32 ) ;
U64 const input_64_hi = XXH_swap64 ( input_64_lo ) ;
U64 const keyed_lo = input_64_lo ^ ( XXH_readLE64 ( secret ) + seed ) ;
U64 const keyed_hi = input_64_hi ^ ( XXH_readLE64 ( secret + 8 ) - seed ) ;
U64 const mix64l1 = len + ( ( keyed_lo ^ ( keyed_lo > > 51 ) ) * PRIME32_1 ) ;
2019-07-18 22:14:44 +00:00
U64 const mix64l2 = ( mix64l1 ^ ( mix64l1 > > 47 ) ) * PRIME64_2 ;
2019-10-02 16:28:01 +00:00
U64 const mix64h1 = ( ( keyed_hi ^ ( keyed_hi > > 47 ) ) * PRIME64_1 ) - len ;
2019-07-18 22:14:44 +00:00
U64 const mix64h2 = ( mix64h1 ^ ( mix64h1 > > 43 ) ) * PRIME64_4 ;
{ XXH128_hash_t const h128 = { XXH3_avalanche ( mix64l2 ) /*low64*/ , XXH3_avalanche ( mix64h2 ) /*high64*/ } ;
2019-03-16 13:59:46 +00:00
return h128 ;
2019-07-18 22:14:44 +00:00
} }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_9to16_128b(const BYTE* input, size_t len, const BYTE* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   U64 const input_lo = XXH_readLE64(input)           ^ (XXH_readLE64(secret)   + seed);
        U64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret+8) - seed);
        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi, PRIME64_1);
        U64 const lenContrib = XXH_mult32to64(len, PRIME32_5);
        m128.low64  += lenContrib;
        m128.high64 += input_hi * PRIME64_1;
        m128.low64  ^= (m128.high64 >> 32);
        {   XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
            h128.high64 += m128.high64 * PRIME64_2;
            h128.low64   = XXH3_avalanche(h128.low64);
            h128.high64  = XXH3_avalanche(h128.high64);
            return h128;
    }   }
}
2019-07-18 22:14:44 +00:00
/* Assumption : `secret` size is >= 16
* Note : it should be > = XXH3_SECRET_SIZE_MIN anyway */
2019-03-07 04:42:04 +00:00
XXH_FORCE_INLINE XXH128_hash_t
2019-10-02 16:28:01 +00:00
XXH3_len_0to16_128b ( const BYTE * input , size_t len , const BYTE * secret , XXH64_hash_t seed )
2019-03-07 04:42:04 +00:00
{
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( len < = 16 ) ;
2019-10-02 16:28:01 +00:00
{ if ( len > 8 ) return XXH3_len_9to16_128b ( input , len , secret , seed ) ;
if ( len > = 4 ) return XXH3_len_4to8_128b ( input , len , secret , seed ) ;
if ( len ) return XXH3_len_1to3_128b ( input , len , secret , seed ) ;
2019-07-18 22:14:44 +00:00
{ XXH128_hash_t const h128 = { 0 , 0 } ;
2019-03-16 13:59:46 +00:00
return h128 ;
2019-07-19 23:21:17 +00:00
} }
2019-03-07 04:42:04 +00:00
}
2019-07-18 22:14:44 +00:00
XXH_FORCE_INLINE XXH128_hash_t
2019-10-02 16:28:01 +00:00
XXH3_hashLong_128b_internal ( const BYTE * XXH_RESTRICT input , size_t len ,
2019-10-01 22:52:21 +00:00
const BYTE * XXH_RESTRICT secret , size_t secretSize )
2019-03-08 20:37:06 +00:00
{
2019-07-20 00:28:09 +00:00
XXH_ALIGN ( XXH_ACC_ALIGN ) U64 acc [ ACC_NB ] = XXH3_INIT_ACC ;
2019-03-08 20:37:06 +00:00
2019-10-02 16:28:01 +00:00
XXH3_hashLong_internal_loop ( acc , input , len , secret , secretSize , XXH3_acc_128bits ) ;
2019-03-08 20:37:06 +00:00
/* converge into final hash */
2019-07-18 22:14:44 +00:00
XXH_STATIC_ASSERT ( sizeof ( acc ) = = 64 ) ;
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( secretSize > = sizeof ( acc ) + XXH_SECRET_MERGEACCS_START ) ;
2019-10-01 22:52:21 +00:00
{ U64 const low64 = XXH3_mergeAccs ( acc , secret + XXH_SECRET_MERGEACCS_START , ( U64 ) len * PRIME64_1 ) ;
U64 const high64 = XXH3_mergeAccs ( acc , secret + secretSize - sizeof ( acc ) - XXH_SECRET_MERGEACCS_START , ~ ( ( U64 ) len * PRIME64_2 ) ) ;
2019-03-16 13:59:46 +00:00
XXH128_hash_t const h128 = { low64 , high64 } ;
return h128 ;
2019-03-08 20:37:06 +00:00
}
}
2019-07-18 22:14:44 +00:00
XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
2019-10-02 16:28:01 +00:00
XXH3_hashLong_128b_defaultSecret ( const BYTE * input , size_t len )
2019-07-18 22:14:44 +00:00
{
2019-10-02 16:28:01 +00:00
return XXH3_hashLong_128b_internal ( input , len , kSecret , sizeof ( kSecret ) ) ;
2019-07-18 22:14:44 +00:00
}
XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
2019-10-02 16:28:01 +00:00
XXH3_hashLong_128b_withSecret ( const BYTE * input , size_t len ,
2019-10-01 22:52:21 +00:00
const BYTE * secret , size_t secretSize )
2019-03-07 04:42:04 +00:00
{
2019-10-02 16:28:01 +00:00
return XXH3_hashLong_128b_internal ( input , len , secret , secretSize ) ;
2019-07-18 22:14:44 +00:00
}
XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
2019-10-02 16:28:01 +00:00
XXH3_hashLong_128b_withSeed ( const BYTE * input , size_t len , XXH64_hash_t seed )
2019-07-18 22:14:44 +00:00
{
2019-10-01 22:52:21 +00:00
XXH_ALIGN ( 8 ) BYTE secret [ XXH_SECRET_DEFAULT_SIZE ] ;
2019-10-02 16:28:01 +00:00
if ( seed = = 0 ) return XXH3_hashLong_128b_defaultSecret ( input , len ) ;
XXH3_initCustomSecret ( secret , seed ) ;
return XXH3_hashLong_128b_internal ( input , len , secret , sizeof ( secret ) ) ;
2019-07-18 22:14:44 +00:00
}
2019-03-07 04:42:04 +00:00
2019-10-01 05:36:07 +00:00
XXH_FORCE_INLINE XXH128_hash_t
2019-10-02 16:28:01 +00:00
XXH128_mix32B ( XXH128_hash_t acc , const BYTE * input_1 , const BYTE * input_2 , const BYTE * secret , XXH64_hash_t seed )
2019-10-01 05:36:07 +00:00
{
2019-10-02 16:28:01 +00:00
acc . low64 + = XXH3_mix16B ( input_1 , secret + 0 , seed ) ;
acc . low64 ^ = XXH_readLE64 ( input_2 ) + XXH_readLE64 ( input_2 + 8 ) ;
acc . high64 + = XXH3_mix16B ( input_2 , secret + 16 , seed ) ;
acc . high64 ^ = XXH_readLE64 ( input_1 ) + XXH_readLE64 ( input_1 + 8 ) ;
2019-10-01 05:36:07 +00:00
return acc ;
}
2019-07-19 23:44:53 +00:00
XXH_NO_INLINE XXH128_hash_t
2019-10-02 16:28:01 +00:00
XXH3_len_129to240_128b ( const BYTE * XXH_RESTRICT input , size_t len ,
2019-10-01 22:52:21 +00:00
const BYTE * XXH_RESTRICT secret , size_t secretSize ,
XXH64_hash_t seed )
2019-07-19 23:44:53 +00:00
{
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( secretSize > = XXH3_SECRET_SIZE_MIN ) ; ( void ) secretSize ;
XXH_ASSERT ( 128 < len & & len < = XXH3_MIDSIZE_MAX ) ;
2019-07-19 23:44:53 +00:00
2019-10-01 05:36:07 +00:00
{ XXH128_hash_t acc ;
2019-07-19 23:44:53 +00:00
int const nbRounds = ( int ) len / 32 ;
int i ;
2019-10-01 05:36:07 +00:00
acc . low64 = len * PRIME64_1 ;
acc . high64 = 0 ;
2019-07-19 23:44:53 +00:00
for ( i = 0 ; i < 4 ; i + + ) {
2019-10-02 16:28:01 +00:00
acc = XXH128_mix32B ( acc , input + ( 32 * i ) , input + ( 32 * i ) + 16 , secret + ( 32 * i ) , seed ) ;
2019-07-19 23:44:53 +00:00
}
2019-10-01 05:36:07 +00:00
acc . low64 = XXH3_avalanche ( acc . low64 ) ;
acc . high64 = XXH3_avalanche ( acc . high64 ) ;
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( nbRounds > = 4 ) ;
2019-07-19 23:44:53 +00:00
for ( i = 4 ; i < nbRounds ; i + + ) {
2019-10-02 16:28:01 +00:00
acc = XXH128_mix32B ( acc , input + ( 32 * i ) , input + ( 32 * i ) + 16 , secret + XXH3_MIDSIZE_STARTOFFSET + ( 32 * ( i - 4 ) ) , seed ) ;
2019-07-19 23:44:53 +00:00
}
/* last bytes */
2019-10-02 16:28:01 +00:00
acc = XXH128_mix32B ( acc , input + len - 16 , input + len - 32 , secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16 , 0ULL - seed ) ;
2019-07-19 23:44:53 +00:00
2019-10-01 05:36:07 +00:00
{ U64 const low64 = acc . low64 + acc . high64 ;
U64 const high64 = ( acc . low64 * PRIME64_1 ) + ( acc . high64 * PRIME64_4 ) + ( ( len - seed ) * PRIME64_2 ) ;
2019-07-19 23:44:53 +00:00
XXH128_hash_t const h128 = { XXH3_avalanche ( low64 ) , ( XXH64_hash_t ) 0 - XXH3_avalanche ( high64 ) } ;
return h128 ;
}
}
}
2019-07-18 22:14:44 +00:00
2019-10-01 00:33:38 +00:00
2019-07-18 22:14:44 +00:00
XXH_FORCE_INLINE XXH128_hash_t
2019-10-02 16:28:01 +00:00
XXH3_len_17to128_128b ( const BYTE * XXH_RESTRICT input , size_t len ,
2019-10-01 22:52:21 +00:00
const BYTE * XXH_RESTRICT secret , size_t secretSize ,
XXH64_hash_t seed )
2019-07-18 22:14:44 +00:00
{
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( secretSize > = XXH3_SECRET_SIZE_MIN ) ; ( void ) secretSize ;
XXH_ASSERT ( 16 < len & & len < = 128 ) ;
2019-07-18 22:14:44 +00:00
2019-10-01 00:55:46 +00:00
{ XXH128_hash_t acc ;
acc . low64 = len * PRIME64_1 ;
acc . high64 = 0 ;
2019-03-07 04:42:04 +00:00
if ( len > 32 ) {
if ( len > 64 ) {
if ( len > 96 ) {
2019-10-02 16:28:01 +00:00
acc = XXH128_mix32B ( acc , input + 48 , input + len - 64 , secret + 96 , seed ) ;
2019-03-07 04:42:04 +00:00
}
2019-10-02 16:28:01 +00:00
acc = XXH128_mix32B ( acc , input + 32 , input + len - 48 , secret + 64 , seed ) ;
2019-03-07 04:42:04 +00:00
}
2019-10-02 16:28:01 +00:00
acc = XXH128_mix32B ( acc , input + 16 , input + len - 32 , secret + 32 , seed ) ;
2019-03-07 04:42:04 +00:00
}
2019-10-02 16:28:01 +00:00
acc = XXH128_mix32B ( acc , input , input + len - 16 , secret , seed ) ;
2019-10-01 00:55:46 +00:00
{ U64 const low64 = acc . low64 + acc . high64 ;
U64 const high64 = ( acc . low64 * PRIME64_1 ) + ( acc . high64 * PRIME64_4 ) + ( ( len - seed ) * PRIME64_2 ) ;
2019-07-18 22:14:44 +00:00
XXH128_hash_t const h128 = { XXH3_avalanche ( low64 ) , ( XXH64_hash_t ) 0 - XXH3_avalanche ( high64 ) } ;
2019-03-16 13:59:46 +00:00
return h128 ;
2019-03-08 20:37:06 +00:00
}
2019-03-07 04:42:04 +00:00
}
}
2019-10-02 16:28:01 +00:00
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits ( const void * input , size_t len )
2019-03-07 04:42:04 +00:00
{
2019-10-02 16:28:01 +00:00
if ( len < = 16 ) return XXH3_len_0to16_128b ( ( const BYTE * ) input , len , kSecret , 0 ) ;
if ( len < = 128 ) return XXH3_len_17to128_128b ( ( const BYTE * ) input , len , kSecret , sizeof ( kSecret ) , 0 ) ;
if ( len < = XXH3_MIDSIZE_MAX ) return XXH3_len_129to240_128b ( ( const BYTE * ) input , len , kSecret , sizeof ( kSecret ) , 0 ) ;
return XXH3_hashLong_128b_defaultSecret ( ( const BYTE * ) input , len ) ;
2019-03-07 04:42:04 +00:00
}
2019-07-18 22:14:44 +00:00
XXH_PUBLIC_API XXH128_hash_t
2019-10-02 16:28:01 +00:00
XXH3_128bits_withSecret ( const void * input , size_t len , const void * secret , size_t secretSize )
2019-07-18 22:14:44 +00:00
{
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( secretSize > = XXH3_SECRET_SIZE_MIN ) ;
2019-07-18 22:14:44 +00:00
/* if an action must be taken should `secret` conditions not be respected,
* it should be done here .
* For now , it ' s a contract pre - condition .
* Adding a check and a branch here would cost performance at every hash */
2019-10-02 16:28:01 +00:00
if ( len < = 16 ) return XXH3_len_0to16_128b ( ( const BYTE * ) input , len , ( const BYTE * ) secret , 0 ) ;
if ( len < = 128 ) return XXH3_len_17to128_128b ( ( const BYTE * ) input , len , ( const BYTE * ) secret , secretSize , 0 ) ;
if ( len < = XXH3_MIDSIZE_MAX ) return XXH3_len_129to240_128b ( ( const BYTE * ) input , len , ( const BYTE * ) secret , secretSize , 0 ) ;
return XXH3_hashLong_128b_withSecret ( ( const BYTE * ) input , len , ( const BYTE * ) secret , secretSize ) ;
2019-07-18 22:14:44 +00:00
}
XXH_PUBLIC_API XXH128_hash_t
2019-10-02 16:28:01 +00:00
XXH3_128bits_withSeed ( const void * input , size_t len , XXH64_hash_t seed )
2019-07-18 22:14:44 +00:00
{
2019-10-02 16:28:01 +00:00
if ( len < = 16 ) return XXH3_len_0to16_128b ( ( const BYTE * ) input , len , kSecret , seed ) ;
if ( len < = 128 ) return XXH3_len_17to128_128b ( ( const BYTE * ) input , len , kSecret , sizeof ( kSecret ) , seed ) ;
if ( len < = XXH3_MIDSIZE_MAX ) return XXH3_len_129to240_128b ( ( const BYTE * ) input , len , kSecret , sizeof ( kSecret ) , seed ) ;
return XXH3_hashLong_128b_withSeed ( ( const BYTE * ) input , len , seed ) ;
2019-07-18 22:14:44 +00:00
}
2019-03-07 04:42:04 +00:00
2019-07-18 22:14:44 +00:00
XXH_PUBLIC_API XXH128_hash_t
2019-10-02 16:28:01 +00:00
XXH128 ( const void * input , size_t len , XXH64_hash_t seed )
2019-03-07 04:42:04 +00:00
{
2019-10-02 16:28:01 +00:00
return XXH3_128bits_withSeed ( input , len , seed ) ;
2019-03-07 04:42:04 +00:00
}
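
/* 128-bit one-shot usage sketch (illustrative only, not part of the library).
 * `msg` and `msgSize` are hypothetical caller values; XXH128() is simply the
 * seeded 128-bit variant defined just above.
 *
 *     XXH128_hash_t const h = XXH128(msg, msgSize, 0);   // seed == 0
 *     // use h.low64 and h.high64 together as one 128-bit fingerprint
 */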
2019-07-18 22:14:44 +00:00
2019-07-03 15:53:41 +00:00
/* === XXH3 128-bit streaming === */
/* all the functions are actually the same as for 64-bit streaming variant,
just the reset one is different ( different initial acc values for 0 , 5 , 6 , 7 ) ,
and near the end of the digest function */
static void
XXH3_128bits_reset_internal ( XXH3_state_t * statePtr ,
XXH64_hash_t seed ,
2019-10-01 22:52:21 +00:00
const BYTE * secret , size_t secretSize )
2019-07-03 15:53:41 +00:00
{
2019-07-20 00:28:09 +00:00
XXH3_64bits_reset_internal ( statePtr , seed , secret , secretSize ) ;
2019-07-03 15:53:41 +00:00
}
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset ( XXH3_state_t * statePtr )
{
if ( statePtr = = NULL ) return XXH_ERROR ;
XXH3_128bits_reset_internal ( statePtr , 0 , kSecret , XXH_SECRET_DEFAULT_SIZE ) ;
return XXH_OK ;
}
2019-07-20 00:28:09 +00:00
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecret ( XXH3_state_t * statePtr , const void * secret , size_t secretSize )
{
if ( statePtr = = NULL ) return XXH_ERROR ;
2019-10-01 22:52:21 +00:00
XXH3_128bits_reset_internal ( statePtr , 0 , ( const BYTE * ) secret , secretSize ) ;
2019-07-20 00:28:09 +00:00
if ( secret = = NULL ) return XXH_ERROR ;
if ( secretSize < XXH3_SECRET_SIZE_MIN ) return XXH_ERROR ;
return XXH_OK ;
}
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSeed ( XXH3_state_t * statePtr , XXH64_hash_t seed )
{
if ( statePtr = = NULL ) return XXH_ERROR ;
XXH3_128bits_reset_internal ( statePtr , seed , kSecret , XXH_SECRET_DEFAULT_SIZE ) ;
2019-10-02 16:28:01 +00:00
XXH3_initCustomSecret ( statePtr - > customSecret , seed ) ;
2019-07-20 00:28:09 +00:00
statePtr - > secret = statePtr - > customSecret ;
return XXH_OK ;
}
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_update ( XXH3_state_t * state , const void * input , size_t len )
{
2019-10-01 22:52:21 +00:00
return XXH3_update ( state , ( const BYTE * ) input , len , XXH3_acc_128bits ) ;
2019-07-20 00:28:09 +00:00
}
2019-07-03 15:53:41 +00:00
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest ( const XXH3_state_t * state )
{
if ( state - > totalLen > XXH3_MIDSIZE_MAX ) {
XXH_ALIGN ( XXH_ACC_ALIGN ) XXH64_hash_t acc [ ACC_NB ] ;
2019-07-20 00:28:09 +00:00
XXH3_digest_long ( acc , state , XXH3_acc_128bits ) ;
2019-08-05 14:44:18 +00:00
XXH_ASSERT ( state - > secretLimit + STRIPE_LEN > = sizeof ( acc ) + XXH_SECRET_MERGEACCS_START ) ;
2019-10-01 22:52:21 +00:00
{ U64 const low64 = XXH3_mergeAccs ( acc , state - > secret + XXH_SECRET_MERGEACCS_START , ( U64 ) state - > totalLen * PRIME64_1 ) ;
U64 const high64 = XXH3_mergeAccs ( acc , state - > secret + state - > secretLimit + STRIPE_LEN - sizeof ( acc ) - XXH_SECRET_MERGEACCS_START , ~ ( ( U64 ) state - > totalLen * PRIME64_2 ) ) ;
2019-07-03 16:00:52 +00:00
XXH128_hash_t const h128 = { low64 , high64 } ;
return h128 ;
}
2019-07-03 15:53:41 +00:00
}
/* len <= XXH3_MIDSIZE_MAX : short code */
if ( state - > seed )
return XXH3_128bits_withSeed ( state - > buffer , ( size_t ) state - > totalLen , state - > seed ) ;
2019-07-19 23:44:53 +00:00
return XXH3_128bits_withSecret ( state - > buffer , ( size_t ) ( state - > totalLen ) , state - > secret , state - > secretLimit + STRIPE_LEN ) ;
2019-07-03 15:53:41 +00:00
}
2019-07-23 22:33:58 +00:00
/* 128-bit utility functions */
# include <string.h> /* memcmp */
/* return : 1 if equal, 0 if different */
XXH_PUBLIC_API int XXH128_isEqual ( XXH128_hash_t h1 , XXH128_hash_t h2 )
{
/* note : XXH128_hash_t is compact, it has no padding byte */
return ! ( memcmp ( & h1 , & h2 , sizeof ( h1 ) ) ) ;
}
/* This prototype is compatible with stdlib's qsort().
* return : > 0 if * h128_1 > * h128_2
* < 0 if * h128_1 < * h128_2
* = 0 if * h128_1 = = * h128_2 */
XXH_PUBLIC_API int XXH128_cmp ( const void * h128_1 , const void * h128_2 )
{
XXH128_hash_t const h1 = * ( const XXH128_hash_t * ) h128_1 ;
XXH128_hash_t const h2 = * ( const XXH128_hash_t * ) h128_2 ;
int const hcmp = ( h1 . high64 > h2 . high64 ) - ( h2 . high64 > h1 . high64 ) ;
/* note : bets that, in most cases, hash values are different */
if ( hcmp ) return hcmp ;
return ( h1 . low64 > h2 . low64 ) - ( h2 . low64 > h1 . low64 ) ;
}
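
/* Sorting sketch (illustrative only) : because XXH128_cmp() follows the
 * qsort() comparator contract, an array of hashes can be ordered directly.
 * `hashes` and `nbHashes` are hypothetical caller-side values.
 *
 *     #include <stdlib.h>   // qsort
 *     qsort(hashes, nbHashes, sizeof(XXH128_hash_t), XXH128_cmp);
 *     // after the call, hashes[] is sorted by high64 first, then low64
 */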
2019-07-23 22:49:54 +00:00
/*====== Canonical representation ======*/
XXH_PUBLIC_API void
XXH128_canonicalFromHash ( XXH128_canonical_t * dst , XXH128_hash_t hash )
{
XXH_STATIC_ASSERT ( sizeof ( XXH128_canonical_t ) = = sizeof ( XXH128_hash_t ) ) ;
if ( XXH_CPU_LITTLE_ENDIAN ) {
hash . high64 = XXH_swap64 ( hash . high64 ) ;
hash . low64 = XXH_swap64 ( hash . low64 ) ;
}
memcpy ( dst , & hash . high64 , sizeof ( hash . high64 ) ) ;
memcpy ( ( char * ) dst + sizeof ( hash . high64 ) , & hash . low64 , sizeof ( hash . low64 ) ) ;
}
XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical ( const XXH128_canonical_t * src )
{
XXH128_hash_t h ;
h . high64 = XXH_readBE64 ( src ) ;
h . low64 = XXH_readBE64 ( src - > digest + 8 ) ;
return h ;
}
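
/* Canonical round-trip sketch (illustrative only) : the canonical form is a
 * fixed big-endian byte layout, suitable for storage or transmission, and
 * converting back yields the original hash value on any platform.
 * `h128` is some previously computed XXH128_hash_t.
 *
 *     XXH128_canonical_t canon;
 *     XXH128_canonicalFromHash(&canon, h128);
 *     XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
 *     // XXH128_isEqual(h128, back) == 1
 */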
2019-07-03 15:53:41 +00:00
2019-02-26 20:36:23 +00:00
# endif /* XXH3_H */