Mirror of https://github.com/shadps4-emu/ext-cryptopp.git, synced 2024-11-26 19:30:21 +00:00
Add LSH dynamic dispatch (PR #1032)
This commit adds dynamic dispatch to LSH. The implementation pivots on AVX2 and SSSE3.
parent 21a40abc5c
commit a0e21c77ae
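In practice, dynamic dispatch means the generic entry points in lsh256.cpp/lsh512.cpp probe the CPU at run time and forward to SSSE3 or AVX2 workers that live in new, separately compiled translation units. A minimal sketch of that shape, assuming the HasAVX2()/HasSSSE3() feature queries from Crypto++'s cpu.h; the *_AVX2 extern matches the declarations added later in this diff, the *_SSSE3 extern mirrors them, and the *_CXX fallback name is illustrative only:

// Sketch only, not the literal lsh256.cpp code from this commit.
#include "cpu.h"        // CryptoPP::HasAVX2(), CryptoPP::HasSSSE3()

NAMESPACE_BEGIN(CryptoPP)

// Workers compiled with per-file ISA flags (see the makefile rules below).
extern void LSH256_Base_Update_AVX2(word32* state, const byte *input, size_t size);
extern void LSH256_Base_Update_SSSE3(word32* state, const byte *input, size_t size);
extern void LSH256_Base_Update_CXX(word32* state, const byte *input, size_t size);  // hypothetical portable path

void LSH256_Base::Update(const byte *input, size_t size)
{
#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
    if (HasAVX2())      // cached CPUID probe
        return LSH256_Base_Update_AVX2(m_state, input, size);
#endif
#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
    if (HasSSSE3())
        return LSH256_Base_Update_SSSE3(m_state, input, size);
#endif
    LSH256_Base_Update_CXX(m_state, input, size);
}

NAMESPACE_END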
Filelist.txt
@@ -205,7 +205,11 @@ lea.cpp
 lea_simd.cpp
 lea.h
 lsh256.cpp
+lsh256_sse.cpp
+lsh256_avx.cpp
 lsh512.cpp
+lsh512_sse.cpp
+lsh512_avx.cpp
 lsh.h
 luc.cpp
 luc.h
GNUmakefile | 38
@@ -122,7 +122,9 @@ else ifeq ($(findstring clean,$(MAKECMDGOALS)),clean)
 DETECT_FEATURES := 0
 else ifeq ($(findstring distclean,$(MAKECMDGOALS)),distclean)
 DETECT_FEATURES := 0
-else ifeq ($(findstring distclean,$(MAKECMDGOALS)),trim)
+else ifeq ($(findstring trim,$(MAKECMDGOALS)),trim)
 DETECT_FEATURES := 0
 else ifeq ($(findstring zip,$(MAKECMDGOALS)),zip)
 DETECT_FEATURES := 0
 endif
@@ -230,7 +232,7 @@ endif # IS_MINGW

 # Newlib needs _XOPEN_SOURCE=600 for signals
 TPROG = TestPrograms/test_newlib.cpp
-HAVE_OPT = $(shell $(CXX) $(TCXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | wc -w)
+HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
 ifeq ($(strip $(HAVE_OPT)),0)
 ifeq ($(findstring -D_XOPEN_SOURCE,$(CXXFLAGS)),)
 CRYPTOPP_CXXFLAGS += -D_XOPEN_SOURCE=600
@@ -286,7 +288,9 @@ ifeq ($(DETECT_FEATURES),1)
 CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_ASM
 endif

+# Need SSE2 or higher for these tests
 ifneq ($(SSE2_FLAG),)

 TPROG = TestPrograms/test_x86_ssse3.cpp
 TOPT = $(SSSE3_FLAG)
 HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
@@ -295,6 +299,8 @@ ifeq ($(DETECT_FEATURES),1)
 CHAM_FLAG = $(SSSE3_FLAG)
 KECCAK_FLAG = $(SSSE3_FLAG)
 LEA_FLAG = $(SSSE3_FLAG)
+LSH256_FLAG = $(SSSE3_FLAG)
+LSH512_FLAG = $(SSSE3_FLAG)
 SIMON128_FLAG = $(SSSE3_FLAG)
 SPECK128_FLAG = $(SSSE3_FLAG)
 SUN_LDFLAGS += $(SSSE3_FLAG)
@@ -302,6 +308,12 @@ ifeq ($(DETECT_FEATURES),1)
 SSSE3_FLAG =
 endif

+# The first Apple MacBooks were Core2's with SSE4.1
+ifneq ($(IS_DARWIN),0)
+# Add SSE2 algo's here as required
+# They get a free upgrade
+endif
+
 TPROG = TestPrograms/test_x86_sse41.cpp
 TOPT = $(SSE41_FLAG)
 HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
@@ -360,6 +372,8 @@ ifeq ($(DETECT_FEATURES),1)
 HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
 ifeq ($(strip $(HAVE_OPT)),0)
 CHACHA_AVX2_FLAG = $(AVX2_FLAG)
+LSH256_AVX2_FLAG = $(AVX2_FLAG)
+LSH512_AVX2_FLAG = $(AVX2_FLAG)
 SUN_LDFLAGS += $(AVX2_FLAG)
 else
 AVX2_FLAG =
@@ -420,7 +434,7 @@ ifeq ($(DETECT_FEATURES),1)
 # CRYPTOPP_DISABLE_MIXED_ASM is now being added in config_asm.h for all
 # Clang compilers. This test will need to be re-enabled if Clang fixes it.
 #TPROG = TestPrograms/test_asm_mixed.cpp
-#HAVE_OPT = $(shell $(CXX) $(TCXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | wc -w)
+#HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
 #ifneq ($(strip $(HAVE_OPT)),0)
 # CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_MIXED_ASM
 #endif
@@ -1057,7 +1071,7 @@ endif # Valgrind
 # Newlib test due to http://sourceware.org/bugzilla/show_bug.cgi?id=20268
 ifneq ($(filter -DDEBUG -DDEBUG=1,$(CXXFLAGS)),)
 TPROG = TestPrograms/test_cxx.cpp
-USING_GLIBCXX := $(shell $(CXX)$(CXXFLAGS) -E $(TPROG) -o $(TOUT) 2>&1 | $(GREP) -i -c "__GLIBCXX__")
+USING_GLIBCXX := $(shell $(CXX)$(CXXFLAGS) -E $(TPROG) -c 2>&1 | $(GREP) -i -c "__GLIBCXX__")
 ifneq ($(USING_GLIBCXX),0)
 ifeq ($(HAS_NEWLIB),0)
 ifeq ($(findstring -D_GLIBCXX_DEBUG,$(CXXFLAGS)),)
@@ -1621,6 +1635,22 @@ keccak_simd.o : keccak_simd.cpp
 lea_simd.o : lea_simd.cpp
 	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LEA_FLAG) -c) $<

+# SSSE3 available
+lsh256_sse.o : lsh256_sse.cpp
+	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_FLAG) -c) $<
+
+# AVX2 available
+lsh256_avx.o : lsh256_avx.cpp
+	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_AVX2_FLAG) -c) $<
+
+# SSSE3 available
+lsh512_sse.o : lsh512_sse.cpp
+	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_FLAG) -c) $<
+
+# AVX2 available
+lsh512_avx.o : lsh512_avx.cpp
+	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_AVX2_FLAG) -c) $<
+
 # NEON available
 neon_simd.o : neon_simd.cpp
 	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(NEON_FLAG) -c) $<
GNUmakefile-cross
@@ -46,11 +46,11 @@ endif

 IS_LINUX := $(shell echo $(MACHINEX) | $(GREP) -i -c "Linux")

-# Can be used by Android and Embeeded cross-compiles. Disable by default because
+# Can be used by Android and Embedded cross-compiles. Disable by default because
 # Android and embedded users typically don't run this configuration.
 HAS_SOLIB_VERSION ?= 0

-# Formely adhoc.cpp was created from adhoc.cpp.proto when needed.
+# Formerly adhoc.cpp was created from adhoc.cpp.proto when needed.
 # This is now needed because ISA tests are performed using adhoc.cpp.
 ifeq ($(wildcard adhoc.cpp),)
 $(shell cp adhoc.cpp.proto adhoc.cpp)
@@ -192,9 +192,9 @@ else ifeq ($(findstring clean,$(MAKECMDGOALS)),clean)
 DETECT_FEATURES := 0
 else ifeq ($(findstring distclean,$(MAKECMDGOALS)),distclean)
 DETECT_FEATURES := 0
-else ifeq ($(findstring distclean,$(MAKECMDGOALS)),trim)
+else ifeq ($(findstring trim,$(MAKECMDGOALS)),trim)
 DETECT_FEATURES := 0
 else ifeq ($(IS_IOS),1)
 else ifeq ($(findstring zip,$(MAKECMDGOALS)),zip)
 DETECT_FEATURES := 0
 endif
@@ -249,6 +249,7 @@ ifeq ($(DETECT_FEATURES),1)
 CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_ASM
 endif

+# Need SSE2 or higher for these tests
 ifneq ($(SSE2_FLAG),)
 TPROG = TestPrograms/test_x86_ssse3.cpp
 TOPT = $(SSSE3_FLAG)
@@ -258,20 +259,26 @@ ifeq ($(DETECT_FEATURES),1)
 CHAM_FLAG = $(SSSE3_FLAG)
 KECCAK_FLAG = $(SSSE3_FLAG)
 LEA_FLAG = $(SSSE3_FLAG)
+LSH256_FLAG = $(SSSE3_FLAG)
+LSH512_FLAG = $(SSSE3_FLAG)
 SIMON128_FLAG = $(SSSE3_FLAG)
 SPECK128_FLAG = $(SSSE3_FLAG)
-SUN_LDFLAGS += $(SSSE3_FLAG)
 else
 SSSE3_FLAG =
 endif

+# The first Apple MacBooks were Core2's with SSE4.1
+ifneq ($(IS_DARWIN),0)
+# Add SSE2 algo's here as required
+# They get a free upgrade
+endif
+
 TPROG = TestPrograms/test_x86_sse41.cpp
 TOPT = $(SSE41_FLAG)
 HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
 ifeq ($(strip $(HAVE_OPT)),0)
 BLAKE2B_FLAG = $(SSE41_FLAG)
 BLAKE2S_FLAG = $(SSE41_FLAG)
-SUN_LDFLAGS += $(SSE41_FLAG)
 else
 SSE41_FLAG =
 endif
@@ -281,7 +288,6 @@ ifeq ($(DETECT_FEATURES),1)
 HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
 ifeq ($(strip $(HAVE_OPT)),0)
 CRC_FLAG = $(SSE42_FLAG)
-SUN_LDFLAGS += $(SSE42_FLAG)
 else
 SSE42_FLAG =
 endif
@@ -292,7 +298,6 @@ ifeq ($(DETECT_FEATURES),1)
 ifeq ($(strip $(HAVE_OPT)),0)
 GCM_FLAG = $(SSSE3_FLAG) $(CLMUL_FLAG)
 GF2N_FLAG = $(CLMUL_FLAG)
-SUN_LDFLAGS += $(CLMUL_FLAG)
 else
 CLMUL_FLAG =
 endif
@@ -303,7 +308,6 @@ ifeq ($(DETECT_FEATURES),1)
 ifeq ($(strip $(HAVE_OPT)),0)
 AES_FLAG = $(SSE41_FLAG) $(AESNI_FLAG)
 SM4_FLAG = $(SSSE3_FLAG) $(AESNI_FLAG)
-SUN_LDFLAGS += $(AESNI_FLAG)
 else
 AESNI_FLAG =
 endif
@@ -313,7 +317,6 @@ ifeq ($(DETECT_FEATURES),1)
 HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
 ifeq ($(strip $(HAVE_OPT)),0)
 # XXX_FLAG = $(AVX_FLAG)
-SUN_LDFLAGS += $(AVX_FLAG)
 else
 AVX_FLAG =
 endif
@@ -323,7 +326,8 @@ ifeq ($(DETECT_FEATURES),1)
 HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
 ifeq ($(strip $(HAVE_OPT)),0)
 CHACHA_AVX2_FLAG = $(AVX2_FLAG)
-SUN_LDFLAGS += $(AVX2_FLAG)
+LSH256_AVX2_FLAG = $(AVX2_FLAG)
+LSH512_AVX2_FLAG = $(AVX2_FLAG)
 else
 AVX2_FLAG =
 endif
@@ -333,15 +337,10 @@ ifeq ($(DETECT_FEATURES),1)
 HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
 ifeq ($(strip $(HAVE_OPT)),0)
 SHA_FLAG = $(SSE42_FLAG) $(SHANI_FLAG)
-SUN_LDFLAGS += $(SHANI_FLAG)
 else
 SHANI_FLAG =
 endif

-ifeq ($(SUN_COMPILER),1)
-CRYPTOPP_LDFLAGS += $(SUN_LDFLAGS)
-endif
-
 ifeq ($(SSE3_FLAG),)
 CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_SSE3
 else ifeq ($(SSSE3_FLAG),)
@@ -383,7 +382,7 @@ ifeq ($(DETECT_FEATURES),1)
 # CRYPTOPP_DISABLE_MIXED_ASM is now being added in config_asm.h for all
 # Clang compilers. This test will need to be re-enabled if Clang fixes it.
 #TPROG = TestPrograms/test_asm_mixed.cpp
-#HAVE_OPT = $(shell $(CXX) $(TCXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | wc -w)
+#HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
 #ifneq ($(strip $(HAVE_OPT)),0)
 # CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_MIXED_ASM
 #endif
@@ -989,6 +988,22 @@ keccak_simd.o : keccak_simd.cpp
 lea_simd.o : lea_simd.cpp
 	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LEA_FLAG) -c) $<

+# SSSE3 available
+lsh256_sse.o : lsh256_sse.cpp
+	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_FLAG) -c) $<
+
+# AVX2 available
+lsh256_avx.o : lsh256_avx.cpp
+	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_AVX2_FLAG) -c) $<
+
+# SSSE3 available
+lsh512_sse.o : lsh512_sse.cpp
+	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_FLAG) -c) $<
+
+# AVX2 available
+lsh512_avx.o : lsh512_avx.cpp
+	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_AVX2_FLAG) -c) $<
+
 # NEON available
 neon_simd.o : neon_simd.cpp
 	$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(NEON_FLAG) -c) $<
config_misc.h
@@ -191,4 +191,30 @@
 # pragma GCC diagnostic ignored "-Wunused-function"
 #endif

+// Requires ifunc support: GCC 4.8, Binutils 2.20.1 and libc 2.11.1.
+// Should work for Clang 7 and above: https://stackoverflow.com/q/39958935,
+// but fails with Clang 10: https://bugs.llvm.org/show_bug.cgi?id=50025.
+// Should work with GCC 4.8.4 and 7.5.0 but does not:
+// https://travis-ci.org/github/noloader/cryptopp-cmake/jobs/767701720 and
+// https://travis-ci.org/github/noloader/cryptopp/jobs/767704226.
+// Not available on Apple and Solaris platforms. Also see
+// https://sourceware.org/glibc/wiki/GNU_IFUNC and
+// https://gcc.gnu.org/onlinedocs/gcc/Function-Multiversioning.html.
+#if !defined(CRYPTOPP_DISABLE_ASM)
+# if defined(__linux__)
+#  if defined(__i386__) || defined(__i686__) || defined(__amd64__)
+#   if (CRYPTOPP_GCC_VERSION >= 80000) || (CRYPTOPP_LLVM_CLANG_VERSION >= 130000)
+#    include <x86intrin.h>
+#    define CRYPTOPP_HAVE_ATTRIBUTE_TARGET 1
+#    define CRYPTOPP_TARGET_DEFAULT __attribute__ ((target ("default")))
+#    define CRYPTOPP_TARGET_SSSE3 __attribute__ ((target ("ssse3")))
+#   endif
+#  endif
+# endif
+#endif
+
+#ifndef CRYPTOPP_TARGET_DEFAULT
+# define CRYPTOPP_TARGET_DEFAULT
+#endif
+
 #endif  // CRYPTOPP_CONFIG_MISC_H
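The CRYPTOPP_TARGET_* macros added to config_misc.h enable a complementary, compiler-driven form of the same idea on Linux: GCC-style function multi-versioning, where two definitions of one function live in a single translation unit and the loader's ifunc machinery picks one at startup. A hedged sketch of how the macros would be consumed, assuming CRYPTOPP_HAVE_ATTRIBUTE_TARGET is defined (the function name is illustrative; nothing in this diff defines it):

// Sketch only. With the attributes in place, GCC emits both bodies and
// routes calls through an ifunc resolver chosen at program load.
CRYPTOPP_TARGET_DEFAULT
unsigned int ProbeWord(const unsigned int* p)
{
    return p[0];    // portable baseline version
}

CRYPTOPP_TARGET_SSSE3
unsigned int ProbeWord(const unsigned int* p)
{
    return p[0];    // SSSE3-targeted version; may safely use SSSE3 intrinsics
}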
cryptest.nmake
@@ -70,20 +70,21 @@ LIB_SRCS = \
 gfpcrypt.cpp gost.cpp gzip.cpp hc128.cpp hc256.cpp hex.cpp hight.cpp \
 hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp \
 kalynatab.cpp keccak.cpp keccak_core.cpp keccak_simd.cpp lea.cpp \
-lea_simd.cpp lsh256.cpp lsh512.cpp luc.cpp mars.cpp marss.cpp md2.cpp \
-md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp \
-oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp \
-polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabbit.cpp rabin.cpp \
-randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp \
-rijndael.cpp rijndael_simd.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp \
-safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp sha.cpp \
-sha3.cpp sha_simd.cpp shacal2.cpp shacal2_simd.cpp shake.cpp shark.cpp \
-sharkbox.cpp simeck.cpp simon.cpp simon128_simd.cpp skipjack.cpp sm3.cpp \
-sm4.cpp sm4_simd.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp \
-square.cpp squaretb.cpp sse_simd.cpp strciphr.cpp tea.cpp tftables.cpp \
-threefish.cpp tiger.cpp tigertab.cpp ttmac.cpp tweetnacl.cpp twofish.cpp \
-vmac.cpp wake.cpp whrlpool.cpp xed25519.cpp xtr.cpp xtrcrypt.cpp xts.cpp \
-zdeflate.cpp zinflate.cpp zlib.cpp
+lea_simd.cpp lsh256.cpp lsh256_avx.cpp lsh256_sse.cpp lsh512.cpp \
+lsh512_avx.cpp lsh512_sse.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp \
+md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp oaep.cpp \
+osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp \
+pssr.cpp pubkey.cpp queue.cpp rabbit.cpp rabin.cpp randpool.cpp rc2.cpp \
+rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp rijndael_simd.cpp \
+ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp scrypt.cpp \
+seal.cpp seed.cpp serpent.cpp sha.cpp sha3.cpp sha_simd.cpp shacal2.cpp \
+shacal2_simd.cpp shake.cpp shark.cpp sharkbox.cpp simeck.cpp simon.cpp \
+simon128_simd.cpp skipjack.cpp sm3.cpp sm4.cpp sm4_simd.cpp \
+sosemanuk.cpp speck.cpp speck128_simd.cpp square.cpp squaretb.cpp \
+sse_simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp \
+tigertab.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wake.cpp \
+whrlpool.cpp xed25519.cpp xtr.cpp xtrcrypt.cpp xts.cpp zdeflate.cpp \
+zinflate.cpp zlib.cpp

 LIB_OBJS = \
 	cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj \
@@ -100,20 +101,21 @@ LIB_OBJS = \
 gfpcrypt.obj gost.obj gzip.obj hc128.obj hc256.obj hex.obj hight.obj \
 hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj \
 kalynatab.obj keccak.obj keccak_core.obj keccak_simd.obj lea.obj \
-lea_simd.obj lsh256.obj lsh512.obj luc.obj mars.obj marss.obj md2.obj \
-md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj \
-oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj \
-polynomi.obj pssr.obj pubkey.obj queue.obj rabbit.obj rabin.obj \
-randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj \
-rijndael.obj rijndael_simd.obj ripemd.obj rng.obj rsa.obj rw.obj \
-safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj sha.obj \
-sha3.obj sha_simd.obj shacal2.obj shacal2_simd.obj shake.obj shark.obj \
-sharkbox.obj simeck.obj simon.obj simon128_simd.obj skipjack.obj sm3.obj \
-sm4.obj sm4_simd.obj sosemanuk.obj speck.obj speck128_simd.obj \
-square.obj squaretb.obj sse_simd.obj strciphr.obj tea.obj tftables.obj \
-threefish.obj tiger.obj tigertab.obj ttmac.obj tweetnacl.obj twofish.obj \
-vmac.obj wake.obj whrlpool.obj xed25519.obj xtr.obj xtrcrypt.obj xts.obj \
-zdeflate.obj zinflate.obj zlib.obj
+lea_simd.obj lsh256.obj lsh256_avx.obj lsh256_sse.obj lsh512.obj \
+lsh512_avx.obj lsh512_sse.obj luc.obj mars.obj marss.obj md2.obj md4.obj \
+md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj oaep.obj \
+osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj \
+pssr.obj pubkey.obj queue.obj rabbit.obj rabin.obj randpool.obj rc2.obj \
+rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael.obj rijndael_simd.obj \
+ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj scrypt.obj \
+seal.obj seed.obj serpent.obj sha.obj sha3.obj sha_simd.obj shacal2.obj \
+shacal2_simd.obj shake.obj shark.obj sharkbox.obj simeck.obj simon.obj \
+simon128_simd.obj skipjack.obj sm3.obj sm4.obj sm4_simd.obj \
+sosemanuk.obj speck.obj speck128_simd.obj square.obj squaretb.obj \
+sse_simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj \
+tigertab.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wake.obj \
+whrlpool.obj xed25519.obj xtr.obj xtrcrypt.obj xts.obj zdeflate.obj \
+zinflate.obj zlib.obj

 ASM_OBJS = \
 	rdrand-x86.obj rdrand-x64.obj rdseed-x86.obj rdseed-x64.obj x64masm.obj x64dll.obj
@@ -311,6 +313,10 @@ x64dll.obj: x64dll.asm
 !IF "$(PLATFORM)" == "x64" || "$(PLATFORM)" == "X64" || "$(PLATFORM)" == "amd64" || "$(PLATFORM)" == "x86" || "$(PLATFORM)" == "X86"
 chacha_avx.obj:
 	$(CXX) $(CXXFLAGS) /arch:AVX /c chacha_avx.cpp
+lsh256_avx.obj:
+	$(CXX) $(CXXFLAGS) /arch:AVX /c lsh256_avx.cpp
+lsh512_avx.obj:
+	$(CXX) $(CXXFLAGS) /arch:AVX /c lsh512_avx.cpp
 !endif

 # For testing cryptopp.dll and CRYPTOPP_IMPORTS
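The /arch:AVX overrides here play the same role as the LSH256_AVX2_FLAG/LSH512_AVX2_FLAG rules in the GNUmakefile above: each ISA-specific translation unit is compiled with exactly the flags it needs, so AVX encodings never leak into objects that must stay runnable on older CPUs, and the run-time dispatch alone decides whether the AVX2 paths execute.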
cryptlib.vcxproj
@@ -263,7 +263,11 @@
     <ClCompile Include="lea.cpp" />
     <ClCompile Include="lea_simd.cpp" />
     <ClCompile Include="lsh256.cpp" />
+    <ClCompile Include="lsh256_sse.cpp" />
+    <ClCompile Include="lsh256_avx.cpp" />
     <ClCompile Include="lsh512.cpp" />
+    <ClCompile Include="lsh512_sse.cpp" />
+    <ClCompile Include="lsh512_avx.cpp" />
     <ClCompile Include="luc.cpp" />
     <ClCompile Include="mars.cpp" />
     <ClCompile Include="marss.cpp" />
cryptlib.vcxproj.filters
@@ -275,9 +275,21 @@
     <ClCompile Include="lsh256.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="lsh256_sse.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="lsh256_avx.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="lsh512.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="lsh512_sse.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="lsh512_avx.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="luc.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
darn.cpp | 2
@@ -15,7 +15,7 @@
 // GCC inline assembly or the builtin will fail the compile.

 // Inline assembler available in GCC 3.2 or above. For practical
-// purposes we check for GCC 4.0 or above. GCC imposters claim
+// purposes we check for GCC 4.0 or above. GCC impostors claim
 // to be GCC 4.2.1 so it will capture them, too. We exclude the
 // Apple machines because they are not Power9 and use a slightly
 // different syntax in their assembler.
datatest.cpp | 20
@@ -241,15 +241,15 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo
 	}
 	else if (s1.substr(0, 2) == "0x")
 	{
-		std::string::size_type pos = s1.find(' ');
-		StringSource(s1.substr(2, pos), true, new HexDecoder(new StringSink(s2)));
-		s1 = s1.substr(STDMIN(pos, s1.length()));
+		std::string::size_type n = s1.find(' ');
+		StringSource(s1.substr(2, n), true, new HexDecoder(new StringSink(s2)));
+		s1 = s1.substr(STDMIN(n, s1.length()));
 	}
 	else
 	{
-		std::string::size_type pos = s1.find(' ');
-		StringSource(s1.substr(0, pos), true, new HexDecoder(new StringSink(s2)));
-		s1 = s1.substr(STDMIN(pos, s1.length()));
+		std::string::size_type n = s1.find(' ');
+		StringSource(s1.substr(0, n), true, new HexDecoder(new StringSink(s2)));
+		s1 = s1.substr(STDMIN(n, s1.length()));
 	}

 	while (repeat--)
@@ -850,8 +850,8 @@ void TestSymmetricCipherWithFileSource(TestData &v, const NameValuePairs &overri
 	if (encrypted != ciphertext)
 	{
 		std::cout << "\nincorrectly encrypted: ";
-		StringSource ss(encrypted, false, new HexEncoder(new FileSink(std::cout)));
-		ss.Pump(2048); ss.Flush(false);
+		StringSource sss(encrypted, false, new HexEncoder(new FileSink(std::cout)));
+		sss.Pump(2048); sss.Flush(false);
 		std::cout << "\n";
 		SignalTestFailure();
 	}
@@ -867,8 +867,8 @@ void TestSymmetricCipherWithFileSource(TestData &v, const NameValuePairs &overri
 	if (decrypted != plaintext)
 	{
 		std::cout << "\nincorrectly decrypted: ";
-		StringSource ss(decrypted, false, new HexEncoder(new FileSink(std::cout)));
-		ss.Pump(256); ss.Flush(false);
+		StringSource sss(decrypted, false, new HexEncoder(new FileSink(std::cout)));
+		sss.Pump(256); sss.Flush(false);
 		std::cout << "\n";
 		SignalTestFailure();
 	}
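The ss-to-sss and pos-to-n renames in datatest.cpp read like fixes for identifier reuse: each function appears to declare the same name earlier in its body, and redeclaring an identifier in the same scope is an error while shadowing in a nested scope draws -Wshadow warnings. A compiler's-eye view of the problem, under that assumption:

// Illustrative only; not from the diff.
void f()
{
    int ss = 1;          // first declaration
    // int ss = 2;       // error: redefinition of 'ss' in the same scope
    {
        int ss = 2;      // legal, but shadows the outer 'ss'; -Wshadow warns
        (void)ss;
    }
    (void)ss;
}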
lsh.h | 31
@@ -4,6 +4,11 @@
 // see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
 // and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.

+// We are hitting some sort of GCC bug in the LSH AVX2 code path.
+// Clang is OK on the AVX2 code path. We believe it is GCC Issue
+// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
+// makes using zeroupper a little tricky.
+
 /// \file lsh.h
 /// \brief Classes for the LSH hash functions
 /// \since Crypto++ 8.6
@@ -15,6 +20,12 @@
 #include "cryptlib.h"
 #include "secblock.h"

+// Enable SSE2 and AVX2 for 64-bit machines.
+// 32-bit machines slow down with SSE2.
+#if (CRYPTOPP_BOOL_X32) || (CRYPTOPP_BOOL_X64)
+# define CRYPTOPP_ENABLE_64BIT_SSE 1
+#endif
+
 NAMESPACE_BEGIN(CryptoPP)

 /// \brief LSH-224 and LSH-256 hash base class
@@ -34,14 +45,14 @@ public:
     unsigned int OptimalDataAlignment() const { return GetAlignmentOf<word32>(); }

     void Restart();
-    void Update(const byte *input, size_t length);
+    void Update(const byte *input, size_t size);
     void TruncatedFinal(byte *hash, size_t size);

     std::string AlgorithmProvider() const;

 protected:
     LSH256_Base(unsigned int algType, unsigned int digestSize)
-        : m_algType(algType), m_digestSize(digestSize) {}
+        : m_digestSize(digestSize) { m_state[80] = algType; }

 protected:
     // Working state is:
@@ -52,8 +63,10 @@ protected:
     // * submsg_o_l = 8 32-bit words
     // * submsg_o_r = 8 32-bit words
     // * last_block = 32 32-bit words (128 bytes)
-    FixedSizeSecBlock<word32, 80> m_state;
-    word32 m_algType, m_remainingBitLength;
+    // * algType
+    // * remainingBitLength
+    FixedSizeSecBlock<word32, 80+2> m_state;
+    // word32 m_algType, m_remainingBitLength;
     word32 m_digestSize;
 };
@@ -132,14 +145,14 @@ public:
     unsigned int OptimalDataAlignment() const { return GetAlignmentOf<word64>(); }

     void Restart();
-    void Update(const byte *input, size_t length);
+    void Update(const byte *input, size_t size);
     void TruncatedFinal(byte *hash, size_t size);

     std::string AlgorithmProvider() const;

 protected:
     LSH512_Base(unsigned int algType, unsigned int digestSize)
-        : m_algType(algType), m_digestSize(digestSize) {}
+        : m_digestSize(digestSize) { m_state[80] = algType; }

 protected:
     // Working state is:
@@ -150,8 +163,10 @@ protected:
     // * submsg_o_l = 8 64-bit words
     // * submsg_o_r = 8 64-bit words
     // * last_block = 32 64-bit words (256 bytes)
-    FixedSizeSecBlock<word64, 80> m_state;
-    word32 m_algType, m_remainingBitLength;
+    // * algType
+    // * remainingBitLength
+    FixedSizeSecBlock<word64, 80+2> m_state;
+    // word32 m_algType, m_remainingBitLength;
     word32 m_digestSize;
 };
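The replacement of m_algType and m_remainingBitLength with two extra state words is what makes the dispatch plumbing work: the SIMD workers receive a single word32* (or word64*) pointer, so all working state, including the algorithm type at index 80 and the remaining bit length at index 81, travels through one array. A stripped-down sketch of the convention, with the index names taken from the new lsh256_avx.cpp below:

// Sketch of the single-array state convention used by the workers.
typedef unsigned int word32;

const unsigned int AlgorithmType = 80;   // matches lsh256_avx.cpp below
const unsigned int RemainingBits = 81;

// A worker never sees the LSH256_Base object, only its state array.
void Worker_Restart(word32* state)
{
    word32 algType = state[AlgorithmType];  // stashed by the base constructor
    state[RemainingBits] = 0;               // bookkeeping lives in-band
    (void)algType;
}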
lsh256.cpp | 806 (diff suppressed because it is too large)
lsh256_avx.cpp | 647 (new file)
@@ -0,0 +1,647 @@
// lsh.cpp - written and placed in the public domain by Jeffrey Walton
// Based on the specification and source code provided by
// Korea Internet & Security Agency (KISA) website. Also
// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.

// We are hitting some sort of GCC bug in the LSH AVX2 code path.
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
// makes using zeroupper a little tricky.

#include "pch.h"
#include "config.h"

#include "lsh.h"
#include "misc.h"

#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)

#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <emmintrin.h>
# include <immintrin.h>
#endif

#if defined(__GNUC__) && defined(__amd64__)
# include <x86intrin.h>
#endif

ANONYMOUS_NAMESPACE_BEGIN

/* LSH Constants */

const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128;
// const unsigned int LSH256_MSG_BLK_BIT_LEN = 1024;
// const unsigned int LSH256_CV_BYTE_LEN = 64;
const unsigned int LSH256_HASH_VAL_MAX_BYTE_LEN = 32;

// const unsigned int MSG_BLK_WORD_LEN = 32;
const unsigned int CV_WORD_LEN = 16;
const unsigned int CONST_WORD_LEN = 8;
const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
// const unsigned int WORD_BIT_LEN = 32;
const unsigned int NUM_STEPS = 26;

const unsigned int ROT_EVEN_ALPHA = 29;
const unsigned int ROT_EVEN_BETA = 1;
const unsigned int ROT_ODD_ALPHA = 5;
const unsigned int ROT_ODD_BETA = 17;

const unsigned int LSH_TYPE_256_256 = 0x0000020;
const unsigned int LSH_TYPE_256_224 = 0x000001C;

// const unsigned int LSH_TYPE_224 = LSH_TYPE_256_224;
// const unsigned int LSH_TYPE_256 = LSH_TYPE_256_256;

/* Error Code */

const unsigned int LSH_SUCCESS = 0x0;
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;

/* Index into our state array */

const unsigned int AlgorithmType = 80;
const unsigned int RemainingBits = 81;

NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(LSH)

// lsh256.cpp
extern const word32 LSH256_IV224[CV_WORD_LEN];
extern const word32 LSH256_IV256[CV_WORD_LEN];
extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS];

NAMESPACE_END  // LSH
NAMESPACE_END  // Crypto++

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::rotlFixed;
using CryptoPP::rotlConstant;

using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;
using CryptoPP::ConditionalByteReverse;
using CryptoPP::LITTLE_ENDIAN_ORDER;

typedef byte lsh_u8;
typedef word32 lsh_u32;
typedef word32 lsh_uint;
typedef word32 lsh_err;
typedef word32 lsh_type;

using CryptoPP::LSH::LSH256_IV224;
using CryptoPP::LSH::LSH256_IV256;
using CryptoPP::LSH::LSH256_StepConstants;

struct LSH256_AVX2_Context
{
    LSH256_AVX2_Context(word32* state, word32 algType, word32& remainingBitLength) :
        cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
        last_block(reinterpret_cast<byte*>(state+48)),
        remain_databitlen(remainingBitLength),
        alg_type(static_cast<lsh_type>(algType)) {}

    lsh_u32* cv_l;  // start of our state block
    lsh_u32* cv_r;
    lsh_u32* sub_msgs;
    lsh_u8*  last_block;
    lsh_u32& remain_databitlen;
    lsh_type alg_type;
};
struct LSH256_AVX2_Internal
{
    LSH256_AVX2_Internal(word32* state) :
        submsg_e_l(state+16), submsg_e_r(state+24),
        submsg_o_l(state+32), submsg_o_r(state+40) { }

    lsh_u32* submsg_e_l; /* even left sub-message  */
    lsh_u32* submsg_e_r; /* even right sub-message */
    lsh_u32* submsg_o_l; /* odd left sub-message   */
    lsh_u32* submsg_o_r; /* odd right sub-message  */
};

// Zero the upper 128 bits of all YMM registers on exit.
// It avoids AVX state transition penalties when saving state.
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735
// makes using zeroupper a little tricky.

struct AVX_Cleanup
{
    ~AVX_Cleanup() {
        _mm256_zeroupper();
    }
};
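AVX_Cleanup is a small RAII guard: constructing one on the stack guarantees _mm256_zeroupper() runs on every exit path of a function that touched YMM registers, which avoids sprinkling explicit zeroupper calls before each return (the spot where the GCC bug above bites). A usage sketch, illustrative only:

// The lsh256_init_avx2/update/final functions below do exactly this
// with their local "AVX_Cleanup cleanup;" declarations.
void some_avx2_worker()
{
    AVX_Cleanup cleanup;    // destructor issues _mm256_zeroupper()

    // ... 256-bit work with _mm256_* intrinsics ...

    // early returns are fine: the destructor still runs on every path
}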
// const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };

/* LSH AlgType Macro */

inline bool LSH_IS_LSH512(lsh_uint val) {
    return (val & 0xf0000) == 0;
}

inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
    return val >> 24;
}

inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
    return val & 0xffff;
}

inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
    return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
}

inline lsh_u32 loadLE32(lsh_u32 v) {
    return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
}

lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) {
    return rotlFixed(x, r);
}
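The alg_type packing is easy to sanity-check by hand. For LSH_TYPE_256_224 = 0x001C: the low 16 bits give the digest length in bytes (0x1C = 28 bytes = 224 bits), and bits 24 and up give the count of trailing bits to mask off in a partial final byte (0 here, since 224 is byte-aligned). A standalone sketch of the arithmetic, written with constexpr twins of the helpers above so the checks run at compile time:

// Worked example only; mirrors LSH_GET_HASHBYTE / LSH_GET_SMALL_HASHBIT /
// LSH_GET_HASHBIT above, which are plain inline functions in the source.
constexpr unsigned int GetHashByte(unsigned int v)     { return v & 0xffff; }
constexpr unsigned int GetSmallHashBit(unsigned int v) { return v >> 24; }
constexpr unsigned int GetHashBit(unsigned int v)      { return (GetHashByte(v) << 3) - GetSmallHashBit(v); }

static_assert(GetHashByte(0x0000001C) == 28,  "LSH-256-224 digest is 28 bytes");
static_assert(GetSmallHashBit(0x0000001C) == 0, "224 bits is byte aligned");
static_assert(GetHashBit(0x0000001C) == 224,  "28*8 - 0 = 224 bits");
static_assert(GetHashByte(0x00000020) == 32,  "LSH-256-256 digest is 32 bytes");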
// Original code relied upon unaligned lsh_u32 buffer
inline void load_msg_blk(LSH256_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN])
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u32* submsg_e_l = i_state->submsg_e_l;
    lsh_u32* submsg_e_r = i_state->submsg_e_r;
    lsh_u32* submsg_o_l = i_state->submsg_o_l;
    lsh_u32* submsg_o_r = i_state->submsg_o_r;

    _mm256_storeu_si256(M256_CAST(submsg_e_l+0),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+0)));
    _mm256_storeu_si256(M256_CAST(submsg_e_r+0),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+32)));
    _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+64)));
    _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+96)));
}

inline void msg_exp_even(LSH256_AVX2_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u32* submsg_e_l = i_state->submsg_e_l;
    lsh_u32* submsg_e_r = i_state->submsg_e_r;
    lsh_u32* submsg_o_l = i_state->submsg_o_l;
    lsh_u32* submsg_o_r = i_state->submsg_o_r;

    const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514,
        0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c);

    _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi32(
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
        _mm256_shuffle_epi8(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), mask)));
    _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi32(
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
        _mm256_shuffle_epi8(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), mask)));
}
inline void msg_exp_odd(LSH256_AVX2_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u32* submsg_e_l = i_state->submsg_e_l;
    lsh_u32* submsg_e_r = i_state->submsg_e_r;
    lsh_u32* submsg_o_l = i_state->submsg_o_l;
    lsh_u32* submsg_o_r = i_state->submsg_o_r;

    const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514,
        0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c);

    _mm256_storeu_si256(M256_CAST(submsg_o_l+0), _mm256_add_epi32(
        _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
        _mm256_shuffle_epi8(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), mask)));
    _mm256_storeu_si256(M256_CAST(submsg_o_r+0), _mm256_add_epi32(
        _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
        _mm256_shuffle_epi8(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), mask)));
}

inline void load_sc(const lsh_u32** p_const_v, size_t i)
{
    CRYPTOPP_ASSERT(p_const_v != NULLPTR);

    *p_const_v = &LSH256_StepConstants[i];
}

inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_AVX2_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u32* submsg_e_l = i_state->submsg_e_l;
    lsh_u32* submsg_e_r = i_state->submsg_e_r;

    _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0))));
    _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0))));
}

inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_AVX2_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u32* submsg_o_l = i_state->submsg_o_l;
    lsh_u32* submsg_o_r = i_state->submsg_o_r;

    _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l))));
    _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r))));
}

inline void add_blk(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
    _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi32(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
        _mm256_loadu_si256(CONST_M256_CAST(cv_r))));
}
template <unsigned int R>
inline void rotate_blk(lsh_u32 cv[8])
{
    _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
        _mm256_slli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
        _mm256_srli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), 32-R)));
}

inline void xor_with_const(lsh_u32 cv_l[8], const lsh_u32 const_v[8])
{
    _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
        _mm256_loadu_si256(CONST_M256_CAST(const_v))));
}

inline void rotate_msg_gamma(lsh_u32 cv_r[8])
{
    // g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };
    _mm256_storeu_si256(M256_CAST(cv_r+0),
        _mm256_shuffle_epi8(_mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
            _mm256_set_epi8(
                /* hi lane */ 15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1,
                /* lo lane */ 12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0)));
}

inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
    __m256i temp = _mm256_shuffle_epi32(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l)), _MM_SHUFFLE(3,1,0,2));
    _mm256_storeu_si256(M256_CAST(cv_r),
        _mm256_shuffle_epi32(
            _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(1,2,3,0)));
    _mm256_storeu_si256(M256_CAST(cv_l),
        _mm256_permute2x128_si256(temp,
            _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,3,0,1)));
    _mm256_storeu_si256(M256_CAST(cv_r),
        _mm256_permute2x128_si256(temp,
            _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,2,0,0)));
}

/* -------------------------------------------------------- *
 * step function
 * -------------------------------------------------------- */
template <unsigned int Alpha, unsigned int Beta>
inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8])
{
    add_blk(cv_l, cv_r);
    rotate_blk<Alpha>(cv_l);
    xor_with_const(cv_l, const_v);
    add_blk(cv_r, cv_l);
    rotate_blk<Beta>(cv_r);
    add_blk(cv_l, cv_r);
    rotate_msg_gamma(cv_r);
}

/* -------------------------------------------------------- *
 * compression function
 * -------------------------------------------------------- */

inline void compress(LSH256_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_BYTE_LEN])
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    LSH256_AVX2_Internal  s_state(ctx->cv_l);
    LSH256_AVX2_Internal* i_state = &s_state;

    const lsh_u32* const_v = NULL;
    lsh_u32* cv_l = ctx->cv_l;
    lsh_u32* cv_r = ctx->cv_r;

    load_msg_blk(i_state, pdMsgBlk);

    msg_add_even(cv_l, cv_r, i_state);
    load_sc(&const_v, 0);
    mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
    word_perm(cv_l, cv_r);

    msg_add_odd(cv_l, cv_r, i_state);
    load_sc(&const_v, 8);
    mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
    word_perm(cv_l, cv_r);

    for (size_t i = 1; i < NUM_STEPS / 2; i++)
    {
        msg_exp_even(i_state);
        msg_add_even(cv_l, cv_r, i_state);
        load_sc(&const_v, 16 * i);
        mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);

        msg_exp_odd(i_state);
        msg_add_odd(cv_l, cv_r, i_state);
        load_sc(&const_v, 16 * i + 8);
        mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);
    }

    msg_exp_even(i_state);
    msg_add_even(cv_l, cv_r, i_state);
}

/* -------------------------------------------------------- */
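The constant bookkeeping in compress() is worth a quick check: NUM_STEPS = 26 steps consume 8 constants each, and the even/odd pairing walks the table 16 words at a time. A standalone sketch of the arithmetic:

// Sanity sketch only; constants restated locally from this file.
constexpr unsigned int kNumSteps = 26;
constexpr unsigned int kConstWordLen = 8;
static_assert(kNumSteps * kConstWordLen == 208, "26 steps * 8 words = 208 constants");
// Steps 0 and 1 use offsets 0 and 8; loop iterations i = 1..12 use 16*i
// and 16*i + 8, ending at 16*12 + 8 = 200, the final block of eight.
static_assert(16*12 + 8 == 200 && 200 + kConstWordLen == 208, "offsets cover the table exactly");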
inline void load_iv(word32 cv_l[8], word32 cv_r[8], const word32 iv[16])
{
    // The IV's are 32-byte aligned so we can use aligned loads.
    _mm256_storeu_si256(M256_CAST(cv_l+0),
        _mm256_load_si256(CONST_M256_CAST(iv+0)));
    _mm256_storeu_si256(M256_CAST(cv_r+0),
        _mm256_load_si256(CONST_M256_CAST(iv+8)));
}

inline void zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
    _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256());
    _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256());
}

inline void zero_submsgs(LSH256_AVX2_Context* ctx)
{
    lsh_u32* sub_msgs = ctx->sub_msgs;

    _mm256_storeu_si256(M256_CAST(sub_msgs+ 0), _mm256_setzero_si256());
    _mm256_storeu_si256(M256_CAST(sub_msgs+ 8), _mm256_setzero_si256());
    _mm256_storeu_si256(M256_CAST(sub_msgs+16), _mm256_setzero_si256());
    _mm256_storeu_si256(M256_CAST(sub_msgs+24), _mm256_setzero_si256());
}

inline void init224(LSH256_AVX2_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV224);
}

inline void init256(LSH256_AVX2_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV256);
}

/* -------------------------------------------------------- */
inline void fin(LSH256_AVX2_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)),
        _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0))));
}

/* -------------------------------------------------------- */

inline void get_hash(LSH256_AVX2_Context* ctx, lsh_u8* pbHashVal)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);
    CRYPTOPP_ASSERT(pbHashVal != NULLPTR);

    lsh_uint alg_type = ctx->alg_type;
    lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
    lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);

    // Multiplying by looks odd...
    memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
    if (hash_val_bit_len){
        pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
    }
}

/* -------------------------------------------------------- */
lsh_err lsh256_init_avx2(LSH256_AVX2_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);

    lsh_u32 alg_type = ctx->alg_type;
    const lsh_u32* const_v = NULL;
    ctx->remain_databitlen = 0;

    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
    AVX_Cleanup cleanup;

    switch (alg_type)
    {
    case LSH_TYPE_256_256:
        init256(ctx);
        return LSH_SUCCESS;
    case LSH_TYPE_256_224:
        init224(ctx);
        return LSH_SUCCESS;
    default:
        break;
    }

    lsh_u32* cv_l = ctx->cv_l;
    lsh_u32* cv_r = ctx->cv_r;

    zero_iv(cv_l, cv_r);
    cv_l[0] = LSH256_HASH_VAL_MAX_BYTE_LEN;
    cv_l[1] = LSH_GET_HASHBIT(alg_type);

    for (size_t i = 0; i < NUM_STEPS / 2; i++)
    {
        //Mix
        load_sc(&const_v, i * 16);
        mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);

        load_sc(&const_v, i * 16 + 8);
        mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);
    }

    return LSH_SUCCESS;
}
lsh_err lsh256_update_avx2(LSH256_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(data != NULLPTR);
    CRYPTOPP_ASSERT(databitlen % 8 == 0);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);

    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
    AVX_Cleanup cleanup;

    if (databitlen == 0){
        return LSH_SUCCESS;
    }

    // We are byte oriented. tail bits will always be 0.
    size_t databytelen = databitlen >> 3;
    // lsh_uint pos2 = databitlen & 0x7;
    const size_t pos2 = 0;

    size_t remain_msg_byte = ctx->remain_databitlen >> 3;
    // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
    const size_t remain_msg_bit = 0;

    if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
        return LSH_ERR_INVALID_STATE;
    }
    if (remain_msg_bit > 0){
        return LSH_ERR_INVALID_DATABITLEN;
    }

    if (databytelen + remain_msg_byte < LSH256_MSG_BLK_BYTE_LEN)
    {
        memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
        ctx->remain_databitlen += (lsh_uint)databitlen;
        remain_msg_byte += (lsh_uint)databytelen;
        if (pos2){
            ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
        }
        return LSH_SUCCESS;
    }

    if (remain_msg_byte > 0){
        size_t more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte;
        memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
        compress(ctx, ctx->last_block);
        data += more_byte;
        databytelen -= more_byte;
        remain_msg_byte = 0;
        ctx->remain_databitlen = 0;
    }

    while (databytelen >= LSH256_MSG_BLK_BYTE_LEN)
    {
        // This call to compress caused some trouble.
        // The data pointer can become unaligned in the
        // previous block.
        compress(ctx, data);
        data += LSH256_MSG_BLK_BYTE_LEN;
        databytelen -= LSH256_MSG_BLK_BYTE_LEN;
    }

    if (databytelen > 0){
        memcpy(ctx->last_block, data, databytelen);
        ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
    }

    if (pos2){
        ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
        ctx->remain_databitlen += pos2;
    }

    return LSH_SUCCESS;
}
lsh_err lsh256_final_avx2(LSH256_AVX2_Context* ctx, lsh_u8* hashval)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(hashval != NULLPTR);

    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
    AVX_Cleanup cleanup;

    // We are byte oriented. tail bits will always be 0.
    size_t remain_msg_byte = ctx->remain_databitlen >> 3;
    // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
    const size_t remain_msg_bit = 0;

    if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
        return LSH_ERR_INVALID_STATE;
    }

    if (remain_msg_bit){
        ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
    }
    else{
        ctx->last_block[remain_msg_byte] = 0x80;
    }
    memset(ctx->last_block + remain_msg_byte + 1, 0, LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);

    compress(ctx, ctx->last_block);

    fin(ctx);
    get_hash(ctx, hashval);

    return LSH_SUCCESS;
}

ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)

extern
void LSH256_Base_Restart_AVX2(word32* state)
{
    state[RemainingBits] = 0;
    LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh256_init_avx2(&ctx);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_init_avx2 failed");
}

extern
void LSH256_Base_Update_AVX2(word32* state, const byte *input, size_t size)
{
    LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh256_update_avx2(&ctx, input, 8*size);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_update_avx2 failed");
}

extern
void LSH256_Base_TruncatedFinal_AVX2(word32* state, byte *hash, size_t)
{
    LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh256_final_avx2(&ctx, hash);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_final_avx2 failed");
}

NAMESPACE_END

#endif  // CRYPTOPP_AVX2_AVAILABLE
lsh256_sse.cpp | 709 (new file)
@@ -0,0 +1,709 @@
// lsh.cpp - written and placed in the public domain by Jeffrey Walton
// Based on the specification and source code provided by
// Korea Internet & Security Agency (KISA) website. Also
// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.

// We are hitting some sort of GCC bug in the LSH AVX2 code path.
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
// makes using zeroupper a little tricky.

#include "pch.h"
#include "config.h"

#include "lsh.h"
#include "cpu.h"
#include "misc.h"

#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(CRYPTOPP_XOP_AVAILABLE)
# include <ammintrin.h>
#endif

#if defined(__GNUC__) && defined(__amd64__)
# include <x86intrin.h>
#endif

ANONYMOUS_NAMESPACE_BEGIN

/* LSH Constants */

const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128;
// const unsigned int LSH256_MSG_BLK_BIT_LEN = 1024;
// const unsigned int LSH256_CV_BYTE_LEN = 64;
const unsigned int LSH256_HASH_VAL_MAX_BYTE_LEN = 32;

// const unsigned int MSG_BLK_WORD_LEN = 32;
const unsigned int CV_WORD_LEN = 16;
const unsigned int CONST_WORD_LEN = 8;
const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
// const unsigned int WORD_BIT_LEN = 32;
const unsigned int NUM_STEPS = 26;

const unsigned int ROT_EVEN_ALPHA = 29;
const unsigned int ROT_EVEN_BETA = 1;
const unsigned int ROT_ODD_ALPHA = 5;
const unsigned int ROT_ODD_BETA = 17;

const unsigned int LSH_TYPE_256_256 = 0x0000020;
const unsigned int LSH_TYPE_256_224 = 0x000001C;

// const unsigned int LSH_TYPE_224 = LSH_TYPE_256_224;
// const unsigned int LSH_TYPE_256 = LSH_TYPE_256_256;

/* Error Code */

const unsigned int LSH_SUCCESS = 0x0;
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;

/* Index into our state array */

const unsigned int AlgorithmType = 80;
const unsigned int RemainingBits = 81;

NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(LSH)

// lsh256.cpp
extern const word32 LSH256_IV224[CV_WORD_LEN];
extern const word32 LSH256_IV256[CV_WORD_LEN];
extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS];

NAMESPACE_END  // LSH
NAMESPACE_END  // Crypto++

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::rotlFixed;
using CryptoPP::rotlConstant;

using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;
using CryptoPP::ConditionalByteReverse;
using CryptoPP::LITTLE_ENDIAN_ORDER;

typedef byte lsh_u8;
typedef word32 lsh_u32;
typedef word32 lsh_uint;
typedef word32 lsh_err;
typedef word32 lsh_type;

using CryptoPP::LSH::LSH256_IV224;
using CryptoPP::LSH::LSH256_IV256;
using CryptoPP::LSH::LSH256_StepConstants;

struct LSH256_SSSE3_Context
{
    LSH256_SSSE3_Context(word32* state, word32 algType, word32& remainingBitLength) :
        cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
        last_block(reinterpret_cast<byte*>(state+48)),
        remain_databitlen(remainingBitLength),
        alg_type(static_cast<lsh_type>(algType)) {}

    lsh_u32* cv_l;  // start of our state block
    lsh_u32* cv_r;
    lsh_u32* sub_msgs;
    lsh_u8*  last_block;
    lsh_u32& remain_databitlen;
    lsh_type alg_type;
};
struct LSH256_SSSE3_Internal
{
    LSH256_SSSE3_Internal(word32* state) :
        submsg_e_l(state+16), submsg_e_r(state+24),
        submsg_o_l(state+32), submsg_o_r(state+40) { }

    lsh_u32* submsg_e_l; /* even left sub-message  */
    lsh_u32* submsg_e_r; /* even right sub-message */
    lsh_u32* submsg_o_l; /* odd left sub-message   */
    lsh_u32* submsg_o_r; /* odd right sub-message  */
};

const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };

/* LSH AlgType Macro */

inline bool LSH_IS_LSH512(lsh_uint val) {
    return (val & 0xf0000) == 0;
}

inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
    return val >> 24;
}

inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
    return val & 0xffff;
}

inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
    return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
}

inline lsh_u32 loadLE32(lsh_u32 v) {
    return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
}

lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) {
    return rotlFixed(x, r);
}
// Original code relied upon unaligned lsh_u32 buffer
inline void load_msg_blk(LSH256_SSSE3_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN])
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);
    lsh_u32* submsg_e_l = i_state->submsg_e_l;
    lsh_u32* submsg_e_r = i_state->submsg_e_r;
    lsh_u32* submsg_o_l = i_state->submsg_o_l;
    lsh_u32* submsg_o_r = i_state->submsg_o_r;

    _mm_storeu_si128(M128_CAST(submsg_e_l+0),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+0)));
    _mm_storeu_si128(M128_CAST(submsg_e_l+4),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+16)));
    _mm_storeu_si128(M128_CAST(submsg_e_r+0),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+32)));
    _mm_storeu_si128(M128_CAST(submsg_e_r+4),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+48)));
    _mm_storeu_si128(M128_CAST(submsg_o_l+0),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+64)));
    _mm_storeu_si128(M128_CAST(submsg_o_l+4),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+80)));
    _mm_storeu_si128(M128_CAST(submsg_o_r+0),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+96)));
    _mm_storeu_si128(M128_CAST(submsg_o_r+4),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+112)));
}
inline void msg_exp_even(LSH256_SSSE3_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u32* submsg_e_l = i_state->submsg_e_l;
    lsh_u32* submsg_e_r = i_state->submsg_e_r;
    lsh_u32* submsg_o_l = i_state->submsg_o_l;
    lsh_u32* submsg_o_r = i_state->submsg_o_r;

    _mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi32(
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(3,2,1,0)),
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(1,0,2,3))));

    _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_add_epi32(
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), _MM_SHUFFLE(3,2,1,0)),
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), _MM_SHUFFLE(2,1,0,3))));

    _mm_storeu_si128(M128_CAST(submsg_e_r+0), _mm_add_epi32(
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), _MM_SHUFFLE(3,2,1,0)),
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), _MM_SHUFFLE(1,0,2,3))));

    _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_add_epi32(
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), _MM_SHUFFLE(3,2,1,0)),
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), _MM_SHUFFLE(2,1,0,3))));
}

inline void msg_exp_odd(LSH256_SSSE3_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u32* submsg_e_l = i_state->submsg_e_l;
    lsh_u32* submsg_e_r = i_state->submsg_e_r;
    lsh_u32* submsg_o_l = i_state->submsg_o_l;
    lsh_u32* submsg_o_r = i_state->submsg_o_r;

    _mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi32(
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(3,2,1,0)),
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(1,0,2,3))));

    _mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_add_epi32(
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), _MM_SHUFFLE(3,2,1,0)),
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), _MM_SHUFFLE(2,1,0,3))));

    _mm_storeu_si128(M128_CAST(submsg_o_r+0), _mm_add_epi32(
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), _MM_SHUFFLE(3,2,1,0)),
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), _MM_SHUFFLE(1,0,2,3))));

    _mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_add_epi32(
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), _MM_SHUFFLE(3,2,1,0)),
        _mm_shuffle_epi32(
            _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), _MM_SHUFFLE(2,1,0,3))));
}
inline void load_sc(const lsh_u32** p_const_v, size_t i)
|
||||
{
|
||||
CRYPTOPP_ASSERT(p_const_v != NULLPTR);
|
||||
|
||||
*p_const_v = &LSH256_StepConstants[i];
|
||||
}
|
||||
|
||||
inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_SSSE3_Internal* i_state)
|
||||
{
|
||||
CRYPTOPP_ASSERT(i_state != NULLPTR);
|
||||
|
||||
lsh_u32* submsg_e_l = i_state->submsg_e_l;
|
||||
lsh_u32* submsg_e_r = i_state->submsg_e_r;
|
||||
|
||||
_mm_storeu_si128(M128_CAST(cv_l+0), _mm_xor_si128(
|
||||
_mm_loadu_si128(CONST_M128_CAST(cv_l+0)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0))));
|
||||
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
|
||||
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4))));
|
||||
_mm_storeu_si128(M128_CAST(cv_r+0), _mm_xor_si128(
|
||||
_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0))));
|
||||
_mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
|
||||
_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4))));
|
||||
}

inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_SSSE3_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u32* submsg_o_l = i_state->submsg_o_l;
    lsh_u32* submsg_o_r = i_state->submsg_o_r;

    _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_l))));
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4))));
    _mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_r)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_r))));
    _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4))));
}

inline void add_blk(lsh_u32 cv_l[8], const lsh_u32 cv_r[8])
{
    _mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi32(
        _mm_loadu_si128(CONST_M128_CAST(cv_l)),
        _mm_loadu_si128(CONST_M128_CAST(cv_r))));
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi32(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+4))));
}

template <unsigned int R>
inline void rotate_blk(lsh_u32 cv[8])
{
#if defined(CRYPTOPP_XOP_AVAILABLE)
    _mm_storeu_si128(M128_CAST(cv),
        _mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R));
    _mm_storeu_si128(M128_CAST(cv+4),
        _mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R));
#else
    _mm_storeu_si128(M128_CAST(cv), _mm_or_si128(
        _mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R),
        _mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), 32-R)));
    _mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128(
        _mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R),
        _mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), 32-R)));
#endif
}
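
// Editor's note (added): on XOP hardware _mm_roti_epi32 rotates each 32-bit
// lane directly; elsewhere the rotate is emulated per lane as
// (x << R) | (x >> (32-R)). R is a compile-time template constant, so the
// fallback costs two shifts and an OR per 128-bit register.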

inline void xor_with_const(lsh_u32* cv_l, const lsh_u32* const_v)
{
    _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l)),
        _mm_loadu_si128(CONST_M128_CAST(const_v))));
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
        _mm_loadu_si128(CONST_M128_CAST(const_v+4))));
}

inline void rotate_msg_gamma(lsh_u32 cv_r[8])
{
    // g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };
    _mm_storeu_si128(M128_CAST(cv_r+0),
        _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
            _mm_set_epi8(12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0)));
    _mm_storeu_si128(M128_CAST(cv_r+4),
        _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
            _mm_set_epi8(15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1)));
}
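
// Editor's note (added): every entry of g_gamma256 is a multiple of 8, so the
// per-word rotations reduce to byte permutations. The first pshufb mask above
// rotates words 0..3 left by {0,8,16,24} bits and the second rotates words
// 4..7 left by {24,16,8,0} bits, one shuffle per register.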

inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
    _mm_storeu_si128(M128_CAST(cv_l+0), _mm_shuffle_epi32(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2)));
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_shuffle_epi32(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2)));
    _mm_storeu_si128(M128_CAST(cv_r+0), _mm_shuffle_epi32(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0)));
    _mm_storeu_si128(M128_CAST(cv_r+4), _mm_shuffle_epi32(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0)));

    __m128i temp = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
    _mm_storeu_si128(M128_CAST(cv_l+0),
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4)));
    _mm_storeu_si128(M128_CAST(cv_l+4),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+4)));
    _mm_storeu_si128(M128_CAST(cv_r+4),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+0)));
    _mm_storeu_si128(M128_CAST(cv_r+0), temp);
}

/* -------------------------------------------------------- *
 * step function
 * -------------------------------------------------------- */

template <unsigned int Alpha, unsigned int Beta>
inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8])
{
    add_blk(cv_l, cv_r);
    rotate_blk<Alpha>(cv_l);
    xor_with_const(cv_l, const_v);
    add_blk(cv_r, cv_l);
    rotate_blk<Beta>(cv_r);
    add_blk(cv_l, cv_r);
    rotate_msg_gamma(cv_r);
}
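
// Editor's note (added): mix() applies the LSH mix step to the whole chaining
// variable at once: add, rotate by Alpha, XOR a step constant, add, rotate by
// Beta, add, then the gamma byte-rotations on the right half.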

/* -------------------------------------------------------- *
 * compression function
 * -------------------------------------------------------- */

inline void compress(LSH256_SSSE3_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_BYTE_LEN])
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    LSH256_SSSE3_Internal s_state(ctx->cv_l);
    LSH256_SSSE3_Internal* i_state = &s_state;

    const lsh_u32* const_v = NULL;
    lsh_u32* cv_l = ctx->cv_l;
    lsh_u32* cv_r = ctx->cv_r;

    load_msg_blk(i_state, pdMsgBlk);

    msg_add_even(cv_l, cv_r, i_state);
    load_sc(&const_v, 0);
    mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
    word_perm(cv_l, cv_r);

    msg_add_odd(cv_l, cv_r, i_state);
    load_sc(&const_v, 8);
    mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
    word_perm(cv_l, cv_r);

    for (size_t i = 1; i < NUM_STEPS / 2; i++)
    {
        msg_exp_even(i_state);
        msg_add_even(cv_l, cv_r, i_state);
        load_sc(&const_v, 16 * i);
        mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);

        msg_exp_odd(i_state);
        msg_add_odd(cv_l, cv_r, i_state);
        load_sc(&const_v, 16 * i + 8);
        mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);
    }

    msg_exp_even(i_state);
    msg_add_even(cv_l, cv_r, i_state);
}
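
// Editor's note (added): compress() alternates even and odd steps. The first
// two steps consume the loaded message block directly; each later pair first
// expands the sub-messages (msg_exp_even/msg_exp_odd) before adding them, and
// a final expansion-plus-add feeds the message forward into the chaining
// variable.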

/* -------------------------------------------------------- */

inline void load_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 iv[16])
{
    _mm_storeu_si128(M128_CAST(cv_l+ 0),
        _mm_load_si128(CONST_M128_CAST(iv+ 0)));
    _mm_storeu_si128(M128_CAST(cv_l+ 4),
        _mm_load_si128(CONST_M128_CAST(iv+ 4)));
    _mm_storeu_si128(M128_CAST(cv_r+ 0),
        _mm_load_si128(CONST_M128_CAST(iv+ 8)));
    _mm_storeu_si128(M128_CAST(cv_r+ 4),
        _mm_load_si128(CONST_M128_CAST(iv+12)));
}

inline void zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
    _mm_storeu_si128(M128_CAST(cv_l+0), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(cv_r+0), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(cv_r+4), _mm_setzero_si128());
}

inline void zero_submsgs(LSH256_SSSE3_Context* ctx)
{
    lsh_u32* sub_msgs = ctx->sub_msgs;

    _mm_storeu_si128(M128_CAST(sub_msgs+ 0), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+ 4), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+ 8), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+12), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+16), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+20), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+24), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+28), _mm_setzero_si128());
}

inline void init224(LSH256_SSSE3_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV224);
}

inline void init256(LSH256_SSSE3_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV256);
}

/* -------------------------------------------------------- */

inline void fin(LSH256_SSSE3_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    _mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)),
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0))));
    _mm_storeu_si128(M128_CAST(ctx->cv_l+4), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+4)),
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+4))));
}

/* -------------------------------------------------------- */

inline void get_hash(LSH256_SSSE3_Context* ctx, lsh_u8* pbHashVal)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);
    CRYPTOPP_ASSERT(pbHashVal != NULLPTR);

    lsh_uint alg_type = ctx->alg_type;
    lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
    lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);

    // Multiplying by sizeof(lsh_u8) looks odd...
    memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
    if (hash_val_bit_len){
        pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
    }
}

/* -------------------------------------------------------- */

lsh_err lsh256_ssse3_init(LSH256_SSSE3_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);

    lsh_u32 alg_type = ctx->alg_type;
    const lsh_u32* const_v = NULL;
    ctx->remain_databitlen = 0;

    switch (alg_type)
    {
    case LSH_TYPE_256_256:
        init256(ctx);
        return LSH_SUCCESS;
    case LSH_TYPE_256_224:
        init224(ctx);
        return LSH_SUCCESS;
    default:
        break;
    }

    lsh_u32* cv_l = ctx->cv_l;
    lsh_u32* cv_r = ctx->cv_r;

    zero_iv(cv_l, cv_r);
    cv_l[0] = LSH256_HASH_VAL_MAX_BYTE_LEN;
    cv_l[1] = LSH_GET_HASHBIT(alg_type);

    for (size_t i = 0; i < NUM_STEPS / 2; i++)
    {
        // Mix
        load_sc(&const_v, i * 16);
        mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);

        load_sc(&const_v, i * 16 + 8);
        mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);
    }

    return LSH_SUCCESS;
}

lsh_err lsh256_ssse3_update(LSH256_SSSE3_Context* ctx, const lsh_u8* data, size_t databitlen)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(data != NULLPTR);
    CRYPTOPP_ASSERT(databitlen % 8 == 0);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);

    if (databitlen == 0){
        return LSH_SUCCESS;
    }

    // We are byte oriented. Tail bits will always be 0.
    size_t databytelen = databitlen >> 3;
    // lsh_uint pos2 = databitlen & 0x7;
    const size_t pos2 = 0;

    size_t remain_msg_byte = ctx->remain_databitlen >> 3;
    // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
    const size_t remain_msg_bit = 0;

    if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
        return LSH_ERR_INVALID_STATE;
    }
    if (remain_msg_bit > 0){
        return LSH_ERR_INVALID_DATABITLEN;
    }

    if (databytelen + remain_msg_byte < LSH256_MSG_BLK_BYTE_LEN)
    {
        memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
        ctx->remain_databitlen += (lsh_uint)databitlen;
        remain_msg_byte += (lsh_uint)databytelen;
        if (pos2){
            ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
        }
        return LSH_SUCCESS;
    }

    if (remain_msg_byte > 0){
        size_t more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte;
        memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
        compress(ctx, ctx->last_block);
        data += more_byte;
        databytelen -= more_byte;
        remain_msg_byte = 0;
        ctx->remain_databitlen = 0;
    }

    while (databytelen >= LSH256_MSG_BLK_BYTE_LEN)
    {
        // This call to compress caused some trouble.
        // The data pointer can become unaligned in the
        // previous block.
        compress(ctx, data);
        data += LSH256_MSG_BLK_BYTE_LEN;
        databytelen -= LSH256_MSG_BLK_BYTE_LEN;
    }

    if (databytelen > 0){
        memcpy(ctx->last_block, data, databytelen);
        ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
    }

    if (pos2){
        ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
        ctx->remain_databitlen += pos2;
    }

    return LSH_SUCCESS;
}

lsh_err lsh256_ssse3_final(LSH256_SSSE3_Context* ctx, lsh_u8* hashval)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(hashval != NULLPTR);

    // We are byte oriented. Tail bits will always be 0.
    size_t remain_msg_byte = ctx->remain_databitlen >> 3;
    // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
    const size_t remain_msg_bit = 0;

    if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
        return LSH_ERR_INVALID_STATE;
    }

    if (remain_msg_bit){
        ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
    }
    else{
        ctx->last_block[remain_msg_byte] = 0x80;
    }
    memset(ctx->last_block + remain_msg_byte + 1, 0, LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);

    compress(ctx, ctx->last_block);

    fin(ctx);
    get_hash(ctx, hashval);

    return LSH_SUCCESS;
}
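
// Editor's note (added): finalization is plain one-zeros padding. A single
// 0x80 byte marks the end of the data, the rest of the block is zero-filled,
// and one last compress() plus fin() folds cv_r into cv_l before the digest
// is extracted by get_hash().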

ANONYMOUS_NAMESPACE_END  // Anonymous

NAMESPACE_BEGIN(CryptoPP)

extern
void LSH256_Base_Restart_SSSE3(word32* state)
{
    state[RemainingBits] = 0;
    LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh256_ssse3_init(&ctx);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_init failed");
}

extern
void LSH256_Base_Update_SSSE3(word32* state, const byte *input, size_t size)
{
    LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh256_ssse3_update(&ctx, input, 8*size);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_update failed");
}

extern
void LSH256_Base_TruncatedFinal_SSSE3(word32* state, byte *hash, size_t)
{
    LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh256_ssse3_final(&ctx, hash);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_final failed");
}

NAMESPACE_END

#endif // CRYPTOPP_SSSE3_AVAILABLE
1100
lsh512.cpp
File diff suppressed because it is too large
Load Diff
762
lsh512_avx.cpp
Normal file
@ -0,0 +1,762 @@
// lsh.cpp - written and placed in the public domain by Jeffrey Walton
// Based on the specification and source code provided by
// Korea Internet & Security Agency (KISA) website. Also
// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.

// We are hitting some sort of GCC bug in the LSH AVX2 code path.
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
// makes using zeroupper a little tricky.

#include "pch.h"
#include "config.h"

#include "lsh.h"
#include "misc.h"

#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
#endif

#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <immintrin.h>
#endif

#if defined(__GNUC__) && defined(__amd64__)
# include <x86intrin.h>
#endif

ANONYMOUS_NAMESPACE_BEGIN

/* LSH Constants */

const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256;
// const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048;
// const unsigned int LSH512_CV_BYTE_LEN = 128;
const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64;

// const unsigned int MSG_BLK_WORD_LEN = 32;
const unsigned int CV_WORD_LEN = 16;
const unsigned int CONST_WORD_LEN = 8;
const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
const unsigned int NUM_STEPS = 28;

const unsigned int ROT_EVEN_ALPHA = 23;
const unsigned int ROT_EVEN_BETA = 59;
const unsigned int ROT_ODD_ALPHA = 7;
const unsigned int ROT_ODD_BETA = 3;

const unsigned int LSH_TYPE_512_512 = 0x0010040;
const unsigned int LSH_TYPE_512_384 = 0x0010030;
const unsigned int LSH_TYPE_512_256 = 0x0010020;
const unsigned int LSH_TYPE_512_224 = 0x001001C;

// const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384;
// const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512;

/* Error Code */

const unsigned int LSH_SUCCESS = 0x0;
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;

/* Index into our state array */

const unsigned int AlgorithmType = 80;
const unsigned int RemainingBits = 81;

NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(LSH)

// lsh512.cpp
extern const word64 LSH512_IV224[CV_WORD_LEN];
extern const word64 LSH512_IV256[CV_WORD_LEN];
extern const word64 LSH512_IV384[CV_WORD_LEN];
extern const word64 LSH512_IV512[CV_WORD_LEN];
extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS];

NAMESPACE_END  // LSH
NAMESPACE_END  // Crypto++

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::rotlFixed;
using CryptoPP::rotlConstant;

using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;
using CryptoPP::ConditionalByteReverse;
using CryptoPP::LITTLE_ENDIAN_ORDER;

using CryptoPP::LSH::LSH512_IV224;
using CryptoPP::LSH::LSH512_IV256;
using CryptoPP::LSH::LSH512_IV384;
using CryptoPP::LSH::LSH512_IV512;
using CryptoPP::LSH::LSH512_StepConstants;

typedef byte lsh_u8;
typedef word32 lsh_u32;
typedef word64 lsh_u64;
typedef word32 lsh_uint;
typedef word32 lsh_err;
typedef word32 lsh_type;

struct LSH512_AVX2_Context
{
    LSH512_AVX2_Context(word64* state, word64 algType, word64& remainingBitLength) :
        cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
        last_block(reinterpret_cast<byte*>(state+48)),
        remain_databitlen(remainingBitLength),
        alg_type(static_cast<lsh_type>(algType)) {}

    lsh_u64* cv_l;  // start of our state block
    lsh_u64* cv_r;
    lsh_u64* sub_msgs;
    lsh_u8*  last_block;
    lsh_u64& remain_databitlen;
    lsh_type alg_type;
};
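
// Editor's note (added): the context is a view over the caller's word64
// state array: cv_l at words 0..7, cv_r at 8..15, the four sub-messages at
// 16..47, and the partial-block buffer starting at word 48. AlgorithmType
// and RemainingBits (indices 80 and 81) live in the same array.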

struct LSH512_AVX2_Internal
{
    LSH512_AVX2_Internal(word64* state) :
        submsg_e_l(state+16), submsg_e_r(state+24),
        submsg_o_l(state+32), submsg_o_r(state+40) { }

    lsh_u64* submsg_e_l; /* even left sub-message */
    lsh_u64* submsg_e_r; /* even right sub-message */
    lsh_u64* submsg_o_l; /* odd left sub-message */
    lsh_u64* submsg_o_r; /* odd right sub-message */
};

// Zero the upper 128 bits of all YMM registers on exit.
// It avoids AVX state transition penalties when saving state.
// GCC Issue 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735,
// makes using zeroupper a little tricky.

struct AVX_Cleanup
{
    ~AVX_Cleanup() {
        _mm256_zeroupper();
    }
};
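
// Editor's note (added): each public entry point below declares an
// AVX_Cleanup on the stack, so _mm256_zeroupper() runs on every return path
// (including the early returns) without sprinkling the intrinsic through
// the code.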

// const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };

/* LSH AlgType Macro */

inline bool LSH_IS_LSH512(lsh_uint val) {
    return (val & 0xf0000) == 0x10000;
}

inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
    return val >> 24;
}

inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
    return val & 0xffff;
}

inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
    return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
}

inline lsh_u64 loadLE64(lsh_u64 v) {
    return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
}

lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) {
    return rotlFixed(x, r);
}

// Original code relied upon unaligned lsh_u64 buffer
inline void load_msg_blk(LSH512_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN])
{
    lsh_u64* submsg_e_l = i_state->submsg_e_l;
    lsh_u64* submsg_e_r = i_state->submsg_e_r;
    lsh_u64* submsg_o_l = i_state->submsg_o_l;
    lsh_u64* submsg_o_r = i_state->submsg_o_r;

    _mm256_storeu_si256(M256_CAST(submsg_e_l+0),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+0)));
    _mm256_storeu_si256(M256_CAST(submsg_e_l+4),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+32)));

    _mm256_storeu_si256(M256_CAST(submsg_e_r+0),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+64)));
    _mm256_storeu_si256(M256_CAST(submsg_e_r+4),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+96)));

    _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+128)));
    _mm256_storeu_si256(M256_CAST(submsg_o_l+4),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+160)));

    _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+192)));
    _mm256_storeu_si256(M256_CAST(submsg_o_r+4),
        _mm256_loadu_si256(CONST_M256_CAST(msgblk+224)));
}

inline void msg_exp_even(LSH512_AVX2_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u64* submsg_e_l = i_state->submsg_e_l;
    lsh_u64* submsg_e_r = i_state->submsg_e_r;
    lsh_u64* submsg_o_l = i_state->submsg_o_l;
    lsh_u64* submsg_o_r = i_state->submsg_o_r;

    _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi64(
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
        _mm256_permute4x64_epi64(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
            _MM_SHUFFLE(1,0,2,3))));
    _mm256_storeu_si256(M256_CAST(submsg_e_l+4), _mm256_add_epi64(
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
        _mm256_permute4x64_epi64(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
            _MM_SHUFFLE(2,1,0,3))));

    _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi64(
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
        _mm256_permute4x64_epi64(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
            _MM_SHUFFLE(1,0,2,3))));
    _mm256_storeu_si256(M256_CAST(submsg_e_r+4), _mm256_add_epi64(
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
        _mm256_permute4x64_epi64(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
            _MM_SHUFFLE(2,1,0,3))));
}

inline void msg_exp_odd(LSH512_AVX2_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u64* submsg_e_l = i_state->submsg_e_l;
    lsh_u64* submsg_e_r = i_state->submsg_e_r;
    lsh_u64* submsg_o_l = i_state->submsg_o_l;
    lsh_u64* submsg_o_r = i_state->submsg_o_r;

    _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
        _mm256_add_epi64(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
            _mm256_permute4x64_epi64(
                _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
                _MM_SHUFFLE(1,0,2,3))));
    _mm256_storeu_si256(M256_CAST(submsg_o_l+4),
        _mm256_add_epi64(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
            _mm256_permute4x64_epi64(
                _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
                _MM_SHUFFLE(2,1,0,3))));

    _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
        _mm256_add_epi64(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
            _mm256_permute4x64_epi64(
                _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
                _MM_SHUFFLE(1,0,2,3))));
    _mm256_storeu_si256(M256_CAST(submsg_o_r+4),
        _mm256_add_epi64(
            _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
            _mm256_permute4x64_epi64(
                _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
                _MM_SHUFFLE(2,1,0,3))));
}

inline void load_sc(const lsh_u64** p_const_v, size_t i)
{
    *p_const_v = &LSH512_StepConstants[i];
}

inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u64* submsg_e_l = i_state->submsg_e_l;
    lsh_u64* submsg_e_r = i_state->submsg_e_r;

    _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l))));
    _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r))));

    _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4))));
    _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4))));
}

inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u64* submsg_o_l = i_state->submsg_o_l;
    lsh_u64* submsg_o_r = i_state->submsg_o_r;

    _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l))));
    _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r))));

    _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4))));
    _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4))));
}

inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
    _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi64(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
        _mm256_loadu_si256(CONST_M256_CAST(cv_r))));
    _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_add_epi64(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
        _mm256_loadu_si256(CONST_M256_CAST(cv_r+4))));
}

template <unsigned int R>
inline void rotate_blk(lsh_u64 cv[8])
{
    _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
        _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
        _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R)));
    _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256(
        _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R),
        _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R)));
}

inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8])
{
    _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
        _mm256_loadu_si256(CONST_M256_CAST(const_v))));
    _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
        _mm256_loadu_si256(CONST_M256_CAST(const_v+4))));
}

inline void rotate_msg_gamma(lsh_u64 cv_r[8])
{
    // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
    _mm256_storeu_si256(M256_CAST(cv_r+0),
        _mm256_shuffle_epi8(
            _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
            _mm256_set_epi8(
                /* hi lane */ 9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4,
                /* lo lane */ 13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0)));
    _mm256_storeu_si256(M256_CAST(cv_r+4),
        _mm256_shuffle_epi8(
            _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
            _mm256_set_epi8(
                /* hi lane */ 8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3,
                /* lo lane */ 12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7)));
}
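
// Editor's note (added): as in the 256-bit code, every g_gamma512 rotation
// is a multiple of 8 bits, so the 64-bit rotations are done as byte
// shuffles. _mm256_shuffle_epi8 permutes within each 128-bit lane, which is
// sufficient here because each mask half only rearranges bytes of its own
// lane.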

inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
    __m256i temp[2];
    _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_permute4x64_epi64(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2)));
    _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_permute4x64_epi64(
        _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2)));
    _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_permute4x64_epi64(
        _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0)));
    _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_permute4x64_epi64(
        _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0)));

    temp[0] = _mm256_loadu_si256(CONST_M256_CAST(cv_l+0));
    temp[1] = _mm256_loadu_si256(CONST_M256_CAST(cv_r+0));

    _mm256_storeu_si256(M256_CAST(cv_l+0),
        _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)));
    _mm256_storeu_si256(M256_CAST(cv_l+4),
        _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)));

    _mm256_storeu_si256(M256_CAST(cv_r+0), temp[0]);
    _mm256_storeu_si256(M256_CAST(cv_r+4), temp[1]);
}
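
// Editor's note (added): the word permutation is a per-register 4-way
// shuffle (_mm256_permute4x64_epi64) followed by a rotation of the four
// registers, cv_l+0 -> cv_r+0 -> cv_r+4 -> cv_l+4 -> cv_l+0, through two
// temporaries.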

/* -------------------------------------------------------- *
 * step function
 * -------------------------------------------------------- */

template <unsigned int Alpha, unsigned int Beta>
inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8])
{
    add_blk(cv_l, cv_r);
    rotate_blk<Alpha>(cv_l);
    xor_with_const(cv_l, const_v);
    add_blk(cv_r, cv_l);
    rotate_blk<Beta>(cv_r);
    add_blk(cv_l, cv_r);
    rotate_msg_gamma(cv_r);
}

/* -------------------------------------------------------- *
 * compression function
 * -------------------------------------------------------- */

inline void compress(LSH512_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN])
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    LSH512_AVX2_Internal s_state(ctx->cv_l);
    LSH512_AVX2_Internal* i_state = &s_state;

    const lsh_u64* const_v = NULL;
    lsh_u64* cv_l = ctx->cv_l;
    lsh_u64* cv_r = ctx->cv_r;

    load_msg_blk(i_state, pdMsgBlk);

    msg_add_even(cv_l, cv_r, i_state);
    load_sc(&const_v, 0);
    mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
    word_perm(cv_l, cv_r);

    msg_add_odd(cv_l, cv_r, i_state);
    load_sc(&const_v, 8);
    mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
    word_perm(cv_l, cv_r);

    for (size_t i = 1; i < NUM_STEPS / 2; i++)
    {
        msg_exp_even(i_state);
        msg_add_even(cv_l, cv_r, i_state);
        load_sc(&const_v, 16 * i);
        mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);

        msg_exp_odd(i_state);
        msg_add_odd(cv_l, cv_r, i_state);
        load_sc(&const_v, 16 * i + 8);
        mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);
    }

    msg_exp_even(i_state);
    msg_add_even(cv_l, cv_r, i_state);
}

/* -------------------------------------------------------- */

inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16])
{
    // The IV's are 32-byte aligned so we can use aligned loads.
    _mm256_storeu_si256(M256_CAST(cv_l+0),
        _mm256_load_si256(CONST_M256_CAST(iv+0)));
    _mm256_storeu_si256(M256_CAST(cv_l+4),
        _mm256_load_si256(CONST_M256_CAST(iv+4)));

    _mm256_storeu_si256(M256_CAST(cv_r+0),
        _mm256_load_si256(CONST_M256_CAST(iv+8)));
    _mm256_storeu_si256(M256_CAST(cv_r+4),
        _mm256_load_si256(CONST_M256_CAST(iv+12)));
}

inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
    _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256());
    _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_setzero_si256());
    _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256());
    _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_setzero_si256());
}

inline void zero_submsgs(LSH512_AVX2_Context* ctx)
{
    lsh_u64* sub_msgs = ctx->sub_msgs;

    _mm256_storeu_si256(M256_CAST(sub_msgs+ 0),
        _mm256_setzero_si256());
    _mm256_storeu_si256(M256_CAST(sub_msgs+ 4),
        _mm256_setzero_si256());

    _mm256_storeu_si256(M256_CAST(sub_msgs+ 8),
        _mm256_setzero_si256());
    _mm256_storeu_si256(M256_CAST(sub_msgs+12),
        _mm256_setzero_si256());
}

inline void init224(LSH512_AVX2_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224);
}

inline void init256(LSH512_AVX2_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256);
}

inline void init384(LSH512_AVX2_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384);
}

inline void init512(LSH512_AVX2_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512);
}

/* -------------------------------------------------------- */

inline void fin(LSH512_AVX2_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)),
        _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0))));

    _mm256_storeu_si256(M256_CAST(ctx->cv_l+4), _mm256_xor_si256(
        _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+4)),
        _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+4))));
}

/* -------------------------------------------------------- */

inline void get_hash(LSH512_AVX2_Context* ctx, lsh_u8* pbHashVal)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);
    CRYPTOPP_ASSERT(pbHashVal != NULLPTR);

    lsh_uint alg_type = ctx->alg_type;
    lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
    lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);

    // Multiplying by sizeof(lsh_u8) looks odd...
    memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
    if (hash_val_bit_len){
        pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
    }
}

/* -------------------------------------------------------- */

lsh_err lsh512_init_avx2(LSH512_AVX2_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);

    lsh_u32 alg_type = ctx->alg_type;
    const lsh_u64* const_v = NULL;
    ctx->remain_databitlen = 0;

    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
    AVX_Cleanup cleanup;

    switch (alg_type){
    case LSH_TYPE_512_512:
        init512(ctx);
        return LSH_SUCCESS;
    case LSH_TYPE_512_384:
        init384(ctx);
        return LSH_SUCCESS;
    case LSH_TYPE_512_256:
        init256(ctx);
        return LSH_SUCCESS;
    case LSH_TYPE_512_224:
        init224(ctx);
        return LSH_SUCCESS;
    default:
        break;
    }

    lsh_u64* cv_l = ctx->cv_l;
    lsh_u64* cv_r = ctx->cv_r;

    zero_iv(cv_l, cv_r);
    cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN;
    cv_l[1] = LSH_GET_HASHBIT(alg_type);

    for (size_t i = 0; i < NUM_STEPS / 2; i++)
    {
        // Mix
        load_sc(&const_v, i * 16);
        mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);

        load_sc(&const_v, i * 16 + 8);
        mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);
    }

    return LSH_SUCCESS;
}
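
// Editor's note (added): for the four standard digest sizes init falls back
// to a precomputed IV. Any other hash length derives its IV on the fly:
// zero the chaining variable, seed cv_l[0..1] with the maximum digest byte
// length and the requested bit length, then run the step-function schedule
// once over the empty state.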

lsh_err lsh512_update_avx2(LSH512_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(data != NULLPTR);
    CRYPTOPP_ASSERT(databitlen % 8 == 0);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);

    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
    AVX_Cleanup cleanup;

    if (databitlen == 0){
        return LSH_SUCCESS;
    }

    // We are byte oriented. Tail bits will always be 0.
    size_t databytelen = databitlen >> 3;
    // lsh_uint pos2 = databitlen & 0x7;
    const size_t pos2 = 0;

    size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
    // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
    const size_t remain_msg_bit = 0;

    if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
        return LSH_ERR_INVALID_STATE;
    }
    if (remain_msg_bit > 0){
        return LSH_ERR_INVALID_DATABITLEN;
    }

    if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){
        memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
        ctx->remain_databitlen += (lsh_uint)databitlen;
        remain_msg_byte += (lsh_uint)databytelen;
        if (pos2){
            ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
        }
        return LSH_SUCCESS;
    }

    if (remain_msg_byte > 0){
        size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte;
        memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
        compress(ctx, ctx->last_block);
        data += more_byte;
        databytelen -= more_byte;
        remain_msg_byte = 0;
        ctx->remain_databitlen = 0;
    }

    while (databytelen >= LSH512_MSG_BLK_BYTE_LEN)
    {
        // This call to compress caused some trouble.
        // The data pointer can become unaligned in the
        // previous block.
        compress(ctx, data);
        data += LSH512_MSG_BLK_BYTE_LEN;
        databytelen -= LSH512_MSG_BLK_BYTE_LEN;
    }

    if (databytelen > 0){
        memcpy(ctx->last_block, data, databytelen);
        ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
    }

    if (pos2){
        ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
        ctx->remain_databitlen += pos2;
    }
    return LSH_SUCCESS;
}

lsh_err lsh512_final_avx2(LSH512_AVX2_Context* ctx, lsh_u8* hashval)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(hashval != NULLPTR);

    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
    AVX_Cleanup cleanup;

    // We are byte oriented. Tail bits will always be 0.
    size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
    // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
    const size_t remain_msg_bit = 0;

    if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
        return LSH_ERR_INVALID_STATE;
    }

    if (remain_msg_bit){
        ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
    }
    else{
        ctx->last_block[remain_msg_byte] = 0x80;
    }
    memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);

    compress(ctx, ctx->last_block);

    fin(ctx);
    get_hash(ctx, hashval);

    return LSH_SUCCESS;
}

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

extern
void LSH512_Base_Restart_AVX2(word64* state)
{
    state[RemainingBits] = 0;
    LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh512_init_avx2(&ctx);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_avx2 failed");
}

extern
void LSH512_Base_Update_AVX2(word64* state, const byte *input, size_t size)
{
    LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh512_update_avx2(&ctx, input, 8*size);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_avx2 failed");
}

extern
void LSH512_Base_TruncatedFinal_AVX2(word64* state, byte *hash, size_t)
{
    LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh512_final_avx2(&ctx, hash);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_avx2 failed");
}

NAMESPACE_END

#endif // CRYPTOPP_AVX2_AVAILABLE
937
lsh512_sse.cpp
Normal file
@ -0,0 +1,937 @@
// lsh.cpp - written and placed in the public domain by Jeffrey Walton
// Based on the specification and source code provided by
// Korea Internet & Security Agency (KISA) website. Also
// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.

// We are hitting some sort of GCC bug in the LSH AVX2 code path.
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
// makes using zeroupper a little tricky.

#include "pch.h"
#include "config.h"

#include "lsh.h"
#include "misc.h"

#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(CRYPTOPP_XOP_AVAILABLE)
# include <ammintrin.h>
#endif

#if defined(__GNUC__) && defined(__amd64__)
# include <x86intrin.h>
#endif

ANONYMOUS_NAMESPACE_BEGIN

/* LSH Constants */

const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256;
// const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048;
// const unsigned int LSH512_CV_BYTE_LEN = 128;
const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64;

// const unsigned int MSG_BLK_WORD_LEN = 32;
const unsigned int CV_WORD_LEN = 16;
const unsigned int CONST_WORD_LEN = 8;
const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
const unsigned int NUM_STEPS = 28;

const unsigned int ROT_EVEN_ALPHA = 23;
const unsigned int ROT_EVEN_BETA = 59;
const unsigned int ROT_ODD_ALPHA = 7;
const unsigned int ROT_ODD_BETA = 3;

const unsigned int LSH_TYPE_512_512 = 0x0010040;
const unsigned int LSH_TYPE_512_384 = 0x0010030;
const unsigned int LSH_TYPE_512_256 = 0x0010020;
const unsigned int LSH_TYPE_512_224 = 0x001001C;

// const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384;
// const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512;

/* Error Code */

const unsigned int LSH_SUCCESS = 0x0;
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;

/* Index into our state array */

const unsigned int AlgorithmType = 80;
const unsigned int RemainingBits = 81;

NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(LSH)

// lsh512.cpp
extern const word64 LSH512_IV224[CV_WORD_LEN];
extern const word64 LSH512_IV256[CV_WORD_LEN];
extern const word64 LSH512_IV384[CV_WORD_LEN];
extern const word64 LSH512_IV512[CV_WORD_LEN];
extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS];

NAMESPACE_END  // LSH
NAMESPACE_END  // Crypto++

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::rotlFixed;
using CryptoPP::rotlConstant;

using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;
using CryptoPP::ConditionalByteReverse;
using CryptoPP::LITTLE_ENDIAN_ORDER;

using CryptoPP::LSH::LSH512_IV224;
using CryptoPP::LSH::LSH512_IV256;
using CryptoPP::LSH::LSH512_IV384;
using CryptoPP::LSH::LSH512_IV512;
using CryptoPP::LSH::LSH512_StepConstants;

typedef byte lsh_u8;
typedef word32 lsh_u32;
typedef word64 lsh_u64;
typedef word32 lsh_uint;
typedef word32 lsh_err;
typedef word32 lsh_type;

struct LSH512_SSSE3_Context
{
    LSH512_SSSE3_Context(word64* state, word64 algType, word64& remainingBitLength) :
        cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
        last_block(reinterpret_cast<byte*>(state+48)),
        remain_databitlen(remainingBitLength),
        alg_type(static_cast<lsh_type>(algType)) {}

    lsh_u64* cv_l;  // start of our state block
    lsh_u64* cv_r;
    lsh_u64* sub_msgs;
    lsh_u8*  last_block;
    lsh_u64& remain_databitlen;
    lsh_type alg_type;
};

struct LSH512_SSSE3_Internal
{
    LSH512_SSSE3_Internal(word64* state) :
        submsg_e_l(state+16), submsg_e_r(state+24),
        submsg_o_l(state+32), submsg_o_r(state+40) { }

    lsh_u64* submsg_e_l; /* even left sub-message */
    lsh_u64* submsg_e_r; /* even right sub-message */
    lsh_u64* submsg_o_l; /* odd left sub-message */
    lsh_u64* submsg_o_r; /* odd right sub-message */
};

const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };

/* LSH AlgType Macro */

inline bool LSH_IS_LSH512(lsh_uint val) {
    return (val & 0xf0000) == 0x10000;
}

inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
    return val >> 24;
}

inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
    return val & 0xffff;
}

inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
    return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
}

inline lsh_u64 loadLE64(lsh_u64 v) {
    return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
}

lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) {
    return rotlFixed(x, r);
}

// Original code relied upon unaligned lsh_u64 buffer
inline void load_msg_blk(LSH512_SSSE3_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN])
{
    lsh_u64* submsg_e_l = i_state->submsg_e_l;
    lsh_u64* submsg_e_r = i_state->submsg_e_r;
    lsh_u64* submsg_o_l = i_state->submsg_o_l;
    lsh_u64* submsg_o_r = i_state->submsg_o_r;

    _mm_storeu_si128(M128_CAST(submsg_e_l+0),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+0)));
    _mm_storeu_si128(M128_CAST(submsg_e_l+2),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+16)));
    _mm_storeu_si128(M128_CAST(submsg_e_l+4),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+32)));
    _mm_storeu_si128(M128_CAST(submsg_e_l+6),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+48)));

    _mm_storeu_si128(M128_CAST(submsg_e_r+0),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+64)));
    _mm_storeu_si128(M128_CAST(submsg_e_r+2),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+80)));
    _mm_storeu_si128(M128_CAST(submsg_e_r+4),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+96)));
    _mm_storeu_si128(M128_CAST(submsg_e_r+6),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+112)));

    _mm_storeu_si128(M128_CAST(submsg_o_l+0),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+128)));
    _mm_storeu_si128(M128_CAST(submsg_o_l+2),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+144)));
    _mm_storeu_si128(M128_CAST(submsg_o_l+4),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+160)));
    _mm_storeu_si128(M128_CAST(submsg_o_l+6),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+176)));

    _mm_storeu_si128(M128_CAST(submsg_o_r+0),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+192)));
    _mm_storeu_si128(M128_CAST(submsg_o_r+2),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+208)));
    _mm_storeu_si128(M128_CAST(submsg_o_r+4),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+224)));
    _mm_storeu_si128(M128_CAST(submsg_o_r+6),
        _mm_loadu_si128(CONST_M128_CAST(msgblk+240)));
}

inline void msg_exp_even(LSH512_SSSE3_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u64* submsg_e_l = i_state->submsg_e_l;
    lsh_u64* submsg_e_r = i_state->submsg_e_r;
    lsh_u64* submsg_o_l = i_state->submsg_o_l;
    lsh_u64* submsg_o_r = i_state->submsg_o_r;

    __m128i temp;
    _mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_shuffle_epi32(
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)), _MM_SHUFFLE(1,0,3,2)));

    temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0));
    _mm_storeu_si128(M128_CAST(submsg_e_l+0),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)));
    _mm_storeu_si128(M128_CAST(submsg_e_l+2), temp);
    _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_shuffle_epi32(
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), _MM_SHUFFLE(1,0,3,2)));

    temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4));
    _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_unpacklo_epi64(
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4))));
    _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_unpackhi_epi64(
        temp, _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6))));
    _mm_storeu_si128(M128_CAST(submsg_e_r+2), _mm_shuffle_epi32(
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)), _MM_SHUFFLE(1,0,3,2)));

    temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0));
    _mm_storeu_si128(M128_CAST(submsg_e_r+0),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)));
    _mm_storeu_si128(M128_CAST(submsg_e_r+2), temp);
    _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_shuffle_epi32(
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), _MM_SHUFFLE(1,0,3,2)));

    temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4));
    _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_unpacklo_epi64(
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4))));
    _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_unpackhi_epi64(
        temp, _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6))));

    _mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0))));
    _mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2))));
    _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4))));
    _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6))));

    _mm_storeu_si128(M128_CAST(submsg_e_r+0), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0))));
    _mm_storeu_si128(M128_CAST(submsg_e_r+2), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2))));
    _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4))));
    _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6))));
}
|
||||
|
||||
inline void msg_exp_odd(LSH512_SSSE3_Internal* i_state)
|
||||
{
|
||||
CRYPTOPP_ASSERT(i_state != NULLPTR);
|
||||
|
||||
lsh_u64* submsg_e_l = i_state->submsg_e_l;
|
||||
lsh_u64* submsg_e_r = i_state->submsg_e_r;
|
||||
lsh_u64* submsg_o_l = i_state->submsg_o_l;
|
||||
lsh_u64* submsg_o_r = i_state->submsg_o_r;
|
||||
|
||||
__m128i temp;
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_shuffle_epi32(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)), _MM_SHUFFLE(1,0,3,2)));
|
||||
|
||||
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_l+0),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_l+2), temp);
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_shuffle_epi32(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), _MM_SHUFFLE(1,0,3,2)));
|
||||
|
||||
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_unpacklo_epi64(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4))));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_unpackhi_epi64(
|
||||
temp, _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6))));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_r+2), _mm_shuffle_epi32(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)), _MM_SHUFFLE(1,0,3,2)));
|
||||
|
||||
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_r+0),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_r+2), temp);
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_shuffle_epi32(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), _MM_SHUFFLE(1,0,3,2)));
|
||||
|
||||
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_unpacklo_epi64(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4))));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_unpackhi_epi64(
|
||||
temp, _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6))));
|
||||
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi64(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0))));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_add_epi64(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2))));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_add_epi64(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4))));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_add_epi64(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6))));
|
||||
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_r+0), _mm_add_epi64(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0))));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_r+2), _mm_add_epi64(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2))));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_add_epi64(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4))));
|
||||
_mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_add_epi64(
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6))));
|
||||
}
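
// The expansion routines above implement LSH's message expansion: the
// submessage of one parity is word-permuted in place (the shuffle/unpack
// sequence) and the other parity's submessage is then added to it. A
// scalar sketch of msg_exp_odd, where tau is the word permutation from
// the LSH specification as realized by the shuffles above:
//
//   lsh_u64 t[8];
//   for (size_t i = 0; i < 8; ++i)
//       t[i] = submsg_o_l[tau[i]];
//   for (size_t i = 0; i < 8; ++i)
//       submsg_o_l[i] = submsg_e_l[i] + t[i];   // modulo 2^64
//
// and likewise for the right halves.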

inline void load_sc(const lsh_u64** p_const_v, size_t i)
{
    *p_const_v = &LSH512_StepConstants[i];
}

inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_SSSE3_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u64* submsg_e_l = i_state->submsg_e_l;
    lsh_u64* submsg_e_r = i_state->submsg_e_r;

    _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l))));
    _mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_r)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r))));
    _mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2))));
    _mm_storeu_si128(M128_CAST(cv_r+2), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+2)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2))));
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4))));
    _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4))));
    _mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6))));
    _mm_storeu_si128(M128_CAST(cv_r+6), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+6)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6))));
}
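
// msg_add_even XORs the even submessage into the chaining variables,
// two 64-bit words per 128-bit lane; msg_add_odd below does the same
// with the odd submessage. Scalar equivalent:
//
//   for (size_t i = 0; i < 8; ++i) {
//       cv_l[i] ^= submsg_e_l[i];
//       cv_r[i] ^= submsg_e_r[i];
//   }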

inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_SSSE3_Internal* i_state)
{
    CRYPTOPP_ASSERT(i_state != NULLPTR);

    lsh_u64* submsg_o_l = i_state->submsg_o_l;
    lsh_u64* submsg_o_r = i_state->submsg_o_r;

    _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_l))));
    _mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_r)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_r))));
    _mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2))));
    _mm_storeu_si128(M128_CAST(cv_r+2), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+2)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2))));
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4))));
    _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4))));
    _mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6))));
    _mm_storeu_si128(M128_CAST(cv_r+6), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+6)),
        _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6))));
}

inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
    _mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(cv_l)),
        _mm_loadu_si128(CONST_M128_CAST(cv_r))));
    _mm_storeu_si128(M128_CAST(cv_l+2), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+2))));
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+4))));
    _mm_storeu_si128(M128_CAST(cv_l+6), _mm_add_epi64(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+6))));
}
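
// add_blk is a word-wise addition modulo 2^64:
//
//   for (size_t i = 0; i < 8; ++i)
//       cv_l[i] += cv_r[i];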

template <unsigned int R>
inline void rotate_blk(lsh_u64 cv[8])
{
#if defined(CRYPTOPP_XOP_AVAILABLE)
    _mm_storeu_si128(M128_CAST(cv),
        _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R));
    _mm_storeu_si128(M128_CAST(cv+2),
        _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), R));
    _mm_storeu_si128(M128_CAST(cv+4),
        _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R));
    _mm_storeu_si128(M128_CAST(cv+6),
        _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R));

#else
    _mm_storeu_si128(M128_CAST(cv), _mm_or_si128(
        _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R),
        _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), 64-R)));
    _mm_storeu_si128(M128_CAST(cv+2), _mm_or_si128(
        _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), R),
        _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), 64-R)));
    _mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128(
        _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R),
        _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), 64-R)));
    _mm_storeu_si128(M128_CAST(cv+6), _mm_or_si128(
        _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R),
        _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), 64-R)));
#endif
}
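
// rotate_blk rotates each 64-bit word left by R. XOP provides a native
// rotate (_mm_roti_epi64); the generic SSE path composes the rotate from
// two shifts and an OR, which assumes 0 < R < 64:
//
//   for (size_t i = 0; i < 8; ++i)
//       cv[i] = (cv[i] << R) | (cv[i] >> (64-R));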

inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8])
{
    _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l)),
        _mm_loadu_si128(CONST_M128_CAST(const_v))));
    _mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
        _mm_loadu_si128(CONST_M128_CAST(const_v+2))));
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
        _mm_loadu_si128(CONST_M128_CAST(const_v+4))));
    _mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
        _mm_loadu_si128(CONST_M128_CAST(const_v+6))));
}

inline void rotate_msg_gamma(lsh_u64 cv_r[8])
{
    // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
    _mm_storeu_si128(M128_CAST(cv_r+0),
        _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
            _mm_set_epi8(13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0)));
    _mm_storeu_si128(M128_CAST(cv_r+2),
        _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+2)),
            _mm_set_epi8(9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4)));

    _mm_storeu_si128(M128_CAST(cv_r+4),
        _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
            _mm_set_epi8(12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7)));
    _mm_storeu_si128(M128_CAST(cv_r+6),
        _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+6)),
            _mm_set_epi8(8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3)));
}
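
// Every gamma rotation above is a multiple of 8 bits (see the table in
// the comment), so each 64-bit rotate can be done as a byte permutation.
// Each _mm_set_epi8 mask encodes an independent byte rotation for the
// two 64-bit halves of a lane, so a single pshufb applies two gamma
// values at once.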

inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
    __m128i temp[2];
    temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
    _mm_storeu_si128(M128_CAST(cv_l+0), _mm_unpacklo_epi64(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
        _mm_loadu_si128(CONST_M128_CAST(cv_l+0))));
    _mm_storeu_si128(M128_CAST(cv_l+2), _mm_unpackhi_epi64(
        temp[0], _mm_loadu_si128(CONST_M128_CAST(cv_l+2))));

    temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+4));
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_unpacklo_epi64(
        _mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4))));
    _mm_storeu_si128(M128_CAST(cv_l+6), _mm_unpackhi_epi64(
        temp[0], _mm_loadu_si128(CONST_M128_CAST(cv_l+6))));
    _mm_storeu_si128(M128_CAST(cv_r+2), _mm_shuffle_epi32(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), _MM_SHUFFLE(1,0,3,2)));

    temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_r+0));
    _mm_storeu_si128(M128_CAST(cv_r+0), _mm_unpacklo_epi64(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+2))));
    _mm_storeu_si128(M128_CAST(cv_r+2), _mm_unpackhi_epi64(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), temp[0]));
    _mm_storeu_si128(M128_CAST(cv_r+6), _mm_shuffle_epi32(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+6)), _MM_SHUFFLE(1,0,3,2)));

    temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_r+4));
    _mm_storeu_si128(M128_CAST(cv_r+4), _mm_unpacklo_epi64(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+6))));
    _mm_storeu_si128(M128_CAST(cv_r+6), _mm_unpackhi_epi64(
        _mm_loadu_si128(CONST_M128_CAST(cv_r+6)), temp[0]));

    temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
    temp[1] = _mm_loadu_si128(CONST_M128_CAST(cv_l+2));

    _mm_storeu_si128(M128_CAST(cv_l+0),
        _mm_loadu_si128(CONST_M128_CAST(cv_l+4)));
    _mm_storeu_si128(M128_CAST(cv_l+2),
        _mm_loadu_si128(CONST_M128_CAST(cv_l+6)));
    _mm_storeu_si128(M128_CAST(cv_l+4),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+4)));
    _mm_storeu_si128(M128_CAST(cv_l+6),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+6)));
    _mm_storeu_si128(M128_CAST(cv_r+4),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+0)));
    _mm_storeu_si128(M128_CAST(cv_r+6),
        _mm_loadu_si128(CONST_M128_CAST(cv_r+2)));

    _mm_storeu_si128(M128_CAST(cv_r+0), temp[0]);
    _mm_storeu_si128(M128_CAST(cv_r+2), temp[1]);
}
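
// word_perm applies the word permutation from the LSH specification to
// the sixteen chaining-variable words: the unpack/shuffle sequence
// permutes words within and across adjacent 128-bit lanes, and the final
// block of moves rotates the four-word groups among cv_l and cv_r
// through temp[].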

/* -------------------------------------------------------- *
 * step function
 * -------------------------------------------------------- */

template <unsigned int Alpha, unsigned int Beta>
inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8])
{
    add_blk(cv_l, cv_r);
    rotate_blk<Alpha>(cv_l);
    xor_with_const(cv_l, const_v);
    add_blk(cv_r, cv_l);
    rotate_blk<Beta>(cv_r);
    add_blk(cv_l, cv_r);
    rotate_msg_gamma(cv_r);
}
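
// One full step on the chaining variables, in scalar form (const_v is
// the step constant loaded by load_sc, gamma the per-word rotation):
//
//   cv_l[i] += cv_r[i];
//   cv_l[i]  = rotl64(cv_l[i], Alpha);
//   cv_l[i] ^= const_v[i];
//   cv_r[i] += cv_l[i];
//   cv_r[i]  = rotl64(cv_r[i], Beta);
//   cv_l[i] += cv_r[i];
//   cv_r[i]  = rotl64(cv_r[i], gamma[i]);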

/* -------------------------------------------------------- *
 * compression function
 * -------------------------------------------------------- */

inline void compress(LSH512_SSSE3_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN])
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    LSH512_SSSE3_Internal s_state(ctx->cv_l);
    LSH512_SSSE3_Internal* i_state = &s_state;

    const lsh_u64* const_v = NULL;
    lsh_u64 *cv_l = ctx->cv_l;
    lsh_u64 *cv_r = ctx->cv_r;

    load_msg_blk(i_state, pdMsgBlk);

    msg_add_even(cv_l, cv_r, i_state);
    load_sc(&const_v, 0);
    mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
    word_perm(cv_l, cv_r);

    msg_add_odd(cv_l, cv_r, i_state);
    load_sc(&const_v, 8);
    mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
    word_perm(cv_l, cv_r);

    for (size_t i = 1; i < NUM_STEPS / 2; i++)
    {
        msg_exp_even(i_state);
        msg_add_even(cv_l, cv_r, i_state);
        load_sc(&const_v, 16 * i);
        mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);

        msg_exp_odd(i_state);
        msg_add_odd(cv_l, cv_r, i_state);
        load_sc(&const_v, 16 * i + 8);
        mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);
    }

    msg_exp_even(i_state);
    msg_add_even(cv_l, cv_r, i_state);
}
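
// compress performs NUM_STEPS steps over one message block: two unrolled
// steps on the raw submessages, then (NUM_STEPS/2 - 1) even/odd pairs
// that expand the submessages before each step, and a closing message
// expansion and addition. The step constants advance 8 words per step,
// hence the 16*i and 16*i + 8 offsets.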

/* -------------------------------------------------------- */

inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16])
{
    // The IVs are 32-byte aligned, so aligned loads are safe.
    _mm_storeu_si128(M128_CAST(cv_l+0),
        _mm_load_si128(CONST_M128_CAST(iv+0)));
    _mm_storeu_si128(M128_CAST(cv_l+2),
        _mm_load_si128(CONST_M128_CAST(iv+2)));
    _mm_storeu_si128(M128_CAST(cv_l+4),
        _mm_load_si128(CONST_M128_CAST(iv+4)));
    _mm_storeu_si128(M128_CAST(cv_l+6),
        _mm_load_si128(CONST_M128_CAST(iv+6)));
    _mm_storeu_si128(M128_CAST(cv_r+0),
        _mm_load_si128(CONST_M128_CAST(iv+8)));
    _mm_storeu_si128(M128_CAST(cv_r+2),
        _mm_load_si128(CONST_M128_CAST(iv+10)));
    _mm_storeu_si128(M128_CAST(cv_r+4),
        _mm_load_si128(CONST_M128_CAST(iv+12)));
    _mm_storeu_si128(M128_CAST(cv_r+6),
        _mm_load_si128(CONST_M128_CAST(iv+14)));
}

inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
    _mm_storeu_si128(M128_CAST(cv_l+0), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(cv_l+2), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(cv_l+4), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(cv_l+6), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(cv_r+0), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(cv_r+2), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(cv_r+4), _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(cv_r+6), _mm_setzero_si128());
}

inline void zero_submsgs(LSH512_SSSE3_Context* ctx)
{
    lsh_u64* sub_msgs = ctx->sub_msgs;

    _mm_storeu_si128(M128_CAST(sub_msgs+ 0),
        _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+ 2),
        _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+ 4),
        _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+ 6),
        _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+ 8),
        _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+10),
        _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+12),
        _mm_setzero_si128());
    _mm_storeu_si128(M128_CAST(sub_msgs+14),
        _mm_setzero_si128());
}

inline void init224(LSH512_SSSE3_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224);
}

inline void init256(LSH512_SSSE3_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256);
}

inline void init384(LSH512_SSSE3_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384);
}

inline void init512(LSH512_SSSE3_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    zero_submsgs(ctx);
    load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512);
}
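
// init224/init256/init384/init512 load precomputed IVs for the four
// standard digest sizes. Any other alg_type falls through to the generic
// IV derivation in lsh512_init_ssse3 below.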

/* -------------------------------------------------------- */

inline void fin(LSH512_SSSE3_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);

    _mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)),
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0))));
    _mm_storeu_si128(M128_CAST(ctx->cv_l+2), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+2)),
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+2))));
    _mm_storeu_si128(M128_CAST(ctx->cv_l+4), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+4)),
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+4))));
    _mm_storeu_si128(M128_CAST(ctx->cv_l+6), _mm_xor_si128(
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+6)),
        _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+6))));
}

/* -------------------------------------------------------- */

inline void get_hash(LSH512_SSSE3_Context* ctx, lsh_u8* pbHashVal)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);
    CRYPTOPP_ASSERT(pbHashVal != NULLPTR);

    lsh_uint alg_type = ctx->alg_type;
    lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
    lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);

    // The reference code multiplies the length by sizeof(lsh_u8),
    // which is 1, so a plain byte count suffices here.
    memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
    if (hash_val_bit_len){
        pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
    }
}
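
// get_hash truncates the 64-byte cv_l to the requested digest length.
// When the digest is not a whole number of bytes, the final byte is
// masked so that only its leading bits survive.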

/* -------------------------------------------------------- */

lsh_err lsh512_init_ssse3(LSH512_SSSE3_Context* ctx)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);

    lsh_u32 alg_type = ctx->alg_type;
    const lsh_u64* const_v = NULL;
    ctx->remain_databitlen = 0;

    switch (alg_type){
    case LSH_TYPE_512_512:
        init512(ctx);
        return LSH_SUCCESS;
    case LSH_TYPE_512_384:
        init384(ctx);
        return LSH_SUCCESS;
    case LSH_TYPE_512_256:
        init256(ctx);
        return LSH_SUCCESS;
    case LSH_TYPE_512_224:
        init224(ctx);
        return LSH_SUCCESS;
    default:
        break;
    }

    lsh_u64* cv_l = ctx->cv_l;
    lsh_u64* cv_r = ctx->cv_r;

    zero_iv(cv_l, cv_r);
    cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN;
    cv_l[1] = LSH_GET_HASHBIT(alg_type);

    for (size_t i = 0; i < NUM_STEPS / 2; i++)
    {
        // Mix
        load_sc(&const_v, i * 16);
        mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);

        load_sc(&const_v, i * 16 + 8);
        mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
        word_perm(cv_l, cv_r);
    }

    return LSH_SUCCESS;
}
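
// For the four standard types the IV is a table lookup. For any other
// alg_type the IV is derived on the fly: zero the state, encode the
// maximum digest length and the requested hash bit length into
// cv_l[0..1], then run the step function over the empty message.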

lsh_err lsh512_update_ssse3(LSH512_SSSE3_Context* ctx, const lsh_u8* data, size_t databitlen)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(data != NULLPTR);
    CRYPTOPP_ASSERT(databitlen % 8 == 0);
    CRYPTOPP_ASSERT(ctx->alg_type != 0);

    if (databitlen == 0){
        return LSH_SUCCESS;
    }

    // We are byte oriented. Tail bits will always be 0.
    size_t databytelen = databitlen >> 3;
    // lsh_uint pos2 = databitlen & 0x7;
    const size_t pos2 = 0;

    size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
    // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
    const size_t remain_msg_bit = 0;

    if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
        return LSH_ERR_INVALID_STATE;
    }
    if (remain_msg_bit > 0){
        return LSH_ERR_INVALID_DATABITLEN;
    }

    if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){
        memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
        ctx->remain_databitlen += (lsh_uint)databitlen;
        remain_msg_byte += (lsh_uint)databytelen;
        if (pos2){
            ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
        }
        return LSH_SUCCESS;
    }

    if (remain_msg_byte > 0){
        size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte;
        memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
        compress(ctx, ctx->last_block);
        data += more_byte;
        databytelen -= more_byte;
        remain_msg_byte = 0;
        ctx->remain_databitlen = 0;
    }

    while (databytelen >= LSH512_MSG_BLK_BYTE_LEN)
    {
        // This call to compress() once caused trouble: the data
        // pointer can become unaligned after the partial block is
        // consumed above, so compress() must use unaligned loads.
        compress(ctx, data);
        data += LSH512_MSG_BLK_BYTE_LEN;
        databytelen -= LSH512_MSG_BLK_BYTE_LEN;
    }

    if (databytelen > 0){
        memcpy(ctx->last_block, data, databytelen);
        ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
    }

    if (pos2){
        ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
        ctx->remain_databitlen += pos2;
    }
    return LSH_SUCCESS;
}
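
// Update is plain block buffering: top up and compress last_block if a
// partial block was pending, compress full blocks directly from the
// caller's buffer, then stash any tail bytes back in last_block. Since
// this implementation is byte oriented, pos2 and remain_msg_bit are
// always 0, and the bit-level branches are dead code retained from the
// reference implementation.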

lsh_err lsh512_final_ssse3(LSH512_SSSE3_Context* ctx, lsh_u8* hashval)
{
    CRYPTOPP_ASSERT(ctx != NULLPTR);
    CRYPTOPP_ASSERT(hashval != NULLPTR);

    // We are byte oriented. Tail bits will always be 0.
    size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
    // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
    const size_t remain_msg_bit = 0;

    if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
        return LSH_ERR_INVALID_STATE;
    }

    if (remain_msg_bit){
        ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
    }
    else{
        ctx->last_block[remain_msg_byte] = 0x80;
    }
    memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);

    compress(ctx, ctx->last_block);

    fin(ctx);
    get_hash(ctx, hashval);

    return LSH_SUCCESS;
}
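
// Finalization appends the 0x80 end-of-message marker, zero-pads the
// rest of the block, compresses it, folds cv_r into cv_l (fin), and
// truncates to the digest length (get_hash).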

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

extern
void LSH512_Base_Restart_SSSE3(word64* state)
{
    state[RemainingBits] = 0;
    LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh512_init_ssse3(&ctx);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_ssse3 failed");
}

extern
void LSH512_Base_Update_SSSE3(word64* state, const byte *input, size_t size)
{
    LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh512_update_ssse3(&ctx, input, 8*size);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_ssse3 failed");
}

extern
void LSH512_Base_TruncatedFinal_SSSE3(word64* state, byte *hash, size_t)
{
    LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
    lsh_err err = lsh512_final_ssse3(&ctx, hash);

    if (err != LSH_SUCCESS)
        throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_ssse3 failed");
}
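
// The three entry points above adapt LSH512_Base's flat word64 state to
// an LSH512_SSSE3_Context and surface any lsh_err as a CryptoPP
// Exception. Restart clears RemainingBits before constructing the
// context because lsh512_init_ssse3 re-derives the IV from scratch.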

NAMESPACE_END

#endif  // CRYPTOPP_SSSE3_AVAILABLE