diff --git a/Filelist.txt b/Filelist.txt index f916315f..b0f36525 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -205,7 +205,11 @@ lea.cpp lea_simd.cpp lea.h lsh256.cpp +lsh256_sse.cpp +lsh256_avx.cpp lsh512.cpp +lsh512_sse.cpp +lsh512_avx.cpp lsh.h luc.cpp luc.h diff --git a/GNUmakefile b/GNUmakefile index eee39c15..3491ef4d 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -122,7 +122,9 @@ else ifeq ($(findstring clean,$(MAKECMDGOALS)),clean) DETECT_FEATURES := 0 else ifeq ($(findstring distclean,$(MAKECMDGOALS)),distclean) DETECT_FEATURES := 0 -else ifeq ($(findstring distclean,$(MAKECMDGOALS)),trim) +else ifeq ($(findstring trim,$(MAKECMDGOALS)),trim) + DETECT_FEATURES := 0 +else ifeq ($(findstring zip,$(MAKECMDGOALS)),zip) DETECT_FEATURES := 0 endif @@ -230,7 +232,7 @@ endif # IS_MINGW # Newlib needs _XOPEN_SOURCE=600 for signals TPROG = TestPrograms/test_newlib.cpp -HAVE_OPT = $(shell $(CXX) $(TCXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | wc -w) +HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) ifeq ($(strip $(HAVE_OPT)),0) ifeq ($(findstring -D_XOPEN_SOURCE,$(CXXFLAGS)),) CRYPTOPP_CXXFLAGS += -D_XOPEN_SOURCE=600 @@ -286,7 +288,9 @@ ifeq ($(DETECT_FEATURES),1) CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_ASM endif + # Need SSE2 or higher for these tests ifneq ($(SSE2_FLAG),) + TPROG = TestPrograms/test_x86_ssse3.cpp TOPT = $(SSSE3_FLAG) HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) @@ -295,6 +299,8 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = $(SSSE3_FLAG) KECCAK_FLAG = $(SSSE3_FLAG) LEA_FLAG = $(SSSE3_FLAG) + LSH256_FLAG = $(SSSE3_FLAG) + LSH512_FLAG = $(SSSE3_FLAG) SIMON128_FLAG = $(SSSE3_FLAG) SPECK128_FLAG = $(SSSE3_FLAG) SUN_LDFLAGS += $(SSSE3_FLAG) @@ -302,6 +308,12 @@ ifeq ($(DETECT_FEATURES),1) SSSE3_FLAG = endif + # The first Apple MacBooks were Core2's with SSE4.1 + ifneq ($(IS_DARWIN),0) + # Add SSE2 algo's here as required + # They get a free upgrade + endif + TPROG = TestPrograms/test_x86_sse41.cpp TOPT = $(SSE41_FLAG) HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) @@ -360,6 +372,8 @@ ifeq ($(DETECT_FEATURES),1) HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) ifeq ($(strip $(HAVE_OPT)),0) CHACHA_AVX2_FLAG = $(AVX2_FLAG) + LSH256_AVX2_FLAG = $(AVX2_FLAG) + LSH512_AVX2_FLAG = $(AVX2_FLAG) SUN_LDFLAGS += $(AVX2_FLAG) else AVX2_FLAG = @@ -420,7 +434,7 @@ ifeq ($(DETECT_FEATURES),1) # CRYPTOPP_DISABLE_MIXED_ASM is now being added in config_asm.h for all # Clang compilers. This test will need to be re-enabled if Clang fixes it. 
#TPROG = TestPrograms/test_asm_mixed.cpp - #HAVE_OPT = $(shell $(CXX) $(TCXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | wc -w) + #HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) #ifneq ($(strip $(HAVE_OPT)),0) # CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_MIXED_ASM #endif @@ -1057,7 +1071,7 @@ endif # Valgrind # Newlib test due to http://sourceware.org/bugzilla/show_bug.cgi?id=20268 ifneq ($(filter -DDEBUG -DDEBUG=1,$(CXXFLAGS)),) TPROG = TestPrograms/test_cxx.cpp - USING_GLIBCXX := $(shell $(CXX)$(CXXFLAGS) -E $(TPROG) -o $(TOUT) 2>&1 | $(GREP) -i -c "__GLIBCXX__") + USING_GLIBCXX := $(shell $(CXX)$(CXXFLAGS) -E $(TPROG) -c 2>&1 | $(GREP) -i -c "__GLIBCXX__") ifneq ($(USING_GLIBCXX),0) ifeq ($(HAS_NEWLIB),0) ifeq ($(findstring -D_GLIBCXX_DEBUG,$(CXXFLAGS)),) @@ -1621,6 +1635,22 @@ keccak_simd.o : keccak_simd.cpp lea_simd.o : lea_simd.cpp $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LEA_FLAG) -c) $< +# SSSE3 available +lsh256_sse.o : lsh256_sse.cpp + $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_FLAG) -c) $< + +# AVX2 available +lsh256_avx.o : lsh256_avx.cpp + $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_AVX2_FLAG) -c) $< + +# SSSE3 available +lsh512_sse.o : lsh512_sse.cpp + $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_FLAG) -c) $< + +# AVX2 available +lsh512_avx.o : lsh512_avx.cpp + $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_AVX2_FLAG) -c) $< + # NEON available neon_simd.o : neon_simd.cpp $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(NEON_FLAG) -c) $< diff --git a/GNUmakefile-cross b/GNUmakefile-cross index 33c930f9..46428fb7 100644 --- a/GNUmakefile-cross +++ b/GNUmakefile-cross @@ -46,11 +46,11 @@ endif IS_LINUX := $(shell echo $(MACHINEX) | $(GREP) -i -c "Linux") -# Can be used by Android and Embeeded cross-compiles. Disable by default because +# Can be used by Android and Embedded cross-compiles. Disable by default because # Android and embedded users typically don't run this configuration. HAS_SOLIB_VERSION ?= 0 -# Formely adhoc.cpp was created from adhoc.cpp.proto when needed. +# Formerly adhoc.cpp was created from adhoc.cpp.proto when needed. # This is now needed because ISA tests are performed using adhoc.cpp. 
ifeq ($(wildcard adhoc.cpp),) $(shell cp adhoc.cpp.proto adhoc.cpp) @@ -192,9 +192,9 @@ else ifeq ($(findstring clean,$(MAKECMDGOALS)),clean) DETECT_FEATURES := 0 else ifeq ($(findstring distclean,$(MAKECMDGOALS)),distclean) DETECT_FEATURES := 0 -else ifeq ($(findstring distclean,$(MAKECMDGOALS)),trim) +else ifeq ($(findstring trim,$(MAKECMDGOALS)),trim) DETECT_FEATURES := 0 -else ifeq ($(IS_IOS),1) +else ifeq ($(findstring zip,$(MAKECMDGOALS)),zip) DETECT_FEATURES := 0 endif @@ -249,6 +249,7 @@ ifeq ($(DETECT_FEATURES),1) CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_ASM endif + # Need SSE2 or higher for these tests ifneq ($(SSE2_FLAG),) TPROG = TestPrograms/test_x86_ssse3.cpp TOPT = $(SSSE3_FLAG) @@ -258,20 +259,26 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = $(SSSE3_FLAG) KECCAK_FLAG = $(SSSE3_FLAG) LEA_FLAG = $(SSSE3_FLAG) + LSH256_FLAG = $(SSSE3_FLAG) + LSH512_FLAG = $(SSSE3_FLAG) SIMON128_FLAG = $(SSSE3_FLAG) SPECK128_FLAG = $(SSSE3_FLAG) - SUN_LDFLAGS += $(SSSE3_FLAG) else SSSE3_FLAG = endif + # The first Apple MacBooks were Core2's with SSE4.1 + ifneq ($(IS_DARWIN),0) + # Add SSE2 algo's here as required + # They get a free upgrade + endif + TPROG = TestPrograms/test_x86_sse41.cpp TOPT = $(SSE41_FLAG) HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) ifeq ($(strip $(HAVE_OPT)),0) BLAKE2B_FLAG = $(SSE41_FLAG) BLAKE2S_FLAG = $(SSE41_FLAG) - SUN_LDFLAGS += $(SSE41_FLAG) else SSE41_FLAG = endif @@ -281,7 +288,6 @@ ifeq ($(DETECT_FEATURES),1) HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) ifeq ($(strip $(HAVE_OPT)),0) CRC_FLAG = $(SSE42_FLAG) - SUN_LDFLAGS += $(SSE42_FLAG) else SSE42_FLAG = endif @@ -292,7 +298,6 @@ ifeq ($(DETECT_FEATURES),1) ifeq ($(strip $(HAVE_OPT)),0) GCM_FLAG = $(SSSE3_FLAG) $(CLMUL_FLAG) GF2N_FLAG = $(CLMUL_FLAG) - SUN_LDFLAGS += $(CLMUL_FLAG) else CLMUL_FLAG = endif @@ -303,7 +308,6 @@ ifeq ($(DETECT_FEATURES),1) ifeq ($(strip $(HAVE_OPT)),0) AES_FLAG = $(SSE41_FLAG) $(AESNI_FLAG) SM4_FLAG = $(SSSE3_FLAG) $(AESNI_FLAG) - SUN_LDFLAGS += $(AESNI_FLAG) else AESNI_FLAG = endif @@ -313,7 +317,6 @@ ifeq ($(DETECT_FEATURES),1) HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) ifeq ($(strip $(HAVE_OPT)),0) # XXX_FLAG = $(AVX_FLAG) - SUN_LDFLAGS += $(AVX_FLAG) else AVX_FLAG = endif @@ -323,7 +326,8 @@ ifeq ($(DETECT_FEATURES),1) HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) ifeq ($(strip $(HAVE_OPT)),0) CHACHA_AVX2_FLAG = $(AVX2_FLAG) - SUN_LDFLAGS += $(AVX2_FLAG) + LSH256_AVX2_FLAG = $(AVX2_FLAG) + LSH512_AVX2_FLAG = $(AVX2_FLAG) else AVX2_FLAG = endif @@ -333,15 +337,10 @@ ifeq ($(DETECT_FEATURES),1) HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) ifeq ($(strip $(HAVE_OPT)),0) SHA_FLAG = $(SSE42_FLAG) $(SHANI_FLAG) - SUN_LDFLAGS += $(SHANI_FLAG) else SHANI_FLAG = endif - ifeq ($(SUN_COMPILER),1) - CRYPTOPP_LDFLAGS += $(SUN_LDFLAGS) - endif - ifeq ($(SSE3_FLAG),) CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_SSE3 else ifeq ($(SSSE3_FLAG),) @@ -383,7 +382,7 @@ ifeq ($(DETECT_FEATURES),1) # CRYPTOPP_DISABLE_MIXED_ASM is now being added in config_asm.h for all # Clang compilers. This test will need to be re-enabled if Clang fixes it. 
#TPROG = TestPrograms/test_asm_mixed.cpp - #HAVE_OPT = $(shell $(CXX) $(TCXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | wc -w) + #HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w) #ifneq ($(strip $(HAVE_OPT)),0) # CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_MIXED_ASM #endif @@ -989,6 +988,22 @@ keccak_simd.o : keccak_simd.cpp lea_simd.o : lea_simd.cpp $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LEA_FLAG) -c) $< +# SSSE3 available +lsh256_sse.o : lsh256_sse.cpp + $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_FLAG) -c) $< + +# AVX2 available +lsh256_avx.o : lsh256_avx.cpp + $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_AVX2_FLAG) -c) $< + +# SSSE3 available +lsh512_sse.o : lsh512_sse.cpp + $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_FLAG) -c) $< + +# AVX2 available +lsh512_avx.o : lsh512_avx.cpp + $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_AVX2_FLAG) -c) $< + # NEON available neon_simd.o : neon_simd.cpp $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(NEON_FLAG) -c) $< diff --git a/config_misc.h b/config_misc.h index 82d79229..67b2d4ab 100644 --- a/config_misc.h +++ b/config_misc.h @@ -191,30 +191,4 @@ # pragma GCC diagnostic ignored "-Wunused-function" #endif -// Requires ifunc support: GCC 4.8, Binutils 2.20.1 and libc 2.11.1. -// Should work for Clang 7 and above: https://stackoverflow.com/q/39958935, -// but fails with Clang 10: https://bugs.llvm.org/show_bug.cgi?id=50025. -// Should work with GCC 4.8.4 and 7.5.0 but does not: -// https://travis-ci.org/github/noloader/cryptopp-cmake/jobs/767701720 and -// https://travis-ci.org/github/noloader/cryptopp/jobs/767704226. -// Not available on Apple and Solaris platforms. Also see -// https://sourceware.org/glibc/wiki/GNU_IFUNC and -// https://gcc.gnu.org/onlinedocs/gcc/Function-Multiversioning.html. -#if !defined(CRYPTOPP_DISABLE_ASM) -# if defined(__linux__) -# if defined(__i386__) || defined(__i686__) || defined(__amd64__) -# if (CRYPTOPP_GCC_VERSION >= 80000) || (CRYPTOPP_LLVM_CLANG_VERSION >= 130000) -# include -# define CRYPTOPP_HAVE_ATTRIBUTE_TARGET 1 -# define CRYPTOPP_TARGET_DEFAULT __attribute__ ((target ("default"))) -# define CRYPTOPP_TARGET_SSSE3 __attribute__ ((target ("ssse3"))) -# endif -# endif -# endif -#endif - -#ifndef CRYPTOPP_TARGET_DEFAULT -# define CRYPTOPP_TARGET_DEFAULT -#endif - #endif // CRYPTOPP_CONFIG_MISC_H diff --git a/cryptest.nmake b/cryptest.nmake index b16482bf..b4b47973 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -70,20 +70,21 @@ LIB_SRCS = \ gfpcrypt.cpp gost.cpp gzip.cpp hc128.cpp hc256.cpp hex.cpp hight.cpp \ hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp \ kalynatab.cpp keccak.cpp keccak_core.cpp keccak_simd.cpp lea.cpp \ - lea_simd.cpp lsh256.cpp lsh512.cpp luc.cpp mars.cpp marss.cpp md2.cpp \ - md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp \ - oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp \ - polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabbit.cpp rabin.cpp \ - randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp \ - rijndael.cpp rijndael_simd.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp \ - safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp sha.cpp \ - sha3.cpp sha_simd.cpp shacal2.cpp shacal2_simd.cpp shake.cpp shark.cpp \ - sharkbox.cpp simeck.cpp simon.cpp simon128_simd.cpp skipjack.cpp sm3.cpp \ - sm4.cpp sm4_simd.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp \ - square.cpp squaretb.cpp sse_simd.cpp strciphr.cpp tea.cpp tftables.cpp \ - threefish.cpp tiger.cpp tigertab.cpp ttmac.cpp tweetnacl.cpp twofish.cpp \ - vmac.cpp wake.cpp 
whrlpool.cpp xed25519.cpp xtr.cpp xtrcrypt.cpp xts.cpp \ - zdeflate.cpp zinflate.cpp zlib.cpp + lea_simd.cpp lsh256.cpp lsh256_avx.cpp lsh256_sse.cpp lsh512.cpp \ + lsh512_avx.cpp lsh512_sse.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp \ + md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp oaep.cpp \ + osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp \ + pssr.cpp pubkey.cpp queue.cpp rabbit.cpp rabin.cpp randpool.cpp rc2.cpp \ + rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp rijndael_simd.cpp \ + ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp scrypt.cpp \ + seal.cpp seed.cpp serpent.cpp sha.cpp sha3.cpp sha_simd.cpp shacal2.cpp \ + shacal2_simd.cpp shake.cpp shark.cpp sharkbox.cpp simeck.cpp simon.cpp \ + simon128_simd.cpp skipjack.cpp sm3.cpp sm4.cpp sm4_simd.cpp \ + sosemanuk.cpp speck.cpp speck128_simd.cpp square.cpp squaretb.cpp \ + sse_simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp \ + tigertab.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wake.cpp \ + whrlpool.cpp xed25519.cpp xtr.cpp xtrcrypt.cpp xts.cpp zdeflate.cpp \ + zinflate.cpp zlib.cpp LIB_OBJS = \ cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj \ @@ -100,20 +101,21 @@ LIB_OBJS = \ gfpcrypt.obj gost.obj gzip.obj hc128.obj hc256.obj hex.obj hight.obj \ hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj \ kalynatab.obj keccak.obj keccak_core.obj keccak_simd.obj lea.obj \ - lea_simd.obj lsh256.obj lsh512.obj luc.obj mars.obj marss.obj md2.obj \ - md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj \ - oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj \ - polynomi.obj pssr.obj pubkey.obj queue.obj rabbit.obj rabin.obj \ - randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj \ - rijndael.obj rijndael_simd.obj ripemd.obj rng.obj rsa.obj rw.obj \ - safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj sha.obj \ - sha3.obj sha_simd.obj shacal2.obj shacal2_simd.obj shake.obj shark.obj \ - sharkbox.obj simeck.obj simon.obj simon128_simd.obj skipjack.obj sm3.obj \ - sm4.obj sm4_simd.obj sosemanuk.obj speck.obj speck128_simd.obj \ - square.obj squaretb.obj sse_simd.obj strciphr.obj tea.obj tftables.obj \ - threefish.obj tiger.obj tigertab.obj ttmac.obj tweetnacl.obj twofish.obj \ - vmac.obj wake.obj whrlpool.obj xed25519.obj xtr.obj xtrcrypt.obj xts.obj \ - zdeflate.obj zinflate.obj zlib.obj + lea_simd.obj lsh256.obj lsh256_avx.obj lsh256_sse.obj lsh512.obj \ + lsh512_avx.obj lsh512_sse.obj luc.obj mars.obj marss.obj md2.obj md4.obj \ + md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj oaep.obj \ + osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj \ + pssr.obj pubkey.obj queue.obj rabbit.obj rabin.obj randpool.obj rc2.obj \ + rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael.obj rijndael_simd.obj \ + ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj scrypt.obj \ + seal.obj seed.obj serpent.obj sha.obj sha3.obj sha_simd.obj shacal2.obj \ + shacal2_simd.obj shake.obj shark.obj sharkbox.obj simeck.obj simon.obj \ + simon128_simd.obj skipjack.obj sm3.obj sm4.obj sm4_simd.obj \ + sosemanuk.obj speck.obj speck128_simd.obj square.obj squaretb.obj \ + sse_simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj \ + tigertab.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wake.obj \ + whrlpool.obj xed25519.obj xtr.obj xtrcrypt.obj xts.obj zdeflate.obj \ + zinflate.obj zlib.obj ASM_OBJS = \ rdrand-x86.obj rdrand-x64.obj rdseed-x86.obj rdseed-x64.obj 
x64masm.obj x64dll.obj @@ -311,6 +313,10 @@ x64dll.obj: x64dll.asm !IF "$(PLATFORM)" == "x64" || "$(PLATFORM)" == "X64" || "$(PLATFORM)" == "amd64" || "$(PLATFORM)" == "x86" || "$(PLATFORM)" == "X86" chacha_avx.obj: $(CXX) $(CXXFLAGS) /arch:AVX /c chacha_avx.cpp +lsh256_avx.obj: + $(CXX) $(CXXFLAGS) /arch:AVX /c lsh256_avx.cpp +lsh512_avx.obj: + $(CXX) $(CXXFLAGS) /arch:AVX /c lsh512_avx.cpp !endif # For testing cryptopp.dll and CRYPTOPP_IMPORTS diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index 285fe9df..29e722f7 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -263,7 +263,11 @@ + + + + diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index 8bbe8130..47714ac0 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -275,9 +275,21 @@ Source Files + + Source Files + + + Source Files + Source Files + + Source Files + + + Source Files + Source Files diff --git a/darn.cpp b/darn.cpp index 88d5de56..7f586bea 100644 --- a/darn.cpp +++ b/darn.cpp @@ -15,7 +15,7 @@ // GCC inline assembly or the builtin will fail the compile. // Inline assembler available in GCC 3.2 or above. For practical -// purposes we check for GCC 4.0 or above. GCC imposters claim +// purposes we check for GCC 4.0 or above. GCC impostors claim // to be GCC 4.2.1 so it will capture them, too. We exclude the // Apple machines because they are not Power9 and use a slightly // different syntax in their assembler. diff --git a/datatest.cpp b/datatest.cpp index cea57f79..f3a8766f 100644 --- a/datatest.cpp +++ b/datatest.cpp @@ -241,15 +241,15 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo } else if (s1.substr(0, 2) == "0x") { - std::string::size_type pos = s1.find(' '); - StringSource(s1.substr(2, pos), true, new HexDecoder(new StringSink(s2))); - s1 = s1.substr(STDMIN(pos, s1.length())); + std::string::size_type n = s1.find(' '); + StringSource(s1.substr(2, n), true, new HexDecoder(new StringSink(s2))); + s1 = s1.substr(STDMIN(n, s1.length())); } else { - std::string::size_type pos = s1.find(' '); - StringSource(s1.substr(0, pos), true, new HexDecoder(new StringSink(s2))); - s1 = s1.substr(STDMIN(pos, s1.length())); + std::string::size_type n = s1.find(' '); + StringSource(s1.substr(0, n), true, new HexDecoder(new StringSink(s2))); + s1 = s1.substr(STDMIN(n, s1.length())); } while (repeat--) @@ -850,8 +850,8 @@ void TestSymmetricCipherWithFileSource(TestData &v, const NameValuePairs &overri if (encrypted != ciphertext) { std::cout << "\nincorrectly encrypted: "; - StringSource ss(encrypted, false, new HexEncoder(new FileSink(std::cout))); - ss.Pump(2048); ss.Flush(false); + StringSource sss(encrypted, false, new HexEncoder(new FileSink(std::cout))); + sss.Pump(2048); sss.Flush(false); std::cout << "\n"; SignalTestFailure(); } @@ -867,8 +867,8 @@ void TestSymmetricCipherWithFileSource(TestData &v, const NameValuePairs &overri if (decrypted != plaintext) { std::cout << "\nincorrectly decrypted: "; - StringSource ss(decrypted, false, new HexEncoder(new FileSink(std::cout))); - ss.Pump(256); ss.Flush(false); + StringSource sss(decrypted, false, new HexEncoder(new FileSink(std::cout))); + sss.Pump(256); sss.Flush(false); std::cout << "\n"; SignalTestFailure(); } diff --git a/lsh.h b/lsh.h index 68284f97..2f9918d8 100644 --- a/lsh.h +++ b/lsh.h @@ -4,6 +4,11 @@ // see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do // and https://seed.kisa.or.kr/kisa/Board/22/detailView.do. +// We are hitting some sort of GCC bug in the LSH AVX2 code path. 
+// Clang is OK on the AVX2 code path. We believe it is GCC Issue
+// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
+// makes using zeroupper a little tricky.
+
 /// \file lsh.h
 /// \brief Classes for the LSH hash functions
 /// \since Crypto++ 8.6
@@ -15,6 +20,12 @@
 #include "cryptlib.h"
 #include "secblock.h"
 
+// Enable SSE2 and AVX2 for 64-bit machines.
+// 32-bit machines slow down with SSE2.
+#if (CRYPTOPP_BOOL_X32) || (CRYPTOPP_BOOL_X64)
+# define CRYPTOPP_ENABLE_64BIT_SSE 1
+#endif
+
 NAMESPACE_BEGIN(CryptoPP)
 
 /// \brief LSH-224 and LSH-256 hash base class
@@ -34,14 +45,14 @@ public:
     unsigned int OptimalDataAlignment() const { return GetAlignmentOf<word32>(); }
 
     void Restart();
-    void Update(const byte *input, size_t length);
+    void Update(const byte *input, size_t size);
     void TruncatedFinal(byte *hash, size_t size);
     std::string AlgorithmProvider() const;
 
 protected:
     LSH256_Base(unsigned int algType, unsigned int digestSize)
-        : m_algType(algType), m_digestSize(digestSize) {}
+        : m_digestSize(digestSize) { m_state[80] = algType; }
 
 protected:
     // Working state is:
@@ -52,8 +63,10 @@ protected:
     // * submsg_o_l = 8 32-bit words
     // * submsg_o_r = 8 32-bit words
     // * last_block = 32 32-bit words (128 bytes)
-    FixedSizeSecBlock<word32, 80> m_state;
-    word32 m_algType, m_remainingBitLength;
+    // * algType
+    // * remainingBitLength
+    FixedSizeSecBlock<word32, 80+2> m_state;
+    // word32 m_algType, m_remainingBitLength;
     word32 m_digestSize;
 };
 
@@ -132,14 +145,14 @@ public:
     unsigned int OptimalDataAlignment() const { return GetAlignmentOf<word64>(); }
 
     void Restart();
-    void Update(const byte *input, size_t length);
+    void Update(const byte *input, size_t size);
     void TruncatedFinal(byte *hash, size_t size);
     std::string AlgorithmProvider() const;
 
 protected:
     LSH512_Base(unsigned int algType, unsigned int digestSize)
-        : m_algType(algType), m_digestSize(digestSize) {}
+        : m_digestSize(digestSize) { m_state[80] = algType; }
 
 protected:
     // Working state is:
@@ -150,8 +163,10 @@ protected:
     // * submsg_o_l = 8 64-bit words
     // * submsg_o_r = 8 64-bit words
     // * last_block = 32 64-bit words (256 bytes)
-    FixedSizeSecBlock<word64, 80> m_state;
-    word32 m_algType, m_remainingBitLength;
+    // * algType
+    // * remainingBitLength
+    FixedSizeSecBlock<word64, 80+2> m_state;
+    // word32 m_algType, m_remainingBitLength;
     word32 m_digestSize;
 };
 
diff --git a/lsh256.cpp b/lsh256.cpp
index 6fcc58c6..0f895bb4 100644
--- a/lsh256.cpp
+++ b/lsh256.cpp
@@ -4,149 +4,20 @@
 // see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
 // and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
 
-// The source file below uses GCC's function multiversioning to
-// speed up a rotate. When the rotate is performed with the SSE
-// unit there's a 2.5 to 3.0 cpb profit. AVX and AVX2 code paths
-// slow down with multiversioning. It looks like GCC inserts calls
-// to zeroupper() in each AVX function rather than deferring until
-// the end of Restart(), Update() or Final(). That mistake costs
-// about 3 cpb.
-
-// Function multiversioning does not work with Clang. Enabling it for
-// LLVM Clang 7.0 and above resulted in linker errors. Also see
-// https://bugs.llvm.org/show_bug.cgi?id=50025.
-
-// We are hitting some sort of GCC bug in the LSH256 AVX2 code path.
-// Clang is OK on the AVX2 code path. When we enable AVX2 for
-// rotate_msg_gamma, msg_exp_even and msg_exp_odd, then GCC arrives
-// at the wrong result. Making any one of the functions SSE2 clears
-// the problem. See CRYPTOPP_WORKAROUND_AVX2_BUG below.
-
-// TODO: cut-over to a *_simd.cpp file for proper runtime dispatching.
+// We are hitting some sort of GCC bug in the LSH AVX2 code path. +// Clang is OK on the AVX2 code path. We believe it is GCC Issue +// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It +// makes using zeroupper a little tricky. #include "pch.h" #include "config.h" #include "lsh.h" +#include "cpu.h" #include "misc.h" -// Only enable the intrinsics for 64-bit machines -#ifndef CRYPTOPP_DISABLE_ASM -# if (defined(__SSE2__) && defined(__amd64__)) || (defined(_MSC_VER) && defined(_M_X64)) -# define CRYPTOPP_LSH256_SSE2_AVAILABLE 1 -# endif -# if defined(__SSSE3__) && defined(__amd64__) -# define CRYPTOPP_LSH256_SSSE3_AVAILABLE 1 -# endif -# if defined(__XOP__) && defined(__amd64__) -# define CRYPTOPP_LSH256_XOP_AVAILABLE 1 -# endif -# if defined(__AVX__) && defined(__amd64__) -# define CRYPTOPP_LSH256_AVX_AVAILABLE 1 -# endif -# if defined(__AVX2__) && defined(__amd64__) -# define CRYPTOPP_LSH256_AVX2_AVAILABLE 1 -# endif -#endif - -#if defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) -# include -#endif - -#if defined(CRYPTOPP_LSH256_SSSE3_AVAILABLE) -# include -#endif - -#if defined(CRYPTOPP_LSH256_XOP_AVAILABLE) -# include -#endif - -#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) -# include -#endif - -#if defined(CRYPTOPP_HAVE_ATTRIBUTE_TARGET) -# include -#endif - -#if defined(__GNUC__) && defined(__amd64__) -# include -#endif - -// Use GCC_VERSION to avoid Clang, ICC and other impostors -#if defined(CRYPTOPP_GCC_VERSION) -# define CRYPTOPP_WORKAROUND_AVX2_BUG 1 -#endif - ANONYMOUS_NAMESPACE_BEGIN -using CryptoPP::byte; -using CryptoPP::word32; -using CryptoPP::rotlFixed; -using CryptoPP::rotlConstant; - -using CryptoPP::GetBlock; -using CryptoPP::LittleEndian; -using CryptoPP::ConditionalByteReverse; -using CryptoPP::LITTLE_ENDIAN_ORDER; - -typedef byte lsh_u8; -typedef word32 lsh_u32; -typedef word32 lsh_uint; -typedef word32 lsh_err; -typedef word32 lsh_type; - -struct LSH256_Context -{ - LSH256_Context(word32* state, word32 algType, word32& remainingBitLength) : - cv_l(state+0), cv_r(state+8), sub_msgs(state+16), - last_block(reinterpret_cast(state+48)) , - remain_databitlen(remainingBitLength), algtype(algType) {} - - lsh_u32* cv_l; // start of our state block - lsh_u32* cv_r; - lsh_u32* sub_msgs; - lsh_u8* last_block; - lsh_u32& remain_databitlen; - lsh_type algtype; -}; - -struct LSH256_Internal -{ - LSH256_Internal(word32* state) : - submsg_e_l(state+16), submsg_e_r(state+24), - submsg_o_l(state+32), submsg_o_r(state+40) { } - - lsh_u32* submsg_e_l; /* even left sub-message */ - lsh_u32* submsg_e_r; /* even right sub-message */ - lsh_u32* submsg_o_l; /* odd left sub-message */ - lsh_u32* submsg_o_r; /* odd right sub-message */ -}; - -#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) -// Zero the upper 128 bits of all YMM registers -// on entry and exit. It avoids AVX state -// transition penalties when saving state. -struct AVX_Cleanup -{ - AVX_Cleanup() { - _mm256_zeroupper(); - } - ~AVX_Cleanup() { - _mm256_zeroupper(); - } -}; -#endif - -// error: '_mm256_set_m128i' was not declared in this scope? 
-#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) -inline __m256i _MM256_SET_M128I(__m128i hi, __m128i lo) -{ - return _mm256_insertf128_si256 ( - _mm256_castsi128_si256(lo), hi, 1); -} -#endif - /* LSH Constants */ const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128; @@ -180,55 +51,44 @@ const unsigned int LSH_SUCCESS = 0x0; const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403; const unsigned int LSH_ERR_INVALID_STATE = 0x2404; -/* LSH AlgType Macro */ +/* Index into our state array */ -inline bool LSH_IS_LSH512(lsh_uint val) { - return (val & 0xf0000) == 0; -} +const unsigned int AlgorithmType = 80; +const unsigned int RemainingBits = 81; -inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { - return val >> 24; -} +NAMESPACE_END -inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { - return val & 0xffff; -} - -inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { - return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); -} - -inline lsh_u32 loadLE32(lsh_u32 v) { - return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v); -} - -lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) { - return rotlFixed(x, r); -} +NAMESPACE_BEGIN(CryptoPP) +NAMESPACE_BEGIN(LSH) /* -------------------------------------------------------- * * LSH: iv * -------------------------------------------------------- */ +//extern const word32 LSH256_IV224[CV_WORD_LEN]; +//extern const word32 LSH256_IV256[CV_WORD_LEN]; +//extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS]; + CRYPTOPP_ALIGN_DATA(32) -const lsh_u32 g_IV224[CV_WORD_LEN] = { +extern +const word32 LSH256_IV224[CV_WORD_LEN] = { 0x068608D3, 0x62D8F7A7, 0xD76652AB, 0x4C600A43, 0xBDC40AA8, 0x1ECA0B68, 0xDA1A89BE, 0x3147D354, - 0x707EB4F9, 0xF65B3862, 0x6B0B2ABE, 0x56B8EC0A, 0xCF237286, 0xEE0D1727, 0x33636595, 0x8BB8D05F, + 0x707EB4F9, 0xF65B3862, 0x6B0B2ABE, 0x56B8EC0A, 0xCF237286, 0xEE0D1727, 0x33636595, 0x8BB8D05F }; CRYPTOPP_ALIGN_DATA(32) -const lsh_u32 g_IV256[CV_WORD_LEN] = { +extern +const word32 LSH256_IV256[CV_WORD_LEN] = { 0x46a10f1f, 0xfddce486, 0xb41443a8, 0x198e6b9d, 0x3304388d, 0xb0f5a3c7, 0xb36061c4, 0x7adbd553, 0x105d5378, 0x2f74de54, 0x5c2f2d95, 0xf2553fbe, 0x8051357a, 0x138668c8, 0x47aa4484, 0xe01afb41 }; -const lsh_uint g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; - /* -------------------------------------------------------- * * LSH: step constants * -------------------------------------------------------- */ -const lsh_u32 g_StepConstants[CONST_WORD_LEN * NUM_STEPS] = { +extern +const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS] = { 0x917caf90, 0x6c1b10a2, 0x6f352943, 0xcf778243, 0x2ceb7472, 0x29e96ff2, 0x8a9ba428, 0x2eeb2642, 0x0e2c4021, 0x872bb30e, 0xa45e6cb2, 0x46f9c612, 0x185fe69e, 0x1359621b, 0x263fccb2, 0x1a116870, 0x3a6c612f, 0xb2dec195, 0x02cb1f56, 0x40bfd858, 0x784684b6, 0x6cbb7d2e, 0x660c7ed8, 0x2b79d88a, @@ -257,6 +117,87 @@ const lsh_u32 g_StepConstants[CONST_WORD_LEN * NUM_STEPS] = { 0x592c0f3b, 0x947c5f77, 0x6fff49b9, 0xf71a7e5a, 0x1de8c0f5, 0xc2569600, 0xc4e4ac8c, 0x823c9ce1 }; +NAMESPACE_END // LSH +NAMESPACE_END // Crypto++ + +ANONYMOUS_NAMESPACE_BEGIN + +using CryptoPP::byte; +using CryptoPP::word32; +using CryptoPP::rotlFixed; +using CryptoPP::rotlConstant; + +using CryptoPP::GetBlock; +using CryptoPP::LittleEndian; +using CryptoPP::ConditionalByteReverse; +using CryptoPP::LITTLE_ENDIAN_ORDER; + +using CryptoPP::LSH::LSH256_IV224; +using CryptoPP::LSH::LSH256_IV256; +using CryptoPP::LSH::LSH256_StepConstants; + +typedef byte lsh_u8; +typedef word32 lsh_u32; +typedef word32 lsh_uint; +typedef word32 lsh_err; +typedef 
word32 lsh_type; + +struct LSH256_Context +{ + LSH256_Context(word32* state, word32 algType, word32& remainingBitLength) : + cv_l(state+0), cv_r(state+8), sub_msgs(state+16), + last_block(reinterpret_cast(state+48)), + remain_databitlen(remainingBitLength), + alg_type(static_cast(algType)) {} + + lsh_u32* cv_l; // start of our state block + lsh_u32* cv_r; + lsh_u32* sub_msgs; + lsh_u8* last_block; + lsh_u32& remain_databitlen; + lsh_type alg_type; +}; + +struct LSH256_Internal +{ + LSH256_Internal(word32* state) : + submsg_e_l(state+16), submsg_e_r(state+24), + submsg_o_l(state+32), submsg_o_r(state+40) { } + + lsh_u32* submsg_e_l; /* even left sub-message */ + lsh_u32* submsg_e_r; /* even right sub-message */ + lsh_u32* submsg_o_l; /* odd left sub-message */ + lsh_u32* submsg_o_r; /* odd right sub-message */ +}; + +const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; + +/* LSH AlgType Macro */ + +inline bool LSH_IS_LSH512(lsh_uint val) { + return (val & 0xf0000) == 0; +} + +inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { + return val >> 24; +} + +inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { + return val & 0xffff; +} + +inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { + return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); +} + +inline lsh_u32 loadLE32(lsh_u32 v) { + return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v); +} + +lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) { + return rotlFixed(x, r); +} + // Original code relied upon unaligned lsh_u32 buffer inline void load_msg_blk(LSH256_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN]) { @@ -267,34 +208,6 @@ inline void load_msg_blk(LSH256_Internal* i_state, const lsh_u8 msgblk[LSH256_MS lsh_u32* submsg_o_l = i_state->submsg_o_l; lsh_u32* submsg_o_r = i_state->submsg_o_r; -#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) - _mm256_storeu_si256(M256_CAST(submsg_e_l+0), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+0))); - _mm256_storeu_si256(M256_CAST(submsg_e_r+0), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+32))); - _mm256_storeu_si256(M256_CAST(submsg_o_l+0), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+64))); - _mm256_storeu_si256(M256_CAST(submsg_o_r+0), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+96))); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(submsg_e_l+0), - _mm_loadu_si128(CONST_M128_CAST(msgblk+0))); - _mm_storeu_si128(M128_CAST(submsg_e_l+4), - _mm_loadu_si128(CONST_M128_CAST(msgblk+16))); - _mm_storeu_si128(M128_CAST(submsg_e_r+0), - _mm_loadu_si128(CONST_M128_CAST(msgblk+32))); - _mm_storeu_si128(M128_CAST(submsg_e_r+4), - _mm_loadu_si128(CONST_M128_CAST(msgblk+48))); - _mm_storeu_si128(M128_CAST(submsg_o_l+0), - _mm_loadu_si128(CONST_M128_CAST(msgblk+64))); - _mm_storeu_si128(M128_CAST(submsg_o_l+4), - _mm_loadu_si128(CONST_M128_CAST(msgblk+80))); - _mm_storeu_si128(M128_CAST(submsg_o_r+0), - _mm_loadu_si128(CONST_M128_CAST(msgblk+96))); - _mm_storeu_si128(M128_CAST(submsg_o_r+4), - _mm_loadu_si128(CONST_M128_CAST(msgblk+112))); -#else typedef GetBlock InBlock; InBlock input(msgblk); @@ -306,7 +219,6 @@ inline void load_msg_blk(LSH256_Internal* i_state, const lsh_u8 msgblk[LSH256_MS (submsg_o_l[4])(submsg_o_l[5])(submsg_o_l[6])(submsg_o_l[7]) (submsg_o_r[0])(submsg_o_r[1])(submsg_o_r[2])(submsg_o_r[3]) (submsg_o_r[4])(submsg_o_r[5])(submsg_o_r[6])(submsg_o_r[7]); -#endif } inline void msg_exp_even(LSH256_Internal* i_state) @@ -318,44 +230,6 @@ inline void msg_exp_even(LSH256_Internal* i_state) lsh_u32* submsg_o_l = i_state->submsg_o_l; lsh_u32* submsg_o_r = 
i_state->submsg_o_r; -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514, - 0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c); - - _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi32( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), - _mm256_shuffle_epi8( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), mask))); - _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi32( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), - _mm256_shuffle_epi8( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), mask))); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi32( - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(3,2,1,0)), - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(1,0,2,3)))); - - _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_add_epi32( - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), _MM_SHUFFLE(3,2,1,0)), - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), _MM_SHUFFLE(2,1,0,3)))); - - _mm_storeu_si128(M128_CAST(submsg_e_r+0), _mm_add_epi32( - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), _MM_SHUFFLE(3,2,1,0)), - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), _MM_SHUFFLE(1,0,2,3)))); - - _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_add_epi32( - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), _MM_SHUFFLE(3,2,1,0)), - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), _MM_SHUFFLE(2,1,0,3)))); -#else lsh_u32 temp; temp = submsg_e_l[0]; submsg_e_l[0] = submsg_o_l[0] + submsg_e_l[3]; @@ -377,7 +251,6 @@ inline void msg_exp_even(LSH256_Internal* i_state) submsg_e_r[7] = submsg_o_r[7] + submsg_e_r[6]; submsg_e_r[6] = submsg_o_r[6] + submsg_e_r[5]; submsg_e_r[5] = submsg_o_r[5] + temp; -#endif } inline void msg_exp_odd(LSH256_Internal* i_state) @@ -389,44 +262,6 @@ inline void msg_exp_odd(LSH256_Internal* i_state) lsh_u32* submsg_o_l = i_state->submsg_o_l; lsh_u32* submsg_o_r = i_state->submsg_o_r; -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514, - 0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c); - - _mm256_storeu_si256(M256_CAST(submsg_o_l+0), _mm256_add_epi32( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), - _mm256_shuffle_epi8( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), mask))); - _mm256_storeu_si256(M256_CAST(submsg_o_r+0), _mm256_add_epi32( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), - _mm256_shuffle_epi8( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), mask))); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi32( - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(3,2,1,0)), - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(1,0,2,3)))); - - _mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_add_epi32( - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), _MM_SHUFFLE(3,2,1,0)), - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), _MM_SHUFFLE(2,1,0,3)))); - - _mm_storeu_si128(M128_CAST(submsg_o_r+0), _mm_add_epi32( - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), _MM_SHUFFLE(3,2,1,0)), - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), 
_MM_SHUFFLE(1,0,2,3)))); - - _mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_add_epi32( - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), _MM_SHUFFLE(3,2,1,0)), - _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), _MM_SHUFFLE(2,1,0,3)))); -#else lsh_u32 temp; temp = submsg_o_l[0]; submsg_o_l[0] = submsg_e_l[0] + submsg_o_l[3]; @@ -448,14 +283,13 @@ inline void msg_exp_odd(LSH256_Internal* i_state) submsg_o_r[7] = submsg_e_r[7] + submsg_o_r[6]; submsg_o_r[6] = submsg_e_r[6] + submsg_o_r[5]; submsg_o_r[5] = submsg_e_r[5] + temp; -#endif } inline void load_sc(const lsh_u32** p_const_v, size_t i) { CRYPTOPP_ASSERT(p_const_v != NULLPTR); - *p_const_v = &g_StepConstants[i]; + *p_const_v = &LSH256_StepConstants[i]; } inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_Internal* i_state) @@ -465,28 +299,6 @@ inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_Internal* i_st lsh_u32* submsg_e_l = i_state->submsg_e_l; lsh_u32* submsg_e_r = i_state->submsg_e_r; -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)))); - _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)))); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l+0), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+0)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)))); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); - _mm_storeu_si128(M128_CAST(cv_r+0), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r+0)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)))); - _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); -#else cv_l[0] ^= submsg_e_l[0]; cv_l[1] ^= submsg_e_l[1]; cv_l[2] ^= submsg_e_l[2]; cv_l[3] ^= submsg_e_l[3]; cv_l[4] ^= submsg_e_l[4]; cv_l[5] ^= submsg_e_l[5]; @@ -495,7 +307,6 @@ inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_Internal* i_st cv_r[2] ^= submsg_e_r[2]; cv_r[3] ^= submsg_e_r[3]; cv_r[4] ^= submsg_e_r[4]; cv_r[5] ^= submsg_e_r[5]; cv_r[6] ^= submsg_e_r[6]; cv_r[7] ^= submsg_e_r[7]; -#endif } inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_Internal* i_state) @@ -505,28 +316,6 @@ inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_Internal* i_sta lsh_u32* submsg_o_l = i_state->submsg_o_l; lsh_u32* submsg_o_r = i_state->submsg_o_r; -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_l)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l)))); - _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_r)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r)))); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l)))); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)))); - _mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r)), - 
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r)))); - _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)))); -#else cv_l[0] ^= submsg_o_l[0]; cv_l[1] ^= submsg_o_l[1]; cv_l[2] ^= submsg_o_l[2]; cv_l[3] ^= submsg_o_l[3]; cv_l[4] ^= submsg_o_l[4]; cv_l[5] ^= submsg_o_l[5]; @@ -535,24 +324,10 @@ inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_Internal* i_sta cv_r[2] ^= submsg_o_r[2]; cv_r[3] ^= submsg_o_r[3]; cv_r[4] ^= submsg_o_r[4]; cv_r[5] ^= submsg_o_r[5]; cv_r[6] ^= submsg_o_r[6]; cv_r[7] ^= submsg_o_r[7]; -#endif } -inline void add_blk(lsh_u32 cv_l[8], const lsh_u32 cv_r[8]) +inline void add_blk(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) { -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi32( - _mm256_loadu_si256(CONST_M256_CAST(cv_l)), - _mm256_loadu_si256(CONST_M256_CAST(cv_r)))); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi32( - _mm_loadu_si128(CONST_M128_CAST(cv_l)), - _mm_loadu_si128(CONST_M128_CAST(cv_r)))); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi32( - _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), - _mm_loadu_si128(CONST_M128_CAST(cv_r+4)))); -#else cv_l[0] += cv_r[0]; cv_l[1] += cv_r[1]; cv_l[2] += cv_r[2]; @@ -561,31 +336,11 @@ inline void add_blk(lsh_u32 cv_l[8], const lsh_u32 cv_r[8]) cv_l[5] += cv_r[5]; cv_l[6] += cv_r[6]; cv_l[7] += cv_r[7]; -#endif } template inline void rotate_blk(lsh_u32 cv[8]) { -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256( - _mm256_slli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), R), - _mm256_srli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), 32-R))); - -#elif defined(CRYPTOPP_LSH256_XOP_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv), - _mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R)); - _mm_storeu_si128(M128_CAST(cv+4), - _mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R)); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv), _mm_or_si128( - _mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R), - _mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), 32-R))); - _mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128( - _mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R), - _mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), 32-R))); -#else cv[0] = rotlConstant(cv[0]); cv[1] = rotlConstant(cv[1]); cv[2] = rotlConstant(cv[2]); @@ -594,24 +349,10 @@ inline void rotate_blk(lsh_u32 cv[8]) cv[5] = rotlConstant(cv[5]); cv[6] = rotlConstant(cv[6]); cv[7] = rotlConstant(cv[7]); -#endif } inline void xor_with_const(lsh_u32 cv_l[8], const lsh_u32 const_v[8]) { -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_l)), - _mm256_loadu_si256(CONST_M256_CAST(const_v)))); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l)), - _mm_loadu_si128(CONST_M128_CAST(const_v)))); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), - _mm_loadu_si128(CONST_M128_CAST(const_v+4)))); -#else cv_l[0] ^= const_v[0]; cv_l[1] ^= const_v[1]; cv_l[2] ^= const_v[2]; @@ -620,92 +361,20 @@ inline void xor_with_const(lsh_u32 cv_l[8], const lsh_u32 const_v[8]) cv_l[5] ^= const_v[5]; cv_l[6] ^= const_v[6]; cv_l[7] ^= const_v[7]; -#endif } -#if 
defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) && !defined(CRYPTOPP_WORKAROUND_AVX2_BUG) inline void rotate_msg_gamma(lsh_u32 cv_r[8]) { - // g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; - _mm256_storeu_si256(M256_CAST(cv_r+0), - _mm256_shuffle_epi8(_mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), - _mm256_set_epi8( - /* hi lane */ 15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1, - /* lo lane */ 12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0))); -} -#else // CRYPTOPP_LSH256_AVX2_AVAILABLE -# if defined(CRYPTOPP_HAVE_ATTRIBUTE_TARGET) -CRYPTOPP_TARGET_SSSE3 -inline void rotate_msg_gamma(lsh_u32 cv_r[8]) -{ - // g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; - _mm_storeu_si128(M128_CAST(cv_r+0), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)), - _mm_set_epi8(12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0))); - _mm_storeu_si128(M128_CAST(cv_r+4), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)), - _mm_set_epi8(15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1))); -} -# endif - -CRYPTOPP_TARGET_DEFAULT -inline void rotate_msg_gamma(lsh_u32 cv_r[8]) -{ -#if defined(CRYPTOPP_LSH256_SSSE3_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_r+0), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)), - _mm_set_epi8(12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0))); - _mm_storeu_si128(M128_CAST(cv_r+4), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)), - _mm_set_epi8(15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1))); - -#else cv_r[1] = rotlFixed(cv_r[1], g_gamma256[1]); cv_r[2] = rotlFixed(cv_r[2], g_gamma256[2]); cv_r[3] = rotlFixed(cv_r[3], g_gamma256[3]); cv_r[4] = rotlFixed(cv_r[4], g_gamma256[4]); cv_r[5] = rotlFixed(cv_r[5], g_gamma256[5]); cv_r[6] = rotlFixed(cv_r[6], g_gamma256[6]); -#endif } -#endif // CRYPTOPP_LSH256_AVX2_AVAILABLE inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) { -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - __m256i temp; - temp = _mm256_shuffle_epi32( - _mm256_loadu_si256(CONST_M256_CAST(cv_l)), _MM_SHUFFLE(3,1,0,2)); - _mm256_storeu_si256(M256_CAST(cv_r), - _mm256_shuffle_epi32( - _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(1,2,3,0))); - _mm256_storeu_si256(M256_CAST(cv_l), - _mm256_permute2x128_si256(temp, - _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,3,0,1))); - _mm256_storeu_si256(M256_CAST(cv_r), - _mm256_permute2x128_si256(temp, - _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,2,0,0))); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l+0), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2))); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2))); - _mm_storeu_si128(M128_CAST(cv_r+0), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0))); - _mm_storeu_si128(M128_CAST(cv_r+4), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0))); - - __m128i temp = _mm_loadu_si128(CONST_M128_CAST(cv_l+0)); - _mm_storeu_si128(M128_CAST(cv_l+0), - _mm_loadu_si128(CONST_M128_CAST(cv_l+4))); - _mm_storeu_si128(M128_CAST(cv_l+4), - _mm_loadu_si128(CONST_M128_CAST(cv_r+4))); - _mm_storeu_si128(M128_CAST(cv_r+4), - _mm_loadu_si128(CONST_M128_CAST(cv_r+0))); - _mm_storeu_si128(M128_CAST(cv_r+0), temp); - -#else lsh_u32 temp; temp = cv_l[0]; cv_l[0] = cv_l[6]; @@ -725,8 +394,6 @@ inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) cv_l[7] = cv_r[5]; cv_r[5] = cv_r[3]; cv_r[3] = temp; - -#endif }; /* -------------------------------------------------------- * @@ 
-736,8 +403,6 @@ inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) template inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8]) { - CRYPTOPP_ASSERT(const_v != NULLPTR); - add_blk(cv_l, cv_r); rotate_blk(cv_l); xor_with_const(cv_l, const_v); @@ -762,10 +427,6 @@ inline void compress(LSH256_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_B lsh_u32* cv_l = ctx->cv_l; lsh_u32* cv_r = ctx->cv_r; -#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) - AVX_Cleanup cleanup; -#endif - load_msg_blk(i_state, pdMsgBlk); msg_add_even(cv_l, cv_r, i_state); @@ -801,23 +462,6 @@ inline void compress(LSH256_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_B inline void load_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 iv[16]) { - // The IV's are 32-byte aligned so we can use aligned loads. -#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l+0), - _mm256_load_si256(CONST_M256_CAST(iv+0))); - _mm256_storeu_si256(M256_CAST(cv_r+0), - _mm256_load_si256(CONST_M256_CAST(iv+8))); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l+ 0), - _mm_load_si128(CONST_M128_CAST(iv+ 0))); - _mm_storeu_si128(M128_CAST(cv_l+ 4), - _mm_load_si128(CONST_M128_CAST(iv+ 4))); - _mm_storeu_si128(M128_CAST(cv_r+ 0), - _mm_load_si128(CONST_M128_CAST(iv+ 8))); - _mm_storeu_si128(M128_CAST(cv_r+ 4), - _mm_load_si128(CONST_M128_CAST(iv+12))); -#else cv_l[0] = iv[0]; cv_l[1] = iv[1]; cv_l[2] = iv[2]; @@ -834,73 +478,36 @@ inline void load_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 iv[16]) cv_r[5] = iv[13]; cv_r[6] = iv[14]; cv_r[7] = iv[15]; -#endif } inline void zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) { -#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256()); - _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256()); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l+0), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(cv_r+0), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(cv_r+4), _mm_setzero_si128()); -#else memset(cv_l, 0x00, 8*sizeof(lsh_u32)); memset(cv_r, 0x00, 8*sizeof(lsh_u32)); -#endif } inline void zero_submsgs(LSH256_Context* ctx) { + CRYPTOPP_ASSERT(ctx != NULLPTR); + lsh_u32* sub_msgs = ctx->sub_msgs; - -#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) - _mm256_storeu_si256(M256_CAST(sub_msgs+ 0), _mm256_setzero_si256()); - _mm256_storeu_si256(M256_CAST(sub_msgs+ 8), _mm256_setzero_si256()); - _mm256_storeu_si256(M256_CAST(sub_msgs+16), _mm256_setzero_si256()); - _mm256_storeu_si256(M256_CAST(sub_msgs+24), _mm256_setzero_si256()); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(sub_msgs+ 0), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+ 4), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+ 8), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+12), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+16), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+20), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+24), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+28), _mm_setzero_si128()); - -#else memset(sub_msgs, 0x00, 32*sizeof(lsh_u32)); -#endif } inline void init224(LSH256_Context* ctx) { CRYPTOPP_ASSERT(ctx != NULLPTR); -#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) - AVX_Cleanup cleanup; -#endif - zero_submsgs(ctx); - load_iv(ctx->cv_l, ctx->cv_r, g_IV224); + 
load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV224); } inline void init256(LSH256_Context* ctx) { CRYPTOPP_ASSERT(ctx != NULLPTR); -#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) - AVX_Cleanup cleanup; -#endif - zero_submsgs(ctx); - load_iv(ctx->cv_l, ctx->cv_r, g_IV256); + load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV256); } /* -------------------------------------------------------- */ @@ -909,27 +516,9 @@ inline void fin(LSH256_Context* ctx) { CRYPTOPP_ASSERT(ctx != NULLPTR); -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - AVX_Cleanup cleanup; -#endif - -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)), - _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0)))); - -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)), - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0)))); - _mm_storeu_si128(M128_CAST(ctx->cv_l+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+4)), - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+4)))); -#else for (size_t i = 0; i < HASH_VAL_MAX_WORD_LEN; i++){ ctx->cv_l[i] = loadLE32(ctx->cv_l[i] ^ ctx->cv_r[i]); } -#endif } /* -------------------------------------------------------- */ @@ -937,14 +526,14 @@ inline void fin(LSH256_Context* ctx) inline void get_hash(LSH256_Context* ctx, lsh_u8* pbHashVal) { CRYPTOPP_ASSERT(ctx != NULLPTR); - CRYPTOPP_ASSERT(ctx->algtype != 0); + CRYPTOPP_ASSERT(ctx->alg_type != 0); CRYPTOPP_ASSERT(pbHashVal != NULLPTR); - lsh_uint algtype = ctx->algtype; - lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(algtype); - lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(algtype); + lsh_uint alg_type = ctx->alg_type; + lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type); + lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type); - // Multiplying by sizeof(lsh_u8) looks odd... + // Multiplying by looks odd... memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len); if (hash_val_bit_len){ pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len); @@ -956,13 +545,13 @@ inline void get_hash(LSH256_Context* ctx, lsh_u8* pbHashVal) lsh_err lsh256_init(LSH256_Context* ctx) { CRYPTOPP_ASSERT(ctx != NULLPTR); - CRYPTOPP_ASSERT(ctx->algtype != 0); + CRYPTOPP_ASSERT(ctx->alg_type != 0); - lsh_u32 algtype = ctx->algtype; + lsh_u32 alg_type = ctx->alg_type; const lsh_u32* const_v = NULL; ctx->remain_databitlen = 0; - switch (algtype) + switch (alg_type) { case LSH_TYPE_256_256: init256(ctx); @@ -977,13 +566,9 @@ lsh_err lsh256_init(LSH256_Context* ctx) lsh_u32* cv_l = ctx->cv_l; lsh_u32* cv_r = ctx->cv_r; -#if defined(CRYPTOPP_LSH256_AVX_AVAILABLE) - AVX_Cleanup cleanup; -#endif - zero_iv(cv_l, cv_r); cv_l[0] = LSH256_HASH_VAL_MAX_BYTE_LEN; - cv_l[1] = LSH_GET_HASHBIT(algtype); + cv_l[1] = LSH_GET_HASHBIT(alg_type); for (size_t i = 0; i < NUM_STEPS / 2; i++) { @@ -1005,19 +590,20 @@ lsh_err lsh256_update(LSH256_Context* ctx, const lsh_u8* data, size_t databitlen CRYPTOPP_ASSERT(ctx != NULLPTR); CRYPTOPP_ASSERT(data != NULLPTR); CRYPTOPP_ASSERT(databitlen % 8 == 0); - CRYPTOPP_ASSERT(ctx->algtype != 0); + CRYPTOPP_ASSERT(ctx->alg_type != 0); if (databitlen == 0){ return LSH_SUCCESS; } + // We are byte oriented. tail bits will always be 0. size_t databytelen = databitlen >> 3; - lsh_uint pos2 = databitlen & 0x7; + // lsh_uint pos2 = databitlen & 0x7; + const size_t pos2 = 0; - // We are byte oriented. remain_msg_bit will always be 0. 
- lsh_uint remain_msg_byte = ctx->remain_databitlen >> 3; + size_t remain_msg_byte = ctx->remain_databitlen >> 3; // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; - const lsh_uint remain_msg_bit = 0; + const size_t remain_msg_bit = 0; if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){ return LSH_ERR_INVALID_STATE; @@ -1038,7 +624,7 @@ lsh_err lsh256_update(LSH256_Context* ctx, const lsh_u8* data, size_t databitlen } if (remain_msg_byte > 0){ - lsh_uint more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte; + size_t more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte; memcpy(ctx->last_block + remain_msg_byte, data, more_byte); compress(ctx, ctx->last_block); data += more_byte; @@ -1075,10 +661,10 @@ lsh_err lsh256_final(LSH256_Context* ctx, lsh_u8* hashval) CRYPTOPP_ASSERT(ctx != NULLPTR); CRYPTOPP_ASSERT(hashval != NULLPTR); - // We are byte oriented. remain_msg_bit will always be 0. - lsh_uint remain_msg_byte = ctx->remain_databitlen >> 3; + // We are byte oriented. tail bits will always be 0. + size_t remain_msg_byte = ctx->remain_databitlen >> 3; // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; - const lsh_uint remain_msg_bit = 0; + const size_t remain_msg_bit = 0; if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){ return LSH_ERR_INVALID_STATE; @@ -1104,70 +690,126 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) -std::string LSH256_Base::AlgorithmProvider() const -{ -#if defined(CRYPTOPP_LSH256_AVX2_AVAILABLE) - return "AVX2"; -#elif defined(CRYPTOPP_LSH256_AVX_AVAILABLE) - return "AVX"; -#elif defined(CRYPTOPP_LSH256_SSSE3_AVAILABLE) - return "SSSE3"; -#elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE) - return "SSE2"; -#else - return "C++"; +#if defined(CRYPTOPP_ENABLE_64BIT_SSE) +# if defined(CRYPTOPP_AVX2_AVAILABLE) + extern void LSH256_Base_Restart_AVX2(word32* state); + extern void LSH256_Base_Update_AVX2(word32* state, const byte *input, size_t size); + extern void LSH256_Base_TruncatedFinal_AVX2(word32* state, byte *hash, size_t size); +# endif +# if defined(CRYPTOPP_SSSE3_AVAILABLE) + extern void LSH256_Base_Restart_SSSE3(word32* state); + extern void LSH256_Base_Update_SSSE3(word32* state, const byte *input, size_t size); + extern void LSH256_Base_TruncatedFinal_SSSE3(word32* state, byte *hash, size_t size); +# endif #endif -} -void LSH256_Base::Restart() +void LSH256_Base_Restart_CXX(word32* state) { - m_remainingBitLength = 0; - - LSH256_Context ctx(m_state, m_algType, m_remainingBitLength); + state[RemainingBits] = 0; + LSH256_Context ctx(state, state[AlgorithmType], state[RemainingBits]); lsh_err err = lsh256_init(&ctx); if (err != LSH_SUCCESS) throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_init failed"); } -void LSH256_Base::Update(const byte *input, size_t length) +void LSH256_Base_Update_CXX(word32* state, const byte *input, size_t size) { - CRYPTOPP_ASSERT(input != NULLPTR); - CRYPTOPP_ASSERT(length); - - LSH256_Context ctx(m_state, m_algType, m_remainingBitLength); - lsh_err err = lsh256_update(&ctx, input, 8*length); + LSH256_Context ctx(state, state[AlgorithmType], state[RemainingBits]); + lsh_err err = lsh256_update(&ctx, input, 8*size); if (err != LSH_SUCCESS) throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_update failed"); } +void LSH256_Base_TruncatedFinal_CXX(word32* state, byte *hash, size_t) +{ + LSH256_Context ctx(state, state[AlgorithmType], state[RemainingBits]); + lsh_err err = lsh256_final(&ctx, hash); + + if (err != LSH_SUCCESS) + throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_final failed"); +} + 
+std::string LSH256_Base::AlgorithmProvider() const +{ +#if defined(CRYPTOPP_ENABLE_64BIT_SSE) +#if defined(CRYPTOPP_AVX2_AVAILABLE) + if (HasAVX2()) + return "AVX2"; + else +#endif +#if defined(CRYPTOPP_SSSE3_AVAILABLE) + if (HasSSSE3()) + return "SSSE3"; + else +#endif +#endif // CRYPTOPP_ENABLE_64BIT_SSE + + return "C++"; +} + +void LSH256_Base::Restart() +{ +#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasAVX2()) + LSH256_Base_Restart_AVX2(m_state); + else +#endif +#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasSSSE3()) + LSH256_Base_Restart_SSSE3(m_state); + else +#endif + + LSH256_Base_Restart_CXX(m_state); +} + +void LSH256_Base::Update(const byte *input, size_t size) +{ + CRYPTOPP_ASSERT(input != NULLPTR); + CRYPTOPP_ASSERT(size); + +#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasAVX2()) + LSH256_Base_Update_AVX2(m_state, input, size); + else +#endif +#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasSSSE3()) + LSH256_Base_Update_SSSE3(m_state, input, size); + else +#endif + + LSH256_Base_Update_CXX(m_state, input, size); +} + void LSH256_Base::TruncatedFinal(byte *hash, size_t size) { CRYPTOPP_ASSERT(hash != NULLPTR); ThrowIfInvalidTruncatedSize(size); - LSH256_Context ctx(m_state, m_algType, m_remainingBitLength); - lsh_err err; + // TODO: determine if LSH256 supports truncated hashes. See the code + // in get_hash(), where a bit-length is added to the last output + // byte of the hash function. + byte fullHash[LSH256_HASH_VAL_MAX_BYTE_LEN]; + bool copyOut = (size < DigestSize()); - if (size >= DigestSize()) - { - err = lsh256_final(&ctx, hash); - } +#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasAVX2()) + LSH256_Base_TruncatedFinal_AVX2(m_state, copyOut ? fullHash : hash, size); else - { - // TODO: determine if LSH256 supports truncated hashes. See the code - // in get_hash(), where a bit-length is added to the last output - // byte of the hash function. - // CRYPTOPP_ASSERT(0); +#endif +#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasSSSE3()) + LSH256_Base_TruncatedFinal_SSSE3(m_state, copyOut ? fullHash : hash, size); + else +#endif - byte fullHash[HASH_VAL_MAX_WORD_LEN * sizeof(lsh_u32)]; - err = lsh256_final(&ctx, fullHash); + LSH256_Base_TruncatedFinal_CXX(m_state, copyOut ? fullHash : hash, size); + + if (copyOut) memcpy(hash, fullHash, size); - } - - if (err != LSH_SUCCESS) - throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_final failed"); Restart(); } diff --git a/lsh256_avx.cpp b/lsh256_avx.cpp new file mode 100644 index 00000000..d4f7c673 --- /dev/null +++ b/lsh256_avx.cpp @@ -0,0 +1,647 @@ +// lsh.cpp - written and placed in the public domain by Jeffrey Walton +// Based on the specification and source code provided by +// Korea Internet & Security Agency (KISA) website. Also +// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do +// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do. + +// We are hitting some sort of GCC bug in the LSH AVX2 code path. +// Clang is OK on the AVX2 code path. We believe it is GCC Issue +// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It +// makes using zeroupper a little tricky. 
+ +#include "pch.h" +#include "config.h" + +#include "lsh.h" +#include "misc.h" + +#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + +#if defined(CRYPTOPP_AVX2_AVAILABLE) +# include +# include +#endif + +#if defined(__GNUC__) && defined(__amd64__) +# include +#endif + +ANONYMOUS_NAMESPACE_BEGIN + +/* LSH Constants */ + +const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128; +// const unsigned int LSH256_MSG_BLK_BIT_LEN = 1024; +// const unsigned int LSH256_CV_BYTE_LEN = 64; +const unsigned int LSH256_HASH_VAL_MAX_BYTE_LEN = 32; + +// const unsigned int MSG_BLK_WORD_LEN = 32; +const unsigned int CV_WORD_LEN = 16; +const unsigned int CONST_WORD_LEN = 8; +const unsigned int HASH_VAL_MAX_WORD_LEN = 8; +// const unsigned int WORD_BIT_LEN = 32; +const unsigned int NUM_STEPS = 26; + +const unsigned int ROT_EVEN_ALPHA = 29; +const unsigned int ROT_EVEN_BETA = 1; +const unsigned int ROT_ODD_ALPHA = 5; +const unsigned int ROT_ODD_BETA = 17; + +const unsigned int LSH_TYPE_256_256 = 0x0000020; +const unsigned int LSH_TYPE_256_224 = 0x000001C; + +// const unsigned int LSH_TYPE_224 = LSH_TYPE_256_224; +// const unsigned int LSH_TYPE_256 = LSH_TYPE_256_256; + +/* Error Code */ + +const unsigned int LSH_SUCCESS = 0x0; +// const unsigned int LSH_ERR_NULL_PTR = 0x2401; +// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402; +const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403; +const unsigned int LSH_ERR_INVALID_STATE = 0x2404; + +/* Index into our state array */ + +const unsigned int AlgorithmType = 80; +const unsigned int RemainingBits = 81; + +NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) +NAMESPACE_BEGIN(LSH) + +// lsh256.cpp +extern const word32 LSH256_IV224[CV_WORD_LEN]; +extern const word32 LSH256_IV256[CV_WORD_LEN]; +extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS]; + +NAMESPACE_END // LSH +NAMESPACE_END // Crypto++ + +ANONYMOUS_NAMESPACE_BEGIN + +using CryptoPP::byte; +using CryptoPP::word32; +using CryptoPP::rotlFixed; +using CryptoPP::rotlConstant; + +using CryptoPP::GetBlock; +using CryptoPP::LittleEndian; +using CryptoPP::ConditionalByteReverse; +using CryptoPP::LITTLE_ENDIAN_ORDER; + +typedef byte lsh_u8; +typedef word32 lsh_u32; +typedef word32 lsh_uint; +typedef word32 lsh_err; +typedef word32 lsh_type; + +using CryptoPP::LSH::LSH256_IV224; +using CryptoPP::LSH::LSH256_IV256; +using CryptoPP::LSH::LSH256_StepConstants; + +struct LSH256_AVX2_Context +{ + LSH256_AVX2_Context(word32* state, word32 algType, word32& remainingBitLength) : + cv_l(state+0), cv_r(state+8), sub_msgs(state+16), + last_block(reinterpret_cast(state+48)), + remain_databitlen(remainingBitLength), + alg_type(static_cast(algType)) {} + + lsh_u32* cv_l; // start of our state block + lsh_u32* cv_r; + lsh_u32* sub_msgs; + lsh_u8* last_block; + lsh_u32& remain_databitlen; + lsh_type alg_type; +}; + +struct LSH256_AVX2_Internal +{ + LSH256_AVX2_Internal(word32* state) : + submsg_e_l(state+16), submsg_e_r(state+24), + submsg_o_l(state+32), submsg_o_r(state+40) { } + + lsh_u32* submsg_e_l; /* even left sub-message */ + lsh_u32* submsg_e_r; /* even right sub-message */ + lsh_u32* submsg_o_l; /* odd left sub-message */ + lsh_u32* submsg_o_r; /* odd right sub-message */ +}; + +// Zero the upper 128 bits of all YMM registers on exit. +// It avoids AVX state transition penalties when saving state. +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735 +// makes using zeroupper a little tricky. 
+ +struct AVX_Cleanup +{ + ~AVX_Cleanup() { + _mm256_zeroupper(); + } +}; + +// const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; + +/* LSH AlgType Macro */ + +inline bool LSH_IS_LSH512(lsh_uint val) { + return (val & 0xf0000) == 0; +} + +inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { + return val >> 24; +} + +inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { + return val & 0xffff; +} + +inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { + return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); +} + +inline lsh_u32 loadLE32(lsh_u32 v) { + return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v); +} + +lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) { + return rotlFixed(x, r); +} + +// Original code relied upon unaligned lsh_u32 buffer +inline void load_msg_blk(LSH256_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN]) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u32* submsg_e_l = i_state->submsg_e_l; + lsh_u32* submsg_e_r = i_state->submsg_e_r; + lsh_u32* submsg_o_l = i_state->submsg_o_l; + lsh_u32* submsg_o_r = i_state->submsg_o_r; + + _mm256_storeu_si256(M256_CAST(submsg_e_l+0), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+0))); + _mm256_storeu_si256(M256_CAST(submsg_e_r+0), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+32))); + _mm256_storeu_si256(M256_CAST(submsg_o_l+0), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+64))); + _mm256_storeu_si256(M256_CAST(submsg_o_r+0), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+96))); +} + +inline void msg_exp_even(LSH256_AVX2_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u32* submsg_e_l = i_state->submsg_e_l; + lsh_u32* submsg_e_r = i_state->submsg_e_r; + lsh_u32* submsg_o_l = i_state->submsg_o_l; + lsh_u32* submsg_o_r = i_state->submsg_o_r; + + const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514, + 0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c); + + _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi32( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), + _mm256_shuffle_epi8( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), mask))); + _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi32( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), + _mm256_shuffle_epi8( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), mask))); +} + +inline void msg_exp_odd(LSH256_AVX2_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u32* submsg_e_l = i_state->submsg_e_l; + lsh_u32* submsg_e_r = i_state->submsg_e_r; + lsh_u32* submsg_o_l = i_state->submsg_o_l; + lsh_u32* submsg_o_r = i_state->submsg_o_r; + + const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514, + 0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c); + + _mm256_storeu_si256(M256_CAST(submsg_o_l+0), _mm256_add_epi32( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), + _mm256_shuffle_epi8( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), mask))); + _mm256_storeu_si256(M256_CAST(submsg_o_r+0), _mm256_add_epi32( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), + _mm256_shuffle_epi8( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), mask))); +} + +inline void load_sc(const lsh_u32** p_const_v, size_t i) +{ + CRYPTOPP_ASSERT(p_const_v != NULLPTR); + + *p_const_v = &LSH256_StepConstants[i]; +} + +inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_AVX2_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u32* submsg_e_l = i_state->submsg_e_l; + lsh_u32* submsg_e_r = i_state->submsg_e_r; + + 
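+    // cv_l ^= submsg_e_l and cv_r ^= submsg_e_r, one 256-bit XOR per 8-word half.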
_mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)))); + _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)))); +} + +inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_AVX2_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u32* submsg_o_l = i_state->submsg_o_l; + lsh_u32* submsg_o_r = i_state->submsg_o_r; + + _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_l)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l)))); + _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_r)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r)))); +} + +inline void add_blk(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) +{ + _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi32( + _mm256_loadu_si256(CONST_M256_CAST(cv_l)), + _mm256_loadu_si256(CONST_M256_CAST(cv_r)))); +} + +template +inline void rotate_blk(lsh_u32 cv[8]) +{ + _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256( + _mm256_slli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), R), + _mm256_srli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), 32-R))); +} + +inline void xor_with_const(lsh_u32 cv_l[8], const lsh_u32 const_v[8]) +{ + _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_l)), + _mm256_loadu_si256(CONST_M256_CAST(const_v)))); +} + +inline void rotate_msg_gamma(lsh_u32 cv_r[8]) +{ + // g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; + _mm256_storeu_si256(M256_CAST(cv_r+0), + _mm256_shuffle_epi8(_mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), + _mm256_set_epi8( + /* hi lane */ 15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1, + /* lo lane */ 12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0))); +} + +inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) +{ + __m256i temp = _mm256_shuffle_epi32( + _mm256_loadu_si256(CONST_M256_CAST(cv_l)), _MM_SHUFFLE(3,1,0,2)); + _mm256_storeu_si256(M256_CAST(cv_r), + _mm256_shuffle_epi32( + _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(1,2,3,0))); + _mm256_storeu_si256(M256_CAST(cv_l), + _mm256_permute2x128_si256(temp, + _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,3,0,1))); + _mm256_storeu_si256(M256_CAST(cv_r), + _mm256_permute2x128_si256(temp, + _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,2,0,0))); +}; + +/* -------------------------------------------------------- * +* step function +* -------------------------------------------------------- */ + +template +inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8]) +{ + add_blk(cv_l, cv_r); + rotate_blk(cv_l); + xor_with_const(cv_l, const_v); + add_blk(cv_r, cv_l); + rotate_blk(cv_r); + add_blk(cv_l, cv_r); + rotate_msg_gamma(cv_r); +} + +/* -------------------------------------------------------- * +* compression function +* -------------------------------------------------------- */ + +inline void compress(LSH256_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_BYTE_LEN]) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + LSH256_AVX2_Internal s_state(ctx->cv_l); + LSH256_AVX2_Internal* i_state = &s_state; + + const lsh_u32* const_v = NULL; + lsh_u32* cv_l = ctx->cv_l; + lsh_u32* cv_r = ctx->cv_r; + + load_msg_blk(i_state, pdMsgBlk); + + msg_add_even(cv_l, cv_r, i_state); + load_sc(&const_v, 0); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + msg_add_odd(cv_l, cv_r, 
i_state); + load_sc(&const_v, 8); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + for (size_t i = 1; i < NUM_STEPS / 2; i++) + { + msg_exp_even(i_state); + msg_add_even(cv_l, cv_r, i_state); + load_sc(&const_v, 16 * i); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + msg_exp_odd(i_state); + msg_add_odd(cv_l, cv_r, i_state); + load_sc(&const_v, 16 * i + 8); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + } + + msg_exp_even(i_state); + msg_add_even(cv_l, cv_r, i_state); +} + +/* -------------------------------------------------------- */ + +inline void load_iv(word32 cv_l[8], word32 cv_r[8], const word32 iv[16]) +{ + // The IV's are 32-byte aligned so we can use aligned loads. + _mm256_storeu_si256(M256_CAST(cv_l+0), + _mm256_load_si256(CONST_M256_CAST(iv+0))); + _mm256_storeu_si256(M256_CAST(cv_r+0), + _mm256_load_si256(CONST_M256_CAST(iv+8))); +} + +inline void zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) +{ + _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256()); + _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256()); +} + +inline void zero_submsgs(LSH256_AVX2_Context* ctx) +{ + lsh_u32* sub_msgs = ctx->sub_msgs; + + _mm256_storeu_si256(M256_CAST(sub_msgs+ 0), _mm256_setzero_si256()); + _mm256_storeu_si256(M256_CAST(sub_msgs+ 8), _mm256_setzero_si256()); + _mm256_storeu_si256(M256_CAST(sub_msgs+16), _mm256_setzero_si256()); + _mm256_storeu_si256(M256_CAST(sub_msgs+24), _mm256_setzero_si256()); +} + +inline void init224(LSH256_AVX2_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + zero_submsgs(ctx); + load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV224); +} + +inline void init256(LSH256_AVX2_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + zero_submsgs(ctx); + load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV256); +} + +/* -------------------------------------------------------- */ + +inline void fin(LSH256_AVX2_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)), + _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0)))); +} + +/* -------------------------------------------------------- */ + +inline void get_hash(LSH256_AVX2_Context* ctx, lsh_u8* pbHashVal) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(ctx->alg_type != 0); + CRYPTOPP_ASSERT(pbHashVal != NULLPTR); + + lsh_uint alg_type = ctx->alg_type; + lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type); + lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type); + + // Multiplying by looks odd... + memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len); + if (hash_val_bit_len){ + pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len); + } +} + +/* -------------------------------------------------------- */ + +lsh_err lsh256_init_avx2(LSH256_AVX2_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(ctx->alg_type != 0); + + lsh_u32 alg_type = ctx->alg_type; + const lsh_u32* const_v = NULL; + ctx->remain_databitlen = 0; + + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. 
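+    // The guard below issues _mm256_zeroupper() on every return path from this
+    // function (see AVX_Cleanup above), sidestepping the GCC zeroupper placement issue.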
+ AVX_Cleanup cleanup; + + switch (alg_type) + { + case LSH_TYPE_256_256: + init256(ctx); + return LSH_SUCCESS; + case LSH_TYPE_256_224: + init224(ctx); + return LSH_SUCCESS; + default: + break; + } + + lsh_u32* cv_l = ctx->cv_l; + lsh_u32* cv_r = ctx->cv_r; + + zero_iv(cv_l, cv_r); + cv_l[0] = LSH256_HASH_VAL_MAX_BYTE_LEN; + cv_l[1] = LSH_GET_HASHBIT(alg_type); + + for (size_t i = 0; i < NUM_STEPS / 2; i++) + { + //Mix + load_sc(&const_v, i * 16); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + load_sc(&const_v, i * 16 + 8); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + } + + return LSH_SUCCESS; +} + +lsh_err lsh256_update_avx2(LSH256_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(data != NULLPTR); + CRYPTOPP_ASSERT(databitlen % 8 == 0); + CRYPTOPP_ASSERT(ctx->alg_type != 0); + + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. + AVX_Cleanup cleanup; + + if (databitlen == 0){ + return LSH_SUCCESS; + } + + // We are byte oriented. tail bits will always be 0. + size_t databytelen = databitlen >> 3; + // lsh_uint pos2 = databitlen & 0x7; + const size_t pos2 = 0; + + size_t remain_msg_byte = ctx->remain_databitlen >> 3; + // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; + const size_t remain_msg_bit = 0; + + if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){ + return LSH_ERR_INVALID_STATE; + } + if (remain_msg_bit > 0){ + return LSH_ERR_INVALID_DATABITLEN; + } + + if (databytelen + remain_msg_byte < LSH256_MSG_BLK_BYTE_LEN) + { + memcpy(ctx->last_block + remain_msg_byte, data, databytelen); + ctx->remain_databitlen += (lsh_uint)databitlen; + remain_msg_byte += (lsh_uint)databytelen; + if (pos2){ + ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff); + } + return LSH_SUCCESS; + } + + if (remain_msg_byte > 0){ + size_t more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte; + memcpy(ctx->last_block + remain_msg_byte, data, more_byte); + compress(ctx, ctx->last_block); + data += more_byte; + databytelen -= more_byte; + remain_msg_byte = 0; + ctx->remain_databitlen = 0; + } + + while (databytelen >= LSH256_MSG_BLK_BYTE_LEN) + { + // This call to compress caused some trouble. + // The data pointer can become unaligned in the + // previous block. + compress(ctx, data); + data += LSH256_MSG_BLK_BYTE_LEN; + databytelen -= LSH256_MSG_BLK_BYTE_LEN; + } + + if (databytelen > 0){ + memcpy(ctx->last_block, data, databytelen); + ctx->remain_databitlen = (lsh_uint)(databytelen << 3); + } + + if (pos2){ + ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff); + ctx->remain_databitlen += pos2; + } + + return LSH_SUCCESS; +} + +lsh_err lsh256_final_avx2(LSH256_AVX2_Context* ctx, lsh_u8* hashval) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(hashval != NULLPTR); + + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. + AVX_Cleanup cleanup; + + // We are byte oriented. tail bits will always be 0. 
+ size_t remain_msg_byte = ctx->remain_databitlen >> 3; + // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; + const size_t remain_msg_bit = 0; + + if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){ + return LSH_ERR_INVALID_STATE; + } + + if (remain_msg_bit){ + ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit)); + } + else{ + ctx->last_block[remain_msg_byte] = 0x80; + } + memset(ctx->last_block + remain_msg_byte + 1, 0, LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte - 1); + + compress(ctx, ctx->last_block); + + fin(ctx); + get_hash(ctx, hashval); + + return LSH_SUCCESS; +} + +ANONYMOUS_NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) + +extern +void LSH256_Base_Restart_AVX2(word32* state) +{ + state[RemainingBits] = 0; + LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]); + lsh_err err = lsh256_init_avx2(&ctx); + + if (err != LSH_SUCCESS) + throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_init_avx2 failed"); +} + +extern +void LSH256_Base_Update_AVX2(word32* state, const byte *input, size_t size) +{ + LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]); + lsh_err err = lsh256_update_avx2(&ctx, input, 8*size); + + if (err != LSH_SUCCESS) + throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_update_avx2 failed"); +} + +extern +void LSH256_Base_TruncatedFinal_AVX2(word32* state, byte *hash, size_t) +{ + LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]); + lsh_err err = lsh256_final_avx2(&ctx, hash); + + if (err != LSH_SUCCESS) + throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_final_avx2 failed"); +} + +NAMESPACE_END + +#endif // CRYPTOPP_AVX2_AVAILABLE diff --git a/lsh256_sse.cpp b/lsh256_sse.cpp new file mode 100644 index 00000000..827d46f4 --- /dev/null +++ b/lsh256_sse.cpp @@ -0,0 +1,709 @@ +// lsh.cpp - written and placed in the public domain by Jeffrey Walton +// Based on the specification and source code provided by +// Korea Internet & Security Agency (KISA) website. Also +// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do +// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do. + +// We are hitting some sort of GCC bug in the LSH AVX2 code path. +// Clang is OK on the AVX2 code path. We believe it is GCC Issue +// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It +// makes using zeroupper a little tricky. 
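Reader's note: a scalar reference for rotate_msg_gamma, defined later in this file, may help. Every entry of g_gamma256 is a multiple of 8, so each 32-bit rotation is a pure byte permutation, which is why a single _mm_shuffle_epi8 per 128-bit half suffices. The sketch below is illustrative only and not part of the patch.

    #include <cstdint>

    // Same table as g_gamma256 below: rotation amounts for the right chaining words.
    static const unsigned int kGamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };

    static inline uint32_t rotl32(uint32_t x, unsigned int r)
    {
        return r ? (uint32_t)((x << r) | (x >> (32u - r))) : x;
    }

    // Scalar equivalent of rotate_msg_gamma: cv_r[i] = ROTL(cv_r[i], gamma[i]).
    static inline void rotate_msg_gamma_ref(uint32_t cv_r[8])
    {
        for (unsigned int i = 0; i < 8; ++i)
            cv_r[i] = rotl32(cv_r[i], kGamma256[i]);
    }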
+ +#include "pch.h" +#include "config.h" + +#include "lsh.h" +#include "cpu.h" +#include "misc.h" + +#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + +#if defined(CRYPTOPP_SSSE3_AVAILABLE) +# include +# include +#endif + +#if defined(CRYPTOPP_XOP_AVAILABLE) +# include +#endif + +#if defined(__GNUC__) && defined(__amd64__) +# include +#endif + +ANONYMOUS_NAMESPACE_BEGIN + +/* LSH Constants */ + +const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128; +// const unsigned int LSH256_MSG_BLK_BIT_LEN = 1024; +// const unsigned int LSH256_CV_BYTE_LEN = 64; +const unsigned int LSH256_HASH_VAL_MAX_BYTE_LEN = 32; + +// const unsigned int MSG_BLK_WORD_LEN = 32; +const unsigned int CV_WORD_LEN = 16; +const unsigned int CONST_WORD_LEN = 8; +const unsigned int HASH_VAL_MAX_WORD_LEN = 8; +// const unsigned int WORD_BIT_LEN = 32; +const unsigned int NUM_STEPS = 26; + +const unsigned int ROT_EVEN_ALPHA = 29; +const unsigned int ROT_EVEN_BETA = 1; +const unsigned int ROT_ODD_ALPHA = 5; +const unsigned int ROT_ODD_BETA = 17; + +const unsigned int LSH_TYPE_256_256 = 0x0000020; +const unsigned int LSH_TYPE_256_224 = 0x000001C; + +// const unsigned int LSH_TYPE_224 = LSH_TYPE_256_224; +// const unsigned int LSH_TYPE_256 = LSH_TYPE_256_256; + +/* Error Code */ + +const unsigned int LSH_SUCCESS = 0x0; +// const unsigned int LSH_ERR_NULL_PTR = 0x2401; +// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402; +const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403; +const unsigned int LSH_ERR_INVALID_STATE = 0x2404; + +/* Index into our state array */ + +const unsigned int AlgorithmType = 80; +const unsigned int RemainingBits = 81; + +NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) +NAMESPACE_BEGIN(LSH) + +// lsh256.cpp +extern const word32 LSH256_IV224[CV_WORD_LEN]; +extern const word32 LSH256_IV256[CV_WORD_LEN]; +extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS]; + +NAMESPACE_END // LSH +NAMESPACE_END // Crypto++ + +ANONYMOUS_NAMESPACE_BEGIN + +using CryptoPP::byte; +using CryptoPP::word32; +using CryptoPP::rotlFixed; +using CryptoPP::rotlConstant; + +using CryptoPP::GetBlock; +using CryptoPP::LittleEndian; +using CryptoPP::ConditionalByteReverse; +using CryptoPP::LITTLE_ENDIAN_ORDER; + +typedef byte lsh_u8; +typedef word32 lsh_u32; +typedef word32 lsh_uint; +typedef word32 lsh_err; +typedef word32 lsh_type; + +using CryptoPP::LSH::LSH256_IV224; +using CryptoPP::LSH::LSH256_IV256; +using CryptoPP::LSH::LSH256_StepConstants; + +struct LSH256_SSSE3_Context +{ + LSH256_SSSE3_Context(word32* state, word32 algType, word32& remainingBitLength) : + cv_l(state+0), cv_r(state+8), sub_msgs(state+16), + last_block(reinterpret_cast(state+48)), + remain_databitlen(remainingBitLength), + alg_type(static_cast(algType)) {} + + lsh_u32* cv_l; // start of our state block + lsh_u32* cv_r; + lsh_u32* sub_msgs; + lsh_u8* last_block; + lsh_u32& remain_databitlen; + lsh_type alg_type; +}; + +struct LSH256_SSSE3_Internal +{ + LSH256_SSSE3_Internal(word32* state) : + submsg_e_l(state+16), submsg_e_r(state+24), + submsg_o_l(state+32), submsg_o_r(state+40) { } + + lsh_u32* submsg_e_l; /* even left sub-message */ + lsh_u32* submsg_e_r; /* even right sub-message */ + lsh_u32* submsg_o_l; /* odd left sub-message */ + lsh_u32* submsg_o_r; /* odd right sub-message */ +}; + +const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; + +/* LSH AlgType Macro */ + +inline bool LSH_IS_LSH512(lsh_uint val) { + return (val & 0xf0000) == 0; +} + +inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { + return val >> 
24; +} + +inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { + return val & 0xffff; +} + +inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { + return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); +} + +inline lsh_u32 loadLE32(lsh_u32 v) { + return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v); +} + +lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) { + return rotlFixed(x, r); +} + +// Original code relied upon unaligned lsh_u32 buffer +inline void load_msg_blk(LSH256_SSSE3_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN]) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + lsh_u32* submsg_e_l = i_state->submsg_e_l; + lsh_u32* submsg_e_r = i_state->submsg_e_r; + lsh_u32* submsg_o_l = i_state->submsg_o_l; + lsh_u32* submsg_o_r = i_state->submsg_o_r; + + _mm_storeu_si128(M128_CAST(submsg_e_l+0), + _mm_loadu_si128(CONST_M128_CAST(msgblk+0))); + _mm_storeu_si128(M128_CAST(submsg_e_l+4), + _mm_loadu_si128(CONST_M128_CAST(msgblk+16))); + _mm_storeu_si128(M128_CAST(submsg_e_r+0), + _mm_loadu_si128(CONST_M128_CAST(msgblk+32))); + _mm_storeu_si128(M128_CAST(submsg_e_r+4), + _mm_loadu_si128(CONST_M128_CAST(msgblk+48))); + _mm_storeu_si128(M128_CAST(submsg_o_l+0), + _mm_loadu_si128(CONST_M128_CAST(msgblk+64))); + _mm_storeu_si128(M128_CAST(submsg_o_l+4), + _mm_loadu_si128(CONST_M128_CAST(msgblk+80))); + _mm_storeu_si128(M128_CAST(submsg_o_r+0), + _mm_loadu_si128(CONST_M128_CAST(msgblk+96))); + _mm_storeu_si128(M128_CAST(submsg_o_r+4), + _mm_loadu_si128(CONST_M128_CAST(msgblk+112))); +} + +inline void msg_exp_even(LSH256_SSSE3_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u32* submsg_e_l = i_state->submsg_e_l; + lsh_u32* submsg_e_r = i_state->submsg_e_r; + lsh_u32* submsg_o_l = i_state->submsg_o_l; + lsh_u32* submsg_o_r = i_state->submsg_o_r; + + _mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi32( + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(3,2,1,0)), + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(1,0,2,3)))); + + _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_add_epi32( + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), _MM_SHUFFLE(3,2,1,0)), + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), _MM_SHUFFLE(2,1,0,3)))); + + _mm_storeu_si128(M128_CAST(submsg_e_r+0), _mm_add_epi32( + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), _MM_SHUFFLE(3,2,1,0)), + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), _MM_SHUFFLE(1,0,2,3)))); + + _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_add_epi32( + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), _MM_SHUFFLE(3,2,1,0)), + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), _MM_SHUFFLE(2,1,0,3)))); +} + +inline void msg_exp_odd(LSH256_SSSE3_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u32* submsg_e_l = i_state->submsg_e_l; + lsh_u32* submsg_e_r = i_state->submsg_e_r; + lsh_u32* submsg_o_l = i_state->submsg_o_l; + lsh_u32* submsg_o_r = i_state->submsg_o_r; + + _mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi32( + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(3,2,1,0)), + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(1,0,2,3)))); + + _mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_add_epi32( + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), _MM_SHUFFLE(3,2,1,0)), + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), 
_MM_SHUFFLE(2,1,0,3)))); + + _mm_storeu_si128(M128_CAST(submsg_o_r+0), _mm_add_epi32( + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), _MM_SHUFFLE(3,2,1,0)), + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), _MM_SHUFFLE(1,0,2,3)))); + + _mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_add_epi32( + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), _MM_SHUFFLE(3,2,1,0)), + _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), _MM_SHUFFLE(2,1,0,3)))); +} + +inline void load_sc(const lsh_u32** p_const_v, size_t i) +{ + CRYPTOPP_ASSERT(p_const_v != NULLPTR); + + *p_const_v = &LSH256_StepConstants[i]; +} + +inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_SSSE3_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u32* submsg_e_l = i_state->submsg_e_l; + lsh_u32* submsg_e_r = i_state->submsg_e_r; + + _mm_storeu_si128(M128_CAST(cv_l+0), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_l+0)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)))); + _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); + _mm_storeu_si128(M128_CAST(cv_r+0), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_r+0)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)))); + _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); +} + +inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_SSSE3_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u32* submsg_o_l = i_state->submsg_o_l; + lsh_u32* submsg_o_r = i_state->submsg_o_r; + + _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_l)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l)))); + _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)))); + _mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_r)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r)))); + _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)))); +} + +inline void add_blk(lsh_u32 cv_l[8], const lsh_u32 cv_r[8]) +{ + _mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi32( + _mm_loadu_si128(CONST_M128_CAST(cv_l)), + _mm_loadu_si128(CONST_M128_CAST(cv_r)))); + _mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi32( + _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), + _mm_loadu_si128(CONST_M128_CAST(cv_r+4)))); +} + +template +inline void rotate_blk(lsh_u32 cv[8]) +{ +#if defined(CRYPTOPP_XOP_AVAILABLE) + _mm_storeu_si128(M128_CAST(cv), + _mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R)); + _mm_storeu_si128(M128_CAST(cv+4), + _mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R)); +#else + _mm_storeu_si128(M128_CAST(cv), _mm_or_si128( + _mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R), + _mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), 32-R))); + _mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128( + _mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R), + _mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), 32-R))); +#endif +} + +inline void xor_with_const(lsh_u32* cv_l, const lsh_u32* const_v) +{ + _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_l)), + _mm_loadu_si128(CONST_M128_CAST(const_v)))); + 
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), + _mm_loadu_si128(CONST_M128_CAST(const_v+4)))); +} + +inline void rotate_msg_gamma(lsh_u32 cv_r[8]) +{ + // g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; + _mm_storeu_si128(M128_CAST(cv_r+0), + _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)), + _mm_set_epi8(12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0))); + _mm_storeu_si128(M128_CAST(cv_r+4), + _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)), + _mm_set_epi8(15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1))); +} + +inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) +{ + _mm_storeu_si128(M128_CAST(cv_l+0), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2))); + _mm_storeu_si128(M128_CAST(cv_l+4), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2))); + _mm_storeu_si128(M128_CAST(cv_r+0), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0))); + _mm_storeu_si128(M128_CAST(cv_r+4), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0))); + + __m128i temp = _mm_loadu_si128(CONST_M128_CAST(cv_l+0)); + _mm_storeu_si128(M128_CAST(cv_l+0), + _mm_loadu_si128(CONST_M128_CAST(cv_l+4))); + _mm_storeu_si128(M128_CAST(cv_l+4), + _mm_loadu_si128(CONST_M128_CAST(cv_r+4))); + _mm_storeu_si128(M128_CAST(cv_r+4), + _mm_loadu_si128(CONST_M128_CAST(cv_r+0))); + _mm_storeu_si128(M128_CAST(cv_r+0), temp); +}; + +/* -------------------------------------------------------- * +* step function +* -------------------------------------------------------- */ + +template +inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8]) +{ + add_blk(cv_l, cv_r); + rotate_blk(cv_l); + xor_with_const(cv_l, const_v); + add_blk(cv_r, cv_l); + rotate_blk(cv_r); + add_blk(cv_l, cv_r); + rotate_msg_gamma(cv_r); +} + +/* -------------------------------------------------------- * +* compression function +* -------------------------------------------------------- */ + +inline void compress(LSH256_SSSE3_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_BYTE_LEN]) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + LSH256_SSSE3_Internal s_state(ctx->cv_l); + LSH256_SSSE3_Internal* i_state = &s_state; + + const lsh_u32* const_v = NULL; + lsh_u32* cv_l = ctx->cv_l; + lsh_u32* cv_r = ctx->cv_r; + + load_msg_blk(i_state, pdMsgBlk); + + msg_add_even(cv_l, cv_r, i_state); + load_sc(&const_v, 0); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + msg_add_odd(cv_l, cv_r, i_state); + load_sc(&const_v, 8); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + for (size_t i = 1; i < NUM_STEPS / 2; i++) + { + msg_exp_even(i_state); + msg_add_even(cv_l, cv_r, i_state); + load_sc(&const_v, 16 * i); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + msg_exp_odd(i_state); + msg_add_odd(cv_l, cv_r, i_state); + load_sc(&const_v, 16 * i + 8); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + } + + msg_exp_even(i_state); + msg_add_even(cv_l, cv_r, i_state); +} + +/* -------------------------------------------------------- */ + +inline void load_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 iv[16]) +{ + _mm_storeu_si128(M128_CAST(cv_l+ 0), + _mm_load_si128(CONST_M128_CAST(iv+ 0))); + _mm_storeu_si128(M128_CAST(cv_l+ 4), + _mm_load_si128(CONST_M128_CAST(iv+ 4))); + _mm_storeu_si128(M128_CAST(cv_r+ 0), + _mm_load_si128(CONST_M128_CAST(iv+ 8))); + _mm_storeu_si128(M128_CAST(cv_r+ 4), + _mm_load_si128(CONST_M128_CAST(iv+12))); +} + +inline void 
zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) +{ + _mm_storeu_si128(M128_CAST(cv_l+0), _mm_setzero_si128()); + _mm_storeu_si128(M128_CAST(cv_l+4), _mm_setzero_si128()); + _mm_storeu_si128(M128_CAST(cv_r+0), _mm_setzero_si128()); + _mm_storeu_si128(M128_CAST(cv_r+4), _mm_setzero_si128()); +} + +inline void zero_submsgs(LSH256_SSSE3_Context* ctx) +{ + lsh_u32* sub_msgs = ctx->sub_msgs; + + _mm_storeu_si128(M128_CAST(sub_msgs+ 0), _mm_setzero_si128()); + _mm_storeu_si128(M128_CAST(sub_msgs+ 4), _mm_setzero_si128()); + _mm_storeu_si128(M128_CAST(sub_msgs+ 8), _mm_setzero_si128()); + _mm_storeu_si128(M128_CAST(sub_msgs+12), _mm_setzero_si128()); + _mm_storeu_si128(M128_CAST(sub_msgs+16), _mm_setzero_si128()); + _mm_storeu_si128(M128_CAST(sub_msgs+20), _mm_setzero_si128()); + _mm_storeu_si128(M128_CAST(sub_msgs+24), _mm_setzero_si128()); + _mm_storeu_si128(M128_CAST(sub_msgs+28), _mm_setzero_si128()); +} + +inline void init224(LSH256_SSSE3_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + zero_submsgs(ctx); + load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV224); +} + +inline void init256(LSH256_SSSE3_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + zero_submsgs(ctx); + load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV256); +} + +/* -------------------------------------------------------- */ + +inline void fin(LSH256_SSSE3_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + _mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)), + _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0)))); + _mm_storeu_si128(M128_CAST(ctx->cv_l+4), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+4)), + _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+4)))); +} + +/* -------------------------------------------------------- */ + +inline void get_hash(LSH256_SSSE3_Context* ctx, lsh_u8* pbHashVal) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(ctx->alg_type != 0); + CRYPTOPP_ASSERT(pbHashVal != NULLPTR); + + lsh_uint alg_type = ctx->alg_type; + lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type); + lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type); + + // Multiplying by sizeof(lsh_u8) looks odd... 
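+    // ...but sizeof(lsh_u8) is 1, so exactly hash_val_byte_len bytes of cv_l are copied.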
+ memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len); + if (hash_val_bit_len){ + pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len); + } +} + +/* -------------------------------------------------------- */ + +lsh_err lsh256_ssse3_init(LSH256_SSSE3_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(ctx->alg_type != 0); + + lsh_u32 alg_type = ctx->alg_type; + const lsh_u32* const_v = NULL; + ctx->remain_databitlen = 0; + + switch (alg_type) + { + case LSH_TYPE_256_256: + init256(ctx); + return LSH_SUCCESS; + case LSH_TYPE_256_224: + init224(ctx); + return LSH_SUCCESS; + default: + break; + } + + lsh_u32* cv_l = ctx->cv_l; + lsh_u32* cv_r = ctx->cv_r; + + zero_iv(cv_l, cv_r); + cv_l[0] = LSH256_HASH_VAL_MAX_BYTE_LEN; + cv_l[1] = LSH_GET_HASHBIT(alg_type); + + for (size_t i = 0; i < NUM_STEPS / 2; i++) + { + //Mix + load_sc(&const_v, i * 16); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + load_sc(&const_v, i * 16 + 8); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + } + + return LSH_SUCCESS; +} + +lsh_err lsh256_ssse3_update(LSH256_SSSE3_Context* ctx, const lsh_u8* data, size_t databitlen) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(data != NULLPTR); + CRYPTOPP_ASSERT(databitlen % 8 == 0); + CRYPTOPP_ASSERT(ctx->alg_type != 0); + + if (databitlen == 0){ + return LSH_SUCCESS; + } + + // We are byte oriented. tail bits will always be 0. + size_t databytelen = databitlen >> 3; + // lsh_uint pos2 = databitlen & 0x7; + const size_t pos2 = 0; + + size_t remain_msg_byte = ctx->remain_databitlen >> 3; + // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; + const size_t remain_msg_bit = 0; + + if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){ + return LSH_ERR_INVALID_STATE; + } + if (remain_msg_bit > 0){ + return LSH_ERR_INVALID_DATABITLEN; + } + + if (databytelen + remain_msg_byte < LSH256_MSG_BLK_BYTE_LEN) + { + memcpy(ctx->last_block + remain_msg_byte, data, databytelen); + ctx->remain_databitlen += (lsh_uint)databitlen; + remain_msg_byte += (lsh_uint)databytelen; + if (pos2){ + ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff); + } + return LSH_SUCCESS; + } + + if (remain_msg_byte > 0){ + size_t more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte; + memcpy(ctx->last_block + remain_msg_byte, data, more_byte); + compress(ctx, ctx->last_block); + data += more_byte; + databytelen -= more_byte; + remain_msg_byte = 0; + ctx->remain_databitlen = 0; + } + + while (databytelen >= LSH256_MSG_BLK_BYTE_LEN) + { + // This call to compress caused some trouble. + // The data pointer can become unaligned in the + // previous block. + compress(ctx, data); + data += LSH256_MSG_BLK_BYTE_LEN; + databytelen -= LSH256_MSG_BLK_BYTE_LEN; + } + + if (databytelen > 0){ + memcpy(ctx->last_block, data, databytelen); + ctx->remain_databitlen = (lsh_uint)(databytelen << 3); + } + + if (pos2){ + ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff); + ctx->remain_databitlen += pos2; + } + + return LSH_SUCCESS; +} + +lsh_err lsh256_ssse3_final(LSH256_SSSE3_Context* ctx, lsh_u8* hashval) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(hashval != NULLPTR); + + // We are byte oriented. tail bits will always be 0. 
+ size_t remain_msg_byte = ctx->remain_databitlen >> 3; + // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; + const size_t remain_msg_bit = 0; + + if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){ + return LSH_ERR_INVALID_STATE; + } + + if (remain_msg_bit){ + ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit)); + } + else{ + ctx->last_block[remain_msg_byte] = 0x80; + } + memset(ctx->last_block + remain_msg_byte + 1, 0, LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte - 1); + + compress(ctx, ctx->last_block); + + fin(ctx); + get_hash(ctx, hashval); + + return LSH_SUCCESS; +} + +ANONYMOUS_NAMESPACE_END // Anonymous + +NAMESPACE_BEGIN(CryptoPP) + +extern +void LSH256_Base_Restart_SSSE3(word32* state) +{ + state[RemainingBits] = 0; + LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]); + lsh_err err = lsh256_ssse3_init(&ctx); + + if (err != LSH_SUCCESS) + throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_init failed"); +} + +extern +void LSH256_Base_Update_SSSE3(word32* state, const byte *input, size_t size) +{ + LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]); + lsh_err err = lsh256_ssse3_update(&ctx, input, 8*size); + + if (err != LSH_SUCCESS) + throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_update failed"); +} + +extern +void LSH256_Base_TruncatedFinal_SSSE3(word32* state, byte *hash, size_t) +{ + LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]); + lsh_err err = lsh256_ssse3_final(&ctx, hash); + + if (err != LSH_SUCCESS) + throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_final failed"); +} + +NAMESPACE_END + +#endif // CRYPTOPP_SSSE3_AVAILABLE diff --git a/lsh512.cpp b/lsh512.cpp index a810b616..8f24f3ba 100644 --- a/lsh512.cpp +++ b/lsh512.cpp @@ -4,142 +4,21 @@ // see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do // and https://seed.kisa.or.kr/kisa/Board/22/detailView.do. -// The source file below uses GCC's function multiversioning to -// speed up a rotate. When the rotate is performed with the SSE -// unit there's a 2.5 to 3.0 cpb profit. AVX and AVX2 code paths -// slow down with multiversioning. It looks like GCC inserts calls -// to zeroupper() in each AVX function rather than deferring until -// the end of Restart(), Update() or Final(). That mistake costs -// about 3 cpb. +// We are hitting some sort of GCC bug in the LSH AVX2 code path. +// Clang is OK on the AVX2 code path. We believe it is GCC Issue +// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It +// makes using zeroupper a little tricky. -// Function multiversioning does not work with Clang. Enabling it for -// LLVM Clang 7.0 and above resulted in linker errors. Also see -// https://bugs.llvm.org/show_bug.cgi?id=50025. - -// We are hitting some sort of GCC bug in the LSH256 AVX2 code path. -// Clang is OK on the AVX2 code path. When we enable AVX2 for -// rotate_msg_gamma, msg_exp_even and msg_exp_odd, then GCC arrives -// at the wrong result. Making any one of the functions SSE2 clears -// the problem. See CRYPTOPP_WORKAROUND_AVX2_BUG below. - -// TODO: cut-over to a *_simd.cpp file for proper runtime dispatching. 
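Reader's note: the AlgType helpers kept in this file (LSH_GET_HASHBYTE, LSH_GET_SMALL_HASHBIT, LSH_GET_HASHBIT) pack the digest size into the algorithm type word: the low 16 bits hold the digest length in bytes and the top byte holds a small bit adjustment to subtract. A worked sketch using the LSH-256 type constants from the files above (illustrative only, not part of the patch):

    #include <cassert>

    typedef unsigned int lsh_uint;

    inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { return val >> 24; }
    inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val)      { return val & 0xffff; }
    inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
        return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
    }

    int main()
    {
        const lsh_uint LSH_TYPE_256_256 = 0x0000020;  // 32-byte digest
        const lsh_uint LSH_TYPE_256_224 = 0x000001C;  // 28-byte digest

        assert(LSH_GET_HASHBYTE(LSH_TYPE_256_256) == 32);
        assert(LSH_GET_HASHBIT(LSH_TYPE_256_256) == 256);
        assert(LSH_GET_HASHBYTE(LSH_TYPE_256_224) == 28);
        assert(LSH_GET_HASHBIT(LSH_TYPE_256_224) == 224);
        return 0;
    }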
#include "pch.h" #include "config.h" #include "lsh.h" +#include "cpu.h" #include "misc.h" -// Only enable the intrinsics for 64-bit machines -#ifndef CRYPTOPP_DISABLE_ASM -# if (defined(__SSE2__) && defined(__amd64__)) || (defined(_MSC_VER) && defined(_M_X64)) -# define CRYPTOPP_LSH512_SSE2_AVAILABLE 1 -# endif -# if defined(__SSSE3__) && defined(__amd64__) -# define CRYPTOPP_LSH512_SSSE3_AVAILABLE 1 -# endif -# if defined(__XOP__) && defined(__amd64__) -# define CRYPTOPP_LSH512_XOP_AVAILABLE 1 -# endif -# if defined(__AVX__) && defined(__amd64__) -# define CRYPTOPP_LSH512_AVX_AVAILABLE 1 -# endif -# if defined(__AVX2__) && defined(__amd64__) -# define CRYPTOPP_LSH512_AVX2_AVAILABLE 1 -# endif -#endif - -#if defined(CRYPTOPP_LSH512_SSSE3_AVAILABLE) -# include -#endif - -#if defined(CRYPTOPP_LSH512_XOP_AVAILABLE) -# include -#endif - -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) -# include -#endif - -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) -# include -#endif - -#if defined(CRYPTOPP_HAVE_ATTRIBUTE_TARGET) -# include -#endif - -#if defined(__GNUC__) && defined(__amd64__) -# include -#endif - -// Use GCC_VERSION to avoid Clang, ICC and other impostors -#if defined(CRYPTOPP_GCC_VERSION) -# define CRYPTOPP_WORKAROUND_AVX2_BUG 1 -#endif - ANONYMOUS_NAMESPACE_BEGIN -using CryptoPP::byte; -using CryptoPP::word32; -using CryptoPP::word64; -using CryptoPP::rotlFixed; -using CryptoPP::rotlConstant; - -using CryptoPP::GetBlock; -using CryptoPP::LittleEndian; -using CryptoPP::ConditionalByteReverse; -using CryptoPP::LITTLE_ENDIAN_ORDER; - -typedef byte lsh_u8; -typedef word32 lsh_u32; -typedef word64 lsh_u64; -typedef word32 lsh_uint; -typedef word32 lsh_err; -typedef word32 lsh_type; - -struct LSH512_Context -{ - LSH512_Context(word64* state, word32 algType, word32& remainingBitLength) : - cv_l(state+0), cv_r(state+8), sub_msgs(state+16), - last_block(reinterpret_cast(state+48)) , - remain_databitlen(remainingBitLength), algtype(algType) {} - - lsh_u64* cv_l; // start of our state block - lsh_u64* cv_r; - lsh_u64* sub_msgs; - lsh_u8* last_block; - lsh_u32& remain_databitlen; - lsh_type algtype; -}; - -struct LSH512_Internal -{ - LSH512_Internal(word64* state) : - submsg_e_l(state+16), submsg_e_r(state+24), - submsg_o_l(state+32), submsg_o_r(state+40) { } - - lsh_u64* submsg_e_l; /* even left sub-message */ - lsh_u64* submsg_e_r; /* even right sub-message */ - lsh_u64* submsg_o_l; /* odd left sub-message */ - lsh_u64* submsg_o_r; /* odd right sub-message */ -}; - -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) -// Zero the upper 128 bits of all YMM registers -// on entry and exit. It avoids AVX state -// transition penalties when saving state. 
-struct AVX_Cleanup -{ - AVX_Cleanup() { - _mm256_zeroupper(); - } - ~AVX_Cleanup() { - _mm256_zeroupper(); - } -}; -#endif - /* LSH Constants */ const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256; @@ -151,7 +30,6 @@ const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64; const unsigned int CV_WORD_LEN = 16; const unsigned int CONST_WORD_LEN = 8; const unsigned int HASH_VAL_MAX_WORD_LEN = 8; -// const unsigned int WORD_BIT_LEN = 64; const unsigned int NUM_STEPS = 28; const unsigned int ROT_EVEN_ALPHA = 23; @@ -175,38 +53,29 @@ const unsigned int LSH_SUCCESS = 0x0; const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403; const unsigned int LSH_ERR_INVALID_STATE = 0x2404; -/* LSH AlgType Macro */ +/* Index into our state array */ -inline bool LSH_IS_LSH512(lsh_uint val) { - return (val & 0xf0000) == 0x10000; -} +const unsigned int AlgorithmType = 80; +const unsigned int RemainingBits = 81; -inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { - return val >> 24; -} +NAMESPACE_END -inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { - return val & 0xffff; -} - -inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { - return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); -} - -inline lsh_u64 loadLE64(lsh_u64 v) { - return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v); -} - -lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) { - return rotlFixed(x, r); -} +NAMESPACE_BEGIN(CryptoPP) +NAMESPACE_BEGIN(LSH) /* -------------------------------------------------------- * * LSH: iv * -------------------------------------------------------- */ +//extern const word64 LSH512_IV224[CV_WORD_LEN]; +//extern const word64 LSH512_IV256[CV_WORD_LEN]; +//extern const word64 LSH512_IV384[CV_WORD_LEN]; +//extern const word64 LSH512_IV512[CV_WORD_LEN]; +//extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS]; + CRYPTOPP_ALIGN_DATA(32) -const lsh_u64 g_IV224[CV_WORD_LEN] = { +extern +const word64 LSH512_IV224[CV_WORD_LEN] = { W64LIT(0x0C401E9FE8813A55), W64LIT(0x4A5F446268FD3D35), W64LIT(0xFF13E452334F612A), W64LIT(0xF8227661037E354A), W64LIT(0xA5F223723C9CA29D), W64LIT(0x95D965A11AED3979), W64LIT(0x01E23835B9AB02CC), W64LIT(0x52D49CBAD5B30616), W64LIT(0x9E5C2027773F4ED3), W64LIT(0x66A5C8801925B701), W64LIT(0x22BBC85B4C6779D9), W64LIT(0xC13171A42C559C23), @@ -214,7 +83,8 @@ const lsh_u64 g_IV224[CV_WORD_LEN] = { }; CRYPTOPP_ALIGN_DATA(32) -const lsh_u64 g_IV256[CV_WORD_LEN] = { +extern +const word64 LSH512_IV256[CV_WORD_LEN] = { W64LIT(0x6DC57C33DF989423), W64LIT(0xD8EA7F6E8342C199), W64LIT(0x76DF8356F8603AC4), W64LIT(0x40F1B44DE838223A), W64LIT(0x39FFE7CFC31484CD), W64LIT(0x39C4326CC5281548), W64LIT(0x8A2FF85A346045D8), W64LIT(0xFF202AA46DBDD61E), W64LIT(0xCF785B3CD5FCDB8B), W64LIT(0x1F0323B64A8150BF), W64LIT(0xFF75D972F29EA355), W64LIT(0x2E567F30BF1CA9E1), @@ -222,7 +92,8 @@ const lsh_u64 g_IV256[CV_WORD_LEN] = { }; CRYPTOPP_ALIGN_DATA(32) -const lsh_u64 g_IV384[CV_WORD_LEN] = { +extern +const word64 LSH512_IV384[CV_WORD_LEN] = { W64LIT(0x53156A66292808F6), W64LIT(0xB2C4F362B204C2BC), W64LIT(0xB84B7213BFA05C4E), W64LIT(0x976CEB7C1B299F73), W64LIT(0xDF0CC63C0570AE97), W64LIT(0xDA4441BAA486CE3F), W64LIT(0x6559F5D9B5F2ACC2), W64LIT(0x22DACF19B4B52A16), W64LIT(0xBBCDACEFDE80953A), W64LIT(0xC9891A2879725B3E), W64LIT(0x7C9FE6330237E440), W64LIT(0xA30BA550553F7431), @@ -230,20 +101,20 @@ const lsh_u64 g_IV384[CV_WORD_LEN] = { }; CRYPTOPP_ALIGN_DATA(32) -const lsh_u64 g_IV512[CV_WORD_LEN] = { +extern +const word64 LSH512_IV512[CV_WORD_LEN] = { W64LIT(0xadd50f3c7f07094e), W64LIT(0xe3f3cee8f9418a4f), 
W64LIT(0xb527ecde5b3d0ae9), W64LIT(0x2ef6dec68076f501), W64LIT(0x8cb994cae5aca216), W64LIT(0xfbb9eae4bba48cc7), W64LIT(0x650a526174725fea), W64LIT(0x1f9a61a73f8d8085), W64LIT(0xb6607378173b539b), W64LIT(0x1bc99853b0c0b9ed), W64LIT(0xdf727fc19b182d47), W64LIT(0xdbef360cf893a457), W64LIT(0x4981f5e570147e80), W64LIT(0xd00c4490ca7d3e30), W64LIT(0x5d73940c0e4ae1ec), W64LIT(0x894085e2edb2d819) }; -const lsh_uint g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 }; - /* -------------------------------------------------------- * * LSH: step constants * -------------------------------------------------------- */ -const lsh_u64 g_StepConstants[CONST_WORD_LEN * NUM_STEPS] = { +extern +const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS] = { W64LIT(0x97884283c938982a), W64LIT(0xba1fca93533e2355), W64LIT(0xc519a2e87aeb1c03), W64LIT(0x9a0fc95462af17b1), W64LIT(0xfc3dda8ab019a82b), W64LIT(0x02825d079a895407), W64LIT(0x79f2d0a7ee06a6f7), W64LIT(0xd76d15eed9fdf5fe), W64LIT(0x1fcac64d01d0c2c1), W64LIT(0xd9ea5de69161790f), W64LIT(0xdebc8b6366071fc8), W64LIT(0xa9d91db711c6c94b), @@ -302,72 +173,99 @@ const lsh_u64 g_StepConstants[CONST_WORD_LEN * NUM_STEPS] = { W64LIT(0x682f81c73efdda0d), W64LIT(0x2fb55925d71d268d), W64LIT(0xcc392d2901e58a3d), W64LIT(0xaa666ab975724a42) }; +NAMESPACE_END // LSH +NAMESPACE_END // Crypto++ + +ANONYMOUS_NAMESPACE_BEGIN + +using CryptoPP::byte; +using CryptoPP::word32; +using CryptoPP::word64; +using CryptoPP::rotlFixed; +using CryptoPP::rotlConstant; + +using CryptoPP::GetBlock; +using CryptoPP::LittleEndian; +using CryptoPP::ConditionalByteReverse; +using CryptoPP::LITTLE_ENDIAN_ORDER; + +using CryptoPP::LSH::LSH512_IV224; +using CryptoPP::LSH::LSH512_IV256; +using CryptoPP::LSH::LSH512_IV384; +using CryptoPP::LSH::LSH512_IV512; +using CryptoPP::LSH::LSH512_StepConstants; + +typedef byte lsh_u8; +typedef word32 lsh_u32; +typedef word64 lsh_u64; +typedef word32 lsh_uint; +typedef word32 lsh_err; +typedef word32 lsh_type; + +struct LSH512_Context +{ + LSH512_Context(word64* state, word64 algType, word64& remainingBitLength) : + cv_l(state+0), cv_r(state+8), sub_msgs(state+16), + last_block(reinterpret_cast(state+48)), + remain_databitlen(remainingBitLength), + alg_type(static_cast(algType)) {} + + lsh_u64* cv_l; // start of our state block + lsh_u64* cv_r; + lsh_u64* sub_msgs; + lsh_u8* last_block; + lsh_u64& remain_databitlen; + lsh_type alg_type; +}; + +struct LSH512_Internal +{ + LSH512_Internal(word64* state) : + submsg_e_l(state+16), submsg_e_r(state+24), + submsg_o_l(state+32), submsg_o_r(state+40) { } + + lsh_u64* submsg_e_l; /* even left sub-message */ + lsh_u64* submsg_e_r; /* even right sub-message */ + lsh_u64* submsg_o_l; /* odd left sub-message */ + lsh_u64* submsg_o_r; /* odd right sub-message */ +}; + +const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 }; + +/* LSH AlgType Macro */ + +inline bool LSH_IS_LSH512(lsh_uint val) { + return (val & 0xf0000) == 0x10000; +} + +inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { + return val >> 24; +} + +inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { + return val & 0xffff; +} + +inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { + return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); +} + +inline lsh_u64 loadLE64(lsh_u64 v) { + return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v); +} + +lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) { + return rotlFixed(x, r); +} + // Original code relied upon unaligned lsh_u64 buffer -inline void load_msg_blk(LSH512_Internal* i_state, const lsh_u8 
msgblk[LSH512_MSG_BLK_BYTE_LEN]) +inline void load_msg_blk(LSH512_Internal* i_state, const lsh_u8* msgblk) { lsh_u64* submsg_e_l = i_state->submsg_e_l; lsh_u64* submsg_e_r = i_state->submsg_e_r; lsh_u64* submsg_o_l = i_state->submsg_o_l; lsh_u64* submsg_o_r = i_state->submsg_o_r; -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - _mm256_storeu_si256(M256_CAST(submsg_e_l+0), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+0))); - _mm256_storeu_si256(M256_CAST(submsg_e_l+4), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+32))); - - _mm256_storeu_si256(M256_CAST(submsg_e_r+0), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+64))); - _mm256_storeu_si256(M256_CAST(submsg_e_r+4), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+96))); - - _mm256_storeu_si256(M256_CAST(submsg_o_l+0), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+128))); - _mm256_storeu_si256(M256_CAST(submsg_o_l+4), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+160))); - - _mm256_storeu_si256(M256_CAST(submsg_o_r+0), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+192))); - _mm256_storeu_si256(M256_CAST(submsg_o_r+4), - _mm256_loadu_si256(CONST_M256_CAST(msgblk+224))); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(submsg_e_l+0), - _mm_loadu_si128(CONST_M128_CAST(msgblk+0))); - _mm_storeu_si128(M128_CAST(submsg_e_l+2), - _mm_loadu_si128(CONST_M128_CAST(msgblk+16))); - _mm_storeu_si128(M128_CAST(submsg_e_l+4), - _mm_loadu_si128(CONST_M128_CAST(msgblk+32))); - _mm_storeu_si128(M128_CAST(submsg_e_l+6), - _mm_loadu_si128(CONST_M128_CAST(msgblk+48))); - - _mm_storeu_si128(M128_CAST(submsg_e_r+0), - _mm_loadu_si128(CONST_M128_CAST(msgblk+64))); - _mm_storeu_si128(M128_CAST(submsg_e_r+2), - _mm_loadu_si128(CONST_M128_CAST(msgblk+80))); - _mm_storeu_si128(M128_CAST(submsg_e_r+4), - _mm_loadu_si128(CONST_M128_CAST(msgblk+96))); - _mm_storeu_si128(M128_CAST(submsg_e_r+6), - _mm_loadu_si128(CONST_M128_CAST(msgblk+112))); - - _mm_storeu_si128(M128_CAST(submsg_o_l+0), - _mm_loadu_si128(CONST_M128_CAST(msgblk+128))); - _mm_storeu_si128(M128_CAST(submsg_o_l+2), - _mm_loadu_si128(CONST_M128_CAST(msgblk+144))); - _mm_storeu_si128(M128_CAST(submsg_o_l+4), - _mm_loadu_si128(CONST_M128_CAST(msgblk+160))); - _mm_storeu_si128(M128_CAST(submsg_o_l+6), - _mm_loadu_si128(CONST_M128_CAST(msgblk+176))); - - _mm_storeu_si128(M128_CAST(submsg_o_r+0), - _mm_loadu_si128(CONST_M128_CAST(msgblk+192))); - _mm_storeu_si128(M128_CAST(submsg_o_r+2), - _mm_loadu_si128(CONST_M128_CAST(msgblk+208))); - _mm_storeu_si128(M128_CAST(submsg_o_r+4), - _mm_loadu_si128(CONST_M128_CAST(msgblk+224))); - _mm_storeu_si128(M128_CAST(submsg_o_r+6), - _mm_loadu_si128(CONST_M128_CAST(msgblk+240))); -#else typedef GetBlock InBlock; InBlock input(msgblk); @@ -379,7 +277,6 @@ inline void load_msg_blk(LSH512_Internal* i_state, const lsh_u8 msgblk[LSH512_MS (submsg_o_l[4])(submsg_o_l[5])(submsg_o_l[6])(submsg_o_l[7]) (submsg_o_r[0])(submsg_o_r[1])(submsg_o_r[2])(submsg_o_r[3]) (submsg_o_r[4])(submsg_o_r[5])(submsg_o_r[6])(submsg_o_r[7]); -#endif } inline void msg_exp_even(LSH512_Internal* i_state) @@ -391,85 +288,6 @@ inline void msg_exp_even(LSH512_Internal* i_state) lsh_u64* submsg_o_l = i_state->submsg_o_l; lsh_u64* submsg_o_r = i_state->submsg_o_r; -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), - _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), _MM_SHUFFLE(1,0,2,3)))); - _mm256_storeu_si256(M256_CAST(submsg_e_l+4), _mm256_add_epi64( - 
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)), - _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)), _MM_SHUFFLE(2,1,0,3)))); - _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), - _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), _MM_SHUFFLE(1,0,2,3)))); - _mm256_storeu_si256(M256_CAST(submsg_e_r+4), _mm256_add_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)), - _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)), _MM_SHUFFLE(2,1,0,3)))); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - __m128i temp; - _mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)), _MM_SHUFFLE(1,0,3,2))); - - temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)); - _mm_storeu_si128(M128_CAST(submsg_e_l+0), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2))); - _mm_storeu_si128(M128_CAST(submsg_e_l+2), temp); - _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), _MM_SHUFFLE(1,0,3,2))); - - temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)); - _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_unpacklo_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); - _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_unpackhi_epi64( - temp, _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)))); - _mm_storeu_si128(M128_CAST(submsg_e_r+2), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)), _MM_SHUFFLE(1,0,3,2))); - - temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)); - _mm_storeu_si128(M128_CAST(submsg_e_r+0), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2))); - _mm_storeu_si128(M128_CAST(submsg_e_r+2), temp); - _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), _MM_SHUFFLE(1,0,3,2))); - - temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)); - _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_unpacklo_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); - _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_unpackhi_epi64( - temp, _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)))); - - _mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)))); - _mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)))); - _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); - _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)))); - - _mm_storeu_si128(M128_CAST(submsg_e_r+0), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)))); - _mm_storeu_si128(M128_CAST(submsg_e_r+2), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)))); - _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); - _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_add_epi64( - 
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)))); -#else lsh_u64 temp; temp = submsg_e_l[0]; submsg_e_l[0] = submsg_o_l[0] + submsg_e_l[3]; @@ -491,7 +309,6 @@ inline void msg_exp_even(LSH512_Internal* i_state) submsg_e_r[7] = submsg_o_r[7] + submsg_e_r[6]; submsg_e_r[6] = submsg_o_r[6] + submsg_e_r[5]; submsg_e_r[5] = submsg_o_r[5] + temp; -#endif } inline void msg_exp_odd(LSH512_Internal* i_state) @@ -503,90 +320,6 @@ inline void msg_exp_odd(LSH512_Internal* i_state) lsh_u64* submsg_o_l = i_state->submsg_o_l; lsh_u64* submsg_o_r = i_state->submsg_o_r; -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(submsg_o_l+0), - _mm256_add_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), - _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), _MM_SHUFFLE(1,0,2,3)))); - _mm256_storeu_si256(M256_CAST(submsg_o_l+4), - _mm256_add_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)), - _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)), _MM_SHUFFLE(2,1,0,3)))); - - _mm256_storeu_si256(M256_CAST(submsg_o_r+0), - _mm256_add_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), - _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), _MM_SHUFFLE(1,0,2,3)))); - _mm256_storeu_si256(M256_CAST(submsg_o_r+4), - _mm256_add_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)), - _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)), _MM_SHUFFLE(2,1,0,3)))); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - __m128i temp; - _mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)), _MM_SHUFFLE(1,0,3,2))); - - temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)); - _mm_storeu_si128(M128_CAST(submsg_o_l+0), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2))); - _mm_storeu_si128(M128_CAST(submsg_o_l+2), temp); - _mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), _MM_SHUFFLE(1,0,3,2))); - - temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)); - _mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_unpacklo_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)))); - _mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_unpackhi_epi64( - temp, _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)))); - _mm_storeu_si128(M128_CAST(submsg_o_r+2), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)), _MM_SHUFFLE(1,0,3,2))); - - temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)); - _mm_storeu_si128(M128_CAST(submsg_o_r+0), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2))); - _mm_storeu_si128(M128_CAST(submsg_o_r+2), temp); - _mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), _MM_SHUFFLE(1,0,3,2))); - - temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)); - _mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_unpacklo_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)))); - _mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_unpackhi_epi64( - temp, _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)))); - - _mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)))); - _mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)), - 
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)))); - _mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)))); - _mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)))); - - _mm_storeu_si128(M128_CAST(submsg_o_r+0), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)))); - _mm_storeu_si128(M128_CAST(submsg_o_r+2), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)))); - _mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)))); - _mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)))); -#else lsh_u64 temp; temp = submsg_o_l[0]; submsg_o_l[0] = submsg_e_l[0] + submsg_o_l[3]; @@ -608,12 +341,11 @@ inline void msg_exp_odd(LSH512_Internal* i_state) submsg_o_r[7] = submsg_e_r[7] + submsg_o_r[6]; submsg_o_r[6] = submsg_e_r[6] + submsg_o_r[5]; submsg_o_r[5] = submsg_e_r[5] + temp; -#endif } inline void load_sc(const lsh_u64** p_const_v, size_t i) { - *p_const_v = &g_StepConstants[i]; + *p_const_v = &LSH512_StepConstants[i]; } inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_Internal* i_state) @@ -623,46 +355,6 @@ inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_Internal* i_st lsh_u64* submsg_e_l = i_state->submsg_e_l; lsh_u64* submsg_e_r = i_state->submsg_e_r; -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_l)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l)))); - _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_r)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r)))); - _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)))); - _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)))); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l)))); - _mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r)))); - _mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)))); - _mm_storeu_si128(M128_CAST(cv_r+2), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)))); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); - _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); - _mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)))); - _mm_storeu_si128(M128_CAST(cv_r+6), _mm_xor_si128( - 
_mm_loadu_si128(CONST_M128_CAST(cv_r+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)))); -#else cv_l[0] ^= submsg_e_l[0]; cv_l[1] ^= submsg_e_l[1]; cv_l[2] ^= submsg_e_l[2]; cv_l[3] ^= submsg_e_l[3]; cv_l[4] ^= submsg_e_l[4]; cv_l[5] ^= submsg_e_l[5]; @@ -671,7 +363,6 @@ inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_Internal* i_st cv_r[2] ^= submsg_e_r[2]; cv_r[3] ^= submsg_e_r[3]; cv_r[4] ^= submsg_e_r[4]; cv_r[5] ^= submsg_e_r[5]; cv_r[6] ^= submsg_e_r[6]; cv_r[7] ^= submsg_e_r[7]; -#endif } inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_Internal* i_state) @@ -681,46 +372,6 @@ inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_Internal* i_sta lsh_u64* submsg_o_l = i_state->submsg_o_l; lsh_u64* submsg_o_r = i_state->submsg_o_r; -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_l)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l)))); - _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_r)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r)))); - _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)))); - _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), - _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)))); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l)))); - _mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r)))); - _mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)))); - _mm_storeu_si128(M128_CAST(cv_r+2), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)))); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)))); - _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)))); - _mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)))); - _mm_storeu_si128(M128_CAST(cv_r+6), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_r+6)), - _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)))); -#else cv_l[0] ^= submsg_o_l[0]; cv_l[1] ^= submsg_o_l[1]; cv_l[2] ^= submsg_o_l[2]; cv_l[3] ^= submsg_o_l[3]; cv_l[4] ^= submsg_o_l[4]; cv_l[5] ^= submsg_o_l[5]; @@ -729,33 +380,10 @@ inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_Internal* i_sta cv_r[2] ^= submsg_o_r[2]; cv_r[3] ^= submsg_o_r[3]; cv_r[4] ^= submsg_o_r[4]; cv_r[5] ^= submsg_o_r[5]; cv_r[6] ^= submsg_o_r[6]; cv_r[7] ^= submsg_o_r[7]; -#endif } inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) { -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi64( - _mm256_loadu_si256(CONST_M256_CAST(cv_l)), - _mm256_loadu_si256(CONST_M256_CAST(cv_r)))); - _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_add_epi64( - _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), - _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)))); - -#elif 
defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(cv_l)), - _mm_loadu_si128(CONST_M128_CAST(cv_r)))); - _mm_storeu_si128(M128_CAST(cv_l+2), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), - _mm_loadu_si128(CONST_M128_CAST(cv_r+2)))); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), - _mm_loadu_si128(CONST_M128_CAST(cv_r+4)))); - _mm_storeu_si128(M128_CAST(cv_l+6), _mm_add_epi64( - _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), - _mm_loadu_si128(CONST_M128_CAST(cv_r+6)))); -#else cv_l[0] += cv_r[0]; cv_l[1] += cv_r[1]; cv_l[2] += cv_r[2]; @@ -764,44 +392,11 @@ inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) cv_l[5] += cv_r[5]; cv_l[6] += cv_r[6]; cv_l[7] += cv_r[7]; -#endif } template inline void rotate_blk(lsh_u64 cv[8]) { -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256( - _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R), - _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R))); - _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256( - _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R), - _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R))); - -#elif defined(CRYPTOPP_LSH512_XOP_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv), - _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R)); - _mm_storeu_si128(M128_CAST(cv+2), - _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), R)); - _mm_storeu_si128(M128_CAST(cv+4), - _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R)); - _mm_storeu_si128(M128_CAST(cv+6), - _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R)); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv), _mm_or_si128( - _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R), - _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), 64-R))); - _mm_storeu_si128(M128_CAST(cv+2), _mm_or_si128( - _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), R), - _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), 64-R))); - _mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128( - _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R), - _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), 64-R))); - _mm_storeu_si128(M128_CAST(cv+6), _mm_or_si128( - _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R), - _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), 64-R))); -#else cv[0] = rotlConstant(cv[0]); cv[1] = rotlConstant(cv[1]); cv[2] = rotlConstant(cv[2]); @@ -810,33 +405,10 @@ inline void rotate_blk(lsh_u64 cv[8]) cv[5] = rotlConstant(cv[5]); cv[6] = rotlConstant(cv[6]); cv[7] = rotlConstant(cv[7]); -#endif } -inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8]) +inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64* const_v) { -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_l)), - _mm256_loadu_si256(CONST_M256_CAST(const_v)))); - _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), - _mm256_loadu_si256(CONST_M256_CAST(const_v+4)))); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l)), - _mm_loadu_si128(CONST_M128_CAST(const_v)))); - _mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), - 
_mm_loadu_si128(CONST_M128_CAST(const_v+2)))); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), - _mm_loadu_si128(CONST_M128_CAST(const_v+4)))); - _mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), - _mm_loadu_si128(CONST_M128_CAST(const_v+6)))); -#else cv_l[0] ^= const_v[0]; cv_l[1] ^= const_v[1]; cv_l[2] ^= const_v[2]; @@ -845,66 +417,10 @@ inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8]) cv_l[5] ^= const_v[5]; cv_l[6] ^= const_v[6]; cv_l[7] ^= const_v[7]; -#endif } -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) inline void rotate_msg_gamma(lsh_u64 cv_r[8]) { - // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 }; - _mm256_storeu_si256(M256_CAST(cv_r+0), - _mm256_shuffle_epi8( - _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), - _mm256_set_epi8( - /* hi lane */ 9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4, - /* lo lane */ 13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0))); - _mm256_storeu_si256(M256_CAST(cv_r+4), - _mm256_shuffle_epi8( - _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), - _mm256_set_epi8( - /* hi lane */ 8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3, - /* lo lane */ 12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7))); -} -#else // CRYPTOPP_LSH512_AVX2_AVAILABLE -# if defined(CRYPTOPP_HAVE_ATTRIBUTE_TARGET) -CRYPTOPP_TARGET_SSSE3 -inline void rotate_msg_gamma(lsh_u64 cv_r[8]) -{ - // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 }; - _mm_storeu_si128(M128_CAST(cv_r+0), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)), - _mm_set_epi8(13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0))); - _mm_storeu_si128(M128_CAST(cv_r+2), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+2)), - _mm_set_epi8(9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4))); - _mm_storeu_si128(M128_CAST(cv_r+4), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)), - _mm_set_epi8(12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7))); - _mm_storeu_si128(M128_CAST(cv_r+6), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+6)), - _mm_set_epi8(8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3))); -} -# endif - -CRYPTOPP_TARGET_DEFAULT -inline void rotate_msg_gamma(lsh_u64 cv_r[8]) -{ -#if defined(CRYPTOPP_LSH512_SSSE3_AVAILABLE) - // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 }; - _mm_storeu_si128(M128_CAST(cv_r+0), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)), - _mm_set_epi8(13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0))); - _mm_storeu_si128(M128_CAST(cv_r+2), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+2)), - _mm_set_epi8(9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4))); - _mm_storeu_si128(M128_CAST(cv_r+4), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)), - _mm_set_epi8(12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7))); - _mm_storeu_si128(M128_CAST(cv_r+6), - _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+6)), - _mm_set_epi8(8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3))); - -#else cv_r[1] = ROTL64(cv_r[1], g_gamma512[1]); cv_r[2] = ROTL64(cv_r[2], g_gamma512[2]); cv_r[3] = ROTL64(cv_r[3], g_gamma512[3]); @@ -912,87 +428,10 @@ inline void rotate_msg_gamma(lsh_u64 cv_r[8]) cv_r[5] = ROTL64(cv_r[5], g_gamma512[5]); cv_r[6] = ROTL64(cv_r[6], g_gamma512[6]); cv_r[7] = ROTL64(cv_r[7], g_gamma512[7]); -#endif } -#endif // CRYPTOPP_LSH512_AVX2_AVAILABLE inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) { -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - __m256i temp[2]; - _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2))); - 
_mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2))); - _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0))); - _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_permute4x64_epi64( - _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0))); - - temp[0] = _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)); - temp[1] = _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)); - - _mm256_storeu_si256(M256_CAST(cv_l+0), - _mm256_loadu_si256(CONST_M256_CAST(cv_l+4))); - _mm256_storeu_si256(M256_CAST(cv_l+4), - _mm256_loadu_si256(CONST_M256_CAST(cv_r+4))); - - _mm256_storeu_si256(M256_CAST(cv_r+0), temp[0]); - _mm256_storeu_si256(M256_CAST(cv_r+4), temp[1]); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - __m128i temp[2]; - temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0)); - _mm_storeu_si128(M128_CAST(cv_l+0), _mm_unpacklo_epi64( - _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), - _mm_loadu_si128(CONST_M128_CAST(cv_l+0)))); - _mm_storeu_si128(M128_CAST(cv_l+2), _mm_unpackhi_epi64( - temp[0], _mm_loadu_si128(CONST_M128_CAST(cv_l+2)))); - - temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+4)); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_unpacklo_epi64( - _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), - _mm_loadu_si128(CONST_M128_CAST(cv_l+4)))); - _mm_storeu_si128(M128_CAST(cv_l+6), _mm_unpackhi_epi64( - temp[0], _mm_loadu_si128(CONST_M128_CAST(cv_l+6)))); - _mm_storeu_si128(M128_CAST(cv_r+2), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), _MM_SHUFFLE(1,0,3,2))); - - temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_r+0)); - _mm_storeu_si128(M128_CAST(cv_r+0), _mm_unpacklo_epi64( - _mm_loadu_si128(CONST_M128_CAST(cv_r+0)), - _mm_loadu_si128(CONST_M128_CAST(cv_r+2)))); - _mm_storeu_si128(M128_CAST(cv_r+2), _mm_unpackhi_epi64( - _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), temp[0])); - _mm_storeu_si128(M128_CAST(cv_r+6), _mm_shuffle_epi32( - _mm_loadu_si128(CONST_M128_CAST(cv_r+6)), _MM_SHUFFLE(1,0,3,2))); - - temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_r+4)); - _mm_storeu_si128(M128_CAST(cv_r+4), _mm_unpacklo_epi64( - _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), - _mm_loadu_si128(CONST_M128_CAST(cv_r+6)))); - _mm_storeu_si128(M128_CAST(cv_r+6), _mm_unpackhi_epi64( - _mm_loadu_si128(CONST_M128_CAST(cv_r+6)), temp[0])); - - temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0)); - temp[1] = _mm_loadu_si128(CONST_M128_CAST(cv_l+2)); - - _mm_storeu_si128(M128_CAST(cv_l+0), - _mm_loadu_si128(CONST_M128_CAST(cv_l+4))); - _mm_storeu_si128(M128_CAST(cv_l+2), - _mm_loadu_si128(CONST_M128_CAST(cv_l+6))); - _mm_storeu_si128(M128_CAST(cv_l+4), - _mm_loadu_si128(CONST_M128_CAST(cv_r+4))); - _mm_storeu_si128(M128_CAST(cv_l+6), - _mm_loadu_si128(CONST_M128_CAST(cv_r+6))); - _mm_storeu_si128(M128_CAST(cv_r+4), - _mm_loadu_si128(CONST_M128_CAST(cv_r+0))); - _mm_storeu_si128(M128_CAST(cv_r+6), - _mm_loadu_si128(CONST_M128_CAST(cv_r+2))); - - _mm_storeu_si128(M128_CAST(cv_r+0), temp[0]); - _mm_storeu_si128(M128_CAST(cv_r+2), temp[1]); -#else lsh_u64 temp; temp = cv_l[0]; cv_l[0] = cv_l[6]; @@ -1012,7 +451,6 @@ inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) cv_l[7] = cv_r[5]; cv_r[5] = cv_r[3]; cv_r[3] = temp; -#endif }; /* -------------------------------------------------------- * @@ -1046,10 +484,6 @@ inline void compress(LSH512_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_B lsh_u64 *cv_l = ctx->cv_l; lsh_u64 *cv_r = ctx->cv_r; -#if 
defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - AVX_Cleanup cleanup; -#endif - load_msg_blk(i_state, pdMsgBlk); msg_add_even(cv_l, cv_r, i_state); @@ -1083,38 +517,8 @@ inline void compress(LSH512_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_B /* -------------------------------------------------------- */ -inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16]) +inline void load_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 iv[16]) { - // The IV's are 32-byte aligned so we can use aligned loads. - -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l+0), - _mm256_load_si256(CONST_M256_CAST(iv+0))); - _mm256_storeu_si256(M256_CAST(cv_l+4), - _mm256_load_si256(CONST_M256_CAST(iv+4))); - _mm256_storeu_si256(M256_CAST(cv_r+0), - _mm256_load_si256(CONST_M256_CAST(iv+8))); - _mm256_storeu_si256(M256_CAST(cv_r+4), - _mm256_load_si256(CONST_M256_CAST(iv+12))); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l+0), - _mm_load_si128(CONST_M128_CAST(iv+0))); - _mm_storeu_si128(M128_CAST(cv_l+2), - _mm_load_si128(CONST_M128_CAST(iv+2))); - _mm_storeu_si128(M128_CAST(cv_l+4), - _mm_load_si128(CONST_M128_CAST(iv+4))); - _mm_storeu_si128(M128_CAST(cv_l+6), - _mm_load_si128(CONST_M128_CAST(iv+6))); - _mm_storeu_si128(M128_CAST(cv_r+0), - _mm_load_si128(CONST_M128_CAST(iv+8))); - _mm_storeu_si128(M128_CAST(cv_r+2), - _mm_load_si128(CONST_M128_CAST(iv+10))); - _mm_storeu_si128(M128_CAST(cv_r+4), - _mm_load_si128(CONST_M128_CAST(iv+12))); - _mm_storeu_si128(M128_CAST(cv_r+6), - _mm_load_si128(CONST_M128_CAST(iv+14))); -#else cv_l[0] = iv[0]; cv_l[1] = iv[1]; cv_l[2] = iv[2]; @@ -1131,114 +535,51 @@ inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16]) cv_r[5] = iv[13]; cv_r[6] = iv[14]; cv_r[7] = iv[15]; -#endif } inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) { -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256()); - _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_setzero_si256()); - _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256()); - _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_setzero_si256()); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(cv_l+0), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(cv_l+2), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(cv_l+4), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(cv_l+6), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(cv_r+0), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(cv_r+2), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(cv_r+4), _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(cv_r+6), _mm_setzero_si128()); -#else memset(cv_l, 0, 8*sizeof(lsh_u64)); memset(cv_r, 0, 8*sizeof(lsh_u64)); -#endif } inline void zero_submsgs(LSH512_Context* ctx) { lsh_u64* sub_msgs = ctx->sub_msgs; -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - _mm256_storeu_si256(M256_CAST(sub_msgs+ 0), - _mm256_setzero_si256()); - _mm256_storeu_si256(M256_CAST(sub_msgs+ 4), - _mm256_setzero_si256()); - _mm256_storeu_si256(M256_CAST(sub_msgs+ 8), - _mm256_setzero_si256()); - _mm256_storeu_si256(M256_CAST(sub_msgs+12), - _mm256_setzero_si256()); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(sub_msgs+ 0), - _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+ 2), - _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+ 4), - _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+ 6), - _mm_setzero_si128()); - 
_mm_storeu_si128(M128_CAST(sub_msgs+ 8), - _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+10), - _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+12), - _mm_setzero_si128()); - _mm_storeu_si128(M128_CAST(sub_msgs+14), - _mm_setzero_si128()); -#else memset(sub_msgs, 0x00, 32*sizeof(lsh_u64)); -#endif } inline void init224(LSH512_Context* ctx) { CRYPTOPP_ASSERT(ctx != NULLPTR); -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - AVX_Cleanup cleanup; -#endif - zero_submsgs(ctx); - load_iv(ctx->cv_l, ctx->cv_r, g_IV224); + load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224); } inline void init256(LSH512_Context* ctx) { CRYPTOPP_ASSERT(ctx != NULLPTR); -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - AVX_Cleanup cleanup; -#endif - zero_submsgs(ctx); - load_iv(ctx->cv_l, ctx->cv_r, g_IV256); + load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256); } inline void init384(LSH512_Context* ctx) { CRYPTOPP_ASSERT(ctx != NULLPTR); -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - AVX_Cleanup cleanup; -#endif - zero_submsgs(ctx); - load_iv(ctx->cv_l, ctx->cv_r, g_IV384); + load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384); } inline void init512(LSH512_Context* ctx) { CRYPTOPP_ASSERT(ctx != NULLPTR); -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - AVX_Cleanup cleanup; -#endif - zero_submsgs(ctx); - load_iv(ctx->cv_l, ctx->cv_r, g_IV512); + load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512); } /* -------------------------------------------------------- */ @@ -1247,36 +588,9 @@ inline void fin(LSH512_Context* ctx) { CRYPTOPP_ASSERT(ctx != NULLPTR); -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - AVX_Cleanup cleanup; -#endif - -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)), - _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0)))); - _mm256_storeu_si256(M256_CAST(ctx->cv_l+4), _mm256_xor_si256( - _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+4)), - _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+4)))); - -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - _mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)), - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0)))); - _mm_storeu_si128(M128_CAST(ctx->cv_l+2), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+2)), - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+2)))); - _mm_storeu_si128(M128_CAST(ctx->cv_l+4), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+4)), - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+4)))); - _mm_storeu_si128(M128_CAST(ctx->cv_l+6), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+6)), - _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+6)))); -#else for (size_t i = 0; i < HASH_VAL_MAX_WORD_LEN; i++){ ctx->cv_l[i] = loadLE64(ctx->cv_l[i] ^ ctx->cv_r[i]); } -#endif } /* -------------------------------------------------------- */ @@ -1284,14 +598,14 @@ inline void fin(LSH512_Context* ctx) inline void get_hash(LSH512_Context* ctx, lsh_u8* pbHashVal) { CRYPTOPP_ASSERT(ctx != NULLPTR); - CRYPTOPP_ASSERT(ctx->algtype != 0); + CRYPTOPP_ASSERT(ctx->alg_type != 0); CRYPTOPP_ASSERT(pbHashVal != NULLPTR); - lsh_uint algtype = ctx->algtype; - lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(algtype); - lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(algtype); + lsh_uint alg_type = ctx->alg_type; + lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type); + lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type); - // Multiplying by sizeof(lsh_u8) looks odd... + // Multiplying by looks odd... 
memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len); if (hash_val_bit_len){ pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len); @@ -1303,13 +617,13 @@ inline void get_hash(LSH512_Context* ctx, lsh_u8* pbHashVal) lsh_err lsh512_init(LSH512_Context* ctx) { CRYPTOPP_ASSERT(ctx != NULLPTR); - CRYPTOPP_ASSERT(ctx->algtype != 0); + CRYPTOPP_ASSERT(ctx->alg_type != 0); - lsh_u32 algtype = ctx->algtype; + lsh_u32 alg_type = ctx->alg_type; const lsh_u64* const_v = NULL; ctx->remain_databitlen = 0; - switch (algtype){ + switch (alg_type){ case LSH_TYPE_512_512: init512(ctx); return LSH_SUCCESS; @@ -1329,13 +643,9 @@ lsh_err lsh512_init(LSH512_Context* ctx) lsh_u64* cv_l = ctx->cv_l; lsh_u64* cv_r = ctx->cv_r; -#if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - AVX_Cleanup cleanup; -#endif - zero_iv(cv_l, cv_r); cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN; - cv_l[1] = LSH_GET_HASHBIT(algtype); + cv_l[1] = LSH_GET_HASHBIT(alg_type); for (size_t i = 0; i < NUM_STEPS / 2; i++) { @@ -1357,19 +667,20 @@ lsh_err lsh512_update(LSH512_Context* ctx, const lsh_u8* data, size_t databitlen CRYPTOPP_ASSERT(ctx != NULLPTR); CRYPTOPP_ASSERT(data != NULLPTR); CRYPTOPP_ASSERT(databitlen % 8 == 0); - CRYPTOPP_ASSERT(ctx->algtype != 0); + CRYPTOPP_ASSERT(ctx->alg_type != 0); if (databitlen == 0){ return LSH_SUCCESS; } + // We are byte oriented. tail bits will always be 0. size_t databytelen = databitlen >> 3; - lsh_uint pos2 = databitlen & 0x7; + // lsh_uint pos2 = databitlen & 0x7; + const size_t pos2 = 0; - // We are byte oriented. remain_msg_bit will always be 0. - lsh_uint remain_msg_byte = ctx->remain_databitlen >> 3; - // remain_msg_bit = ctx->remain_databitlen & 7; - const lsh_uint remain_msg_bit = 0; + size_t remain_msg_byte = static_cast(ctx->remain_databitlen >> 3); + // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; + const size_t remain_msg_bit = 0; if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){ return LSH_ERR_INVALID_STATE; @@ -1389,7 +700,7 @@ lsh_err lsh512_update(LSH512_Context* ctx, const lsh_u8* data, size_t databitlen } if (remain_msg_byte > 0){ - lsh_uint more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte; + size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte; memcpy(ctx->last_block + remain_msg_byte, data, more_byte); compress(ctx, ctx->last_block); data += more_byte; @@ -1425,10 +736,10 @@ lsh_err lsh512_final(LSH512_Context* ctx, lsh_u8* hashval) CRYPTOPP_ASSERT(ctx != NULLPTR); CRYPTOPP_ASSERT(hashval != NULLPTR); - // We are byte oriented. remain_msg_bit will always be 0. - lsh_uint remain_msg_byte = ctx->remain_databitlen >> 3; + // We are byte oriented. tail bits will always be 0. 
+ size_t remain_msg_byte = static_cast(ctx->remain_databitlen >> 3); // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; - const lsh_uint remain_msg_bit = 0; + const size_t remain_msg_bit = 0; if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){ return LSH_ERR_INVALID_STATE; @@ -1454,70 +765,127 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) +#if defined(CRYPTOPP_ENABLE_64BIT_SSE) +# if defined(CRYPTOPP_AVX2_AVAILABLE) + extern void LSH512_Base_Restart_AVX2(word64* state); + extern void LSH512_Base_Update_AVX2(word64* state, const byte *input, size_t size); + extern void LSH512_Base_TruncatedFinal_AVX2(word64* state, byte *hash, size_t size); +# endif +# if defined(CRYPTOPP_SSSE3_AVAILABLE) + extern void LSH512_Base_Restart_SSSE3(word64* state); + extern void LSH512_Base_Update_SSSE3(word64* state, const byte *input, size_t size); + extern void LSH512_Base_TruncatedFinal_SSSE3(word64* state, byte *hash, size_t size); +# endif +#endif + std::string LSH512_Base::AlgorithmProvider() const { -#if defined(CRYPTOPP_LSH512_AVX2_AVAILABLE) - return "AVX2"; -#elif defined(CRYPTOPP_LSH512_AVX_AVAILABLE) - return "AVX"; -#elif defined(CRYPTOPP_LSH512_SSSE3_AVAILABLE) - return "SSSE3"; -#elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - return "SSE2"; -#else - return "C++"; +#if defined(CRYPTOPP_ENABLE_64BIT_SSE) +#if defined(CRYPTOPP_AVX2_AVAILABLE) + if (HasAVX2()) + return "AVX2"; + else #endif +#if defined(CRYPTOPP_SSSE3_AVAILABLE) + if (HasSSSE3()) + return "SSSE3"; + else +#endif +#endif // CRYPTOPP_ENABLE_64BIT_SSE + + return "C++"; } -void LSH512_Base::Restart() +void LSH512_Base_Restart_CXX(word64* state) { - m_remainingBitLength = 0; - - LSH512_Context ctx(m_state, m_algType, m_remainingBitLength); + state[RemainingBits] = 0; + LSH512_Context ctx(state, state[AlgorithmType], state[RemainingBits]); lsh_err err = lsh512_init(&ctx); if (err != LSH_SUCCESS) throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init failed"); } -void LSH512_Base::Update(const byte *input, size_t length) +void LSH512_Base_Update_CXX(word64* state, const byte *input, size_t size) { - CRYPTOPP_ASSERT(input != NULLPTR); - CRYPTOPP_ASSERT(length); - - LSH512_Context ctx(m_state, m_algType, m_remainingBitLength); - lsh_err err = lsh512_update(&ctx, input, 8*length); + LSH512_Context ctx(state, state[AlgorithmType], state[RemainingBits]); + lsh_err err = lsh512_update(&ctx, input, 8*size); if (err != LSH_SUCCESS) throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update failed"); } +void LSH512_Base_TruncatedFinal_CXX(word64* state, byte *hash, size_t) +{ + LSH512_Context ctx(state, state[AlgorithmType], state[RemainingBits]); + lsh_err err = lsh512_final(&ctx, hash); + + if (err != LSH_SUCCESS) + throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final failed"); +} + + +void LSH512_Base::Restart() +{ +#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasAVX2()) + LSH512_Base_Restart_AVX2(m_state); + else +#endif +#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasSSSE3()) + LSH512_Base_Restart_SSSE3(m_state); + else +#endif + + LSH512_Base_Restart_CXX(m_state); +} + +void LSH512_Base::Update(const byte *input, size_t size) +{ + CRYPTOPP_ASSERT(input != NULLPTR); + CRYPTOPP_ASSERT(size); + +#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasAVX2()) + LSH512_Base_Update_AVX2(m_state, input, size); + else +#endif +#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if 
(HasSSSE3()) + LSH512_Base_Update_SSSE3(m_state, input, size); + else +#endif + + LSH512_Base_Update_CXX(m_state, input, size); +} + void LSH512_Base::TruncatedFinal(byte *hash, size_t size) { CRYPTOPP_ASSERT(hash != NULLPTR); ThrowIfInvalidTruncatedSize(size); - LSH512_Context ctx(m_state, m_algType, m_remainingBitLength); - lsh_err err; + // TODO: determine if LSH512 supports truncated hashes. See the code + // in get_hash(), where a bit-length is added to the last output + // byte of the hash function. + byte fullHash[LSH512_HASH_VAL_MAX_BYTE_LEN]; + bool copyOut = (size < DigestSize()); - if (size >= DigestSize()) - { - err = lsh512_final(&ctx, hash); - } +#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasAVX2()) + LSH512_Base_TruncatedFinal_AVX2(m_state, copyOut ? fullHash : hash, size); else - { - // TODO: determine if LSH512 supports truncated hashes. See the code - // in get_hash(), where a bit-length is added to the last output - // byte of the hash function. - // CRYPTOPP_ASSERT(0); +#endif +#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + if (HasSSSE3()) + LSH512_Base_TruncatedFinal_SSSE3(m_state, copyOut ? fullHash : hash, size); + else +#endif - byte fullHash[HASH_VAL_MAX_WORD_LEN * sizeof(lsh_u64)]; - err = lsh512_final(&ctx, fullHash); + LSH512_Base_TruncatedFinal_CXX(m_state, copyOut ? fullHash : hash, size); + + if (copyOut) memcpy(hash, fullHash, size); - } - - if (err != LSH_SUCCESS) - throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final failed"); Restart(); } diff --git a/lsh512_avx.cpp b/lsh512_avx.cpp new file mode 100644 index 00000000..6e4e8204 --- /dev/null +++ b/lsh512_avx.cpp @@ -0,0 +1,762 @@ +// lsh.cpp - written and placed in the public domain by Jeffrey Walton +// Based on the specification and source code provided by +// Korea Internet & Security Agency (KISA) website. Also +// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do +// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do. + +// We are hitting some sort of GCC bug in the LSH AVX2 code path. +// Clang is OK on the AVX2 code path. We believe it is GCC Issue +// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It +// makes using zeroupper a little tricky. 
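// A minimal sketch of the "zeroupper" pattern the comment above refers to,
// assuming nothing beyond <immintrin.h>. The guard actually used in this file
// is AVX_Cleanup (defined further down); the names ZeroUpperOnExit and
// Example_AVX2_Kernel here are illustrative only.

#include <immintrin.h>

struct ZeroUpperOnExit
{
    // Clearing the upper 128 bits of the YMM registers on scope exit avoids
    // AVX-to-SSE state-transition penalties on every return path (see the
    // AVX_Cleanup comment below).
    ~ZeroUpperOnExit() { _mm256_zeroupper(); }
};

void Example_AVX2_Kernel()
{
    ZeroUpperOnExit cleanup;
    // ... 256-bit _mm256_* work would go here ...
}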
+ +#include "pch.h" +#include "config.h" + +#include "lsh.h" +#include "misc.h" + +#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) + +#if defined(CRYPTOPP_SSSE3_AVAILABLE) +# include +#endif + +#if defined(CRYPTOPP_AVX2_AVAILABLE) +# include +#endif + +#if defined(__GNUC__) && defined(__amd64__) +# include +#endif + +ANONYMOUS_NAMESPACE_BEGIN + +/* LSH Constants */ + +const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256; +// const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048; +// const unsigned int LSH512_CV_BYTE_LEN = 128; +const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64; + +// const unsigned int MSG_BLK_WORD_LEN = 32; +const unsigned int CV_WORD_LEN = 16; +const unsigned int CONST_WORD_LEN = 8; +const unsigned int HASH_VAL_MAX_WORD_LEN = 8; +const unsigned int NUM_STEPS = 28; + +const unsigned int ROT_EVEN_ALPHA = 23; +const unsigned int ROT_EVEN_BETA = 59; +const unsigned int ROT_ODD_ALPHA = 7; +const unsigned int ROT_ODD_BETA = 3; + +const unsigned int LSH_TYPE_512_512 = 0x0010040; +const unsigned int LSH_TYPE_512_384 = 0x0010030; +const unsigned int LSH_TYPE_512_256 = 0x0010020; +const unsigned int LSH_TYPE_512_224 = 0x001001C; + +// const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384; +// const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512; + +/* Error Code */ + +const unsigned int LSH_SUCCESS = 0x0; +// const unsigned int LSH_ERR_NULL_PTR = 0x2401; +// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402; +const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403; +const unsigned int LSH_ERR_INVALID_STATE = 0x2404; + +/* Index into our state array */ + +const unsigned int AlgorithmType = 80; +const unsigned int RemainingBits = 81; + +NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) +NAMESPACE_BEGIN(LSH) + +// lsh512.cpp +extern const word64 LSH512_IV224[CV_WORD_LEN]; +extern const word64 LSH512_IV256[CV_WORD_LEN]; +extern const word64 LSH512_IV384[CV_WORD_LEN]; +extern const word64 LSH512_IV512[CV_WORD_LEN]; +extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS]; + +NAMESPACE_END // LSH +NAMESPACE_END // Crypto++ + +ANONYMOUS_NAMESPACE_BEGIN + +using CryptoPP::byte; +using CryptoPP::word32; +using CryptoPP::word64; +using CryptoPP::rotlFixed; +using CryptoPP::rotlConstant; + +using CryptoPP::GetBlock; +using CryptoPP::LittleEndian; +using CryptoPP::ConditionalByteReverse; +using CryptoPP::LITTLE_ENDIAN_ORDER; + +using CryptoPP::LSH::LSH512_IV224; +using CryptoPP::LSH::LSH512_IV256; +using CryptoPP::LSH::LSH512_IV384; +using CryptoPP::LSH::LSH512_IV512; +using CryptoPP::LSH::LSH512_StepConstants; + +typedef byte lsh_u8; +typedef word32 lsh_u32; +typedef word64 lsh_u64; +typedef word32 lsh_uint; +typedef word32 lsh_err; +typedef word32 lsh_type; + +struct LSH512_AVX2_Context +{ + LSH512_AVX2_Context(word64* state, word64 algType, word64& remainingBitLength) : + cv_l(state+0), cv_r(state+8), sub_msgs(state+16), + last_block(reinterpret_cast(state+48)), + remain_databitlen(remainingBitLength), + alg_type(static_cast(algType)) {} + + lsh_u64* cv_l; // start of our state block + lsh_u64* cv_r; + lsh_u64* sub_msgs; + lsh_u8* last_block; + lsh_u64& remain_databitlen; + lsh_type alg_type; +}; + +struct LSH512_AVX2_Internal +{ + LSH512_AVX2_Internal(word64* state) : + submsg_e_l(state+16), submsg_e_r(state+24), + submsg_o_l(state+32), submsg_o_r(state+40) { } + + lsh_u64* submsg_e_l; /* even left sub-message */ + lsh_u64* submsg_e_r; /* even right sub-message */ + lsh_u64* submsg_o_l; /* odd left sub-message */ + lsh_u64* submsg_o_r; /* odd right sub-message */ 
+}; + +// Zero the upper 128 bits of all YMM registers on exit. +// It avoids AVX state transition penalties when saving state. +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735 +// makes using zeroupper a little tricky. + +struct AVX_Cleanup +{ + ~AVX_Cleanup() { + _mm256_zeroupper(); + } +}; + +// const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 }; + +/* LSH AlgType Macro */ + +inline bool LSH_IS_LSH512(lsh_uint val) { + return (val & 0xf0000) == 0x10000; +} + +inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { + return val >> 24; +} + +inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { + return val & 0xffff; +} + +inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { + return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); +} + +inline lsh_u64 loadLE64(lsh_u64 v) { + return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v); +} + +lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) { + return rotlFixed(x, r); +} + +// Original code relied upon unaligned lsh_u64 buffer +inline void load_msg_blk(LSH512_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN]) +{ + lsh_u64* submsg_e_l = i_state->submsg_e_l; + lsh_u64* submsg_e_r = i_state->submsg_e_r; + lsh_u64* submsg_o_l = i_state->submsg_o_l; + lsh_u64* submsg_o_r = i_state->submsg_o_r; + + _mm256_storeu_si256(M256_CAST(submsg_e_l+0), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+0))); + _mm256_storeu_si256(M256_CAST(submsg_e_l+4), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+32))); + + _mm256_storeu_si256(M256_CAST(submsg_e_r+0), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+64))); + _mm256_storeu_si256(M256_CAST(submsg_e_r+4), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+96))); + + _mm256_storeu_si256(M256_CAST(submsg_o_l+0), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+128))); + _mm256_storeu_si256(M256_CAST(submsg_o_l+4), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+160))); + + _mm256_storeu_si256(M256_CAST(submsg_o_r+0), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+192))); + _mm256_storeu_si256(M256_CAST(submsg_o_r+4), + _mm256_loadu_si256(CONST_M256_CAST(msgblk+224))); +} + +inline void msg_exp_even(LSH512_AVX2_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u64* submsg_e_l = i_state->submsg_e_l; + lsh_u64* submsg_e_r = i_state->submsg_e_r; + lsh_u64* submsg_o_l = i_state->submsg_o_l; + lsh_u64* submsg_o_r = i_state->submsg_o_r; + + _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), + _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), + _MM_SHUFFLE(1,0,2,3)))); + _mm256_storeu_si256(M256_CAST(submsg_e_l+4), _mm256_add_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)), + _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)), + _MM_SHUFFLE(2,1,0,3)))); + + _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), + _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), + _MM_SHUFFLE(1,0,2,3)))); + _mm256_storeu_si256(M256_CAST(submsg_e_r+4), _mm256_add_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)), + _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)), + _MM_SHUFFLE(2,1,0,3)))); +} + +inline void msg_exp_odd(LSH512_AVX2_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u64* submsg_e_l = i_state->submsg_e_l; + lsh_u64* submsg_e_r = i_state->submsg_e_r; + lsh_u64* submsg_o_l = i_state->submsg_o_l; + lsh_u64* submsg_o_r = i_state->submsg_o_r; + + 
_mm256_storeu_si256(M256_CAST(submsg_o_l+0), + _mm256_add_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), + _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), + _MM_SHUFFLE(1,0,2,3)))); + _mm256_storeu_si256(M256_CAST(submsg_o_l+4), + _mm256_add_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)), + _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)), + _MM_SHUFFLE(2,1,0,3)))); + + _mm256_storeu_si256(M256_CAST(submsg_o_r+0), + _mm256_add_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), + _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), + _MM_SHUFFLE(1,0,2,3)))); + _mm256_storeu_si256(M256_CAST(submsg_o_r+4), + _mm256_add_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)), + _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)), + _MM_SHUFFLE(2,1,0,3)))); +} + +inline void load_sc(const lsh_u64** p_const_v, size_t i) +{ + *p_const_v = &LSH512_StepConstants[i]; +} + +inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u64* submsg_e_l = i_state->submsg_e_l; + lsh_u64* submsg_e_r = i_state->submsg_e_r; + + _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_l)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l)))); + _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_r)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r)))); + + _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)))); + _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)))); +} + +inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u64* submsg_o_l = i_state->submsg_o_l; + lsh_u64* submsg_o_r = i_state->submsg_o_r; + + _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_l)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l)))); + _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_r)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r)))); + + _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)))); + _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), + _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)))); +} + +inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) +{ + _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi64( + _mm256_loadu_si256(CONST_M256_CAST(cv_l)), + _mm256_loadu_si256(CONST_M256_CAST(cv_r)))); + _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_add_epi64( + _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), + _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)))); +} + +template +inline void rotate_blk(lsh_u64 cv[8]) +{ + _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256( + _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R), + _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R))); + _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256( + _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R), + _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 
64-R))); +} + +inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8]) +{ + _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_l)), + _mm256_loadu_si256(CONST_M256_CAST(const_v)))); + _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), + _mm256_loadu_si256(CONST_M256_CAST(const_v+4)))); +} + +inline void rotate_msg_gamma(lsh_u64 cv_r[8]) +{ + // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 }; + _mm256_storeu_si256(M256_CAST(cv_r+0), + _mm256_shuffle_epi8( + _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), + _mm256_set_epi8( + /* hi lane */ 9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4, + /* lo lane */ 13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0))); + _mm256_storeu_si256(M256_CAST(cv_r+4), + _mm256_shuffle_epi8( + _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), + _mm256_set_epi8( + /* hi lane */ 8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3, + /* lo lane */ 12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7))); +} + +inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) +{ + __m256i temp[2]; + _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2))); + _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2))); + _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0))); + _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_permute4x64_epi64( + _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0))); + + temp[0] = _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)); + temp[1] = _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)); + + _mm256_storeu_si256(M256_CAST(cv_l+0), + _mm256_loadu_si256(CONST_M256_CAST(cv_l+4))); + _mm256_storeu_si256(M256_CAST(cv_l+4), + _mm256_loadu_si256(CONST_M256_CAST(cv_r+4))); + + _mm256_storeu_si256(M256_CAST(cv_r+0), temp[0]); + _mm256_storeu_si256(M256_CAST(cv_r+4), temp[1]); +}; + +/* -------------------------------------------------------- * +* step function +* -------------------------------------------------------- */ + +template +inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8]) +{ + add_blk(cv_l, cv_r); + rotate_blk(cv_l); + xor_with_const(cv_l, const_v); + add_blk(cv_r, cv_l); + rotate_blk(cv_r); + add_blk(cv_l, cv_r); + rotate_msg_gamma(cv_r); +} + +/* -------------------------------------------------------- * +* compression function +* -------------------------------------------------------- */ + +inline void compress(LSH512_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN]) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + LSH512_AVX2_Internal s_state(ctx->cv_l); + LSH512_AVX2_Internal* i_state = &s_state; + + const lsh_u64* const_v = NULL; + lsh_u64 *cv_l = ctx->cv_l; + lsh_u64 *cv_r = ctx->cv_r; + + load_msg_blk(i_state, pdMsgBlk); + + msg_add_even(cv_l, cv_r, i_state); + load_sc(&const_v, 0); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + msg_add_odd(cv_l, cv_r, i_state); + load_sc(&const_v, 8); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + for (size_t i = 1; i < NUM_STEPS / 2; i++) + { + msg_exp_even(i_state); + msg_add_even(cv_l, cv_r, i_state); + load_sc(&const_v, 16 * i); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + msg_exp_odd(i_state); + msg_add_odd(cv_l, cv_r, i_state); + load_sc(&const_v, 16 * i + 8); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + } + + 
msg_exp_even(i_state); + msg_add_even(cv_l, cv_r, i_state); +} + +/* -------------------------------------------------------- */ + +inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16]) +{ + // The IV's are 32-byte aligned so we can use aligned loads. + _mm256_storeu_si256(M256_CAST(cv_l+0), + _mm256_load_si256(CONST_M256_CAST(iv+0))); + _mm256_storeu_si256(M256_CAST(cv_l+4), + _mm256_load_si256(CONST_M256_CAST(iv+4))); + + _mm256_storeu_si256(M256_CAST(cv_r+0), + _mm256_load_si256(CONST_M256_CAST(iv+8))); + _mm256_storeu_si256(M256_CAST(cv_r+4), + _mm256_load_si256(CONST_M256_CAST(iv+12))); +} + +inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) +{ + _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256()); + _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_setzero_si256()); + _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256()); + _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_setzero_si256()); +} + +inline void zero_submsgs(LSH512_AVX2_Context* ctx) +{ + lsh_u64* sub_msgs = ctx->sub_msgs; + + _mm256_storeu_si256(M256_CAST(sub_msgs+ 0), + _mm256_setzero_si256()); + _mm256_storeu_si256(M256_CAST(sub_msgs+ 4), + _mm256_setzero_si256()); + + _mm256_storeu_si256(M256_CAST(sub_msgs+ 8), + _mm256_setzero_si256()); + _mm256_storeu_si256(M256_CAST(sub_msgs+12), + _mm256_setzero_si256()); +} + +inline void init224(LSH512_AVX2_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + zero_submsgs(ctx); + load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224); +} + +inline void init256(LSH512_AVX2_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + zero_submsgs(ctx); + load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256); +} + +inline void init384(LSH512_AVX2_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + zero_submsgs(ctx); + load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384); +} + +inline void init512(LSH512_AVX2_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + zero_submsgs(ctx); + load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512); +} + +/* -------------------------------------------------------- */ + +inline void fin(LSH512_AVX2_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + + _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)), + _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0)))); + + _mm256_storeu_si256(M256_CAST(ctx->cv_l+4), _mm256_xor_si256( + _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+4)), + _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+4)))); +} + +/* -------------------------------------------------------- */ + +inline void get_hash(LSH512_AVX2_Context* ctx, lsh_u8* pbHashVal) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(ctx->alg_type != 0); + CRYPTOPP_ASSERT(pbHashVal != NULLPTR); + + lsh_uint alg_type = ctx->alg_type; + lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type); + lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type); + + // Multiplying by sizeof(lsh_u8) looks odd... + memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len); + if (hash_val_bit_len){ + pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len); + } +} + +/* -------------------------------------------------------- */ + +lsh_err lsh512_init_avx2(LSH512_AVX2_Context* ctx) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(ctx->alg_type != 0); + + lsh_u32 alg_type = ctx->alg_type; + const lsh_u64* const_v = NULL; + ctx->remain_databitlen = 0; + + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. 
+ AVX_Cleanup cleanup; + + switch (alg_type){ + case LSH_TYPE_512_512: + init512(ctx); + return LSH_SUCCESS; + case LSH_TYPE_512_384: + init384(ctx); + return LSH_SUCCESS; + case LSH_TYPE_512_256: + init256(ctx); + return LSH_SUCCESS; + case LSH_TYPE_512_224: + init224(ctx); + return LSH_SUCCESS; + default: + break; + } + + lsh_u64* cv_l = ctx->cv_l; + lsh_u64* cv_r = ctx->cv_r; + + zero_iv(cv_l, cv_r); + cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN; + cv_l[1] = LSH_GET_HASHBIT(alg_type); + + for (size_t i = 0; i < NUM_STEPS / 2; i++) + { + //Mix + load_sc(&const_v, i * 16); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + + load_sc(&const_v, i * 16 + 8); + mix(cv_l, cv_r, const_v); + word_perm(cv_l, cv_r); + } + + return LSH_SUCCESS; +} + +lsh_err lsh512_update_avx2(LSH512_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(data != NULLPTR); + CRYPTOPP_ASSERT(databitlen % 8 == 0); + CRYPTOPP_ASSERT(ctx->alg_type != 0); + + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. + AVX_Cleanup cleanup; + + if (databitlen == 0){ + return LSH_SUCCESS; + } + + // We are byte oriented. tail bits will always be 0. + size_t databytelen = databitlen >> 3; + // lsh_uint pos2 = databitlen & 0x7; + const size_t pos2 = 0; + + size_t remain_msg_byte = static_cast(ctx->remain_databitlen >> 3); + // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; + const size_t remain_msg_bit = 0; + + if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){ + return LSH_ERR_INVALID_STATE; + } + if (remain_msg_bit > 0){ + return LSH_ERR_INVALID_DATABITLEN; + } + + if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){ + memcpy(ctx->last_block + remain_msg_byte, data, databytelen); + ctx->remain_databitlen += (lsh_uint)databitlen; + remain_msg_byte += (lsh_uint)databytelen; + if (pos2){ + ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff); + } + return LSH_SUCCESS; + } + + if (remain_msg_byte > 0){ + size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte; + memcpy(ctx->last_block + remain_msg_byte, data, more_byte); + compress(ctx, ctx->last_block); + data += more_byte; + databytelen -= more_byte; + remain_msg_byte = 0; + ctx->remain_databitlen = 0; + } + + while (databytelen >= LSH512_MSG_BLK_BYTE_LEN) + { + // This call to compress caused some trouble. + // The data pointer can become unaligned in the + // previous block. + compress(ctx, data); + data += LSH512_MSG_BLK_BYTE_LEN; + databytelen -= LSH512_MSG_BLK_BYTE_LEN; + } + + if (databytelen > 0){ + memcpy(ctx->last_block, data, databytelen); + ctx->remain_databitlen = (lsh_uint)(databytelen << 3); + } + + if (pos2){ + ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff); + ctx->remain_databitlen += pos2; + } + return LSH_SUCCESS; +} + +lsh_err lsh512_final_avx2(LSH512_AVX2_Context* ctx, lsh_u8* hashval) +{ + CRYPTOPP_ASSERT(ctx != NULLPTR); + CRYPTOPP_ASSERT(hashval != NULLPTR); + + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. + AVX_Cleanup cleanup; + + // We are byte oriented. tail bits will always be 0. 
+	size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
+	// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
+	const size_t remain_msg_bit = 0;
+
+	if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
+		return LSH_ERR_INVALID_STATE;
+	}
+
+	if (remain_msg_bit){
+		ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
+	}
+	else{
+		ctx->last_block[remain_msg_byte] = 0x80;
+	}
+	memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
+
+	compress(ctx, ctx->last_block);
+
+	fin(ctx);
+	get_hash(ctx, hashval);
+
+	return LSH_SUCCESS;
+}
+
+ANONYMOUS_NAMESPACE_END
+
+NAMESPACE_BEGIN(CryptoPP)
+
+extern
+void LSH512_Base_Restart_AVX2(word64* state)
+{
+	state[RemainingBits] = 0;
+	LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
+	lsh_err err = lsh512_init_avx2(&ctx);
+
+	if (err != LSH_SUCCESS)
+		throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_avx2 failed");
+}
+
+extern
+void LSH512_Base_Update_AVX2(word64* state, const byte *input, size_t size)
+{
+	LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
+	lsh_err err = lsh512_update_avx2(&ctx, input, 8*size);
+
+	if (err != LSH_SUCCESS)
+		throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_avx2 failed");
+}
+
+extern
+void LSH512_Base_TruncatedFinal_AVX2(word64* state, byte *hash, size_t)
+{
+	LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
+	lsh_err err = lsh512_final_avx2(&ctx, hash);
+
+	if (err != LSH_SUCCESS)
+		throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_avx2 failed");
+}
+
+NAMESPACE_END
+
+#endif // CRYPTOPP_AVX2_AVAILABLE
diff --git a/lsh512_sse.cpp b/lsh512_sse.cpp
new file mode 100644
index 00000000..8fb89ef7
--- /dev/null
+++ b/lsh512_sse.cpp
@@ -0,0 +1,937 @@
+// lsh.cpp - written and placed in the public domain by Jeffrey Walton
+//           Based on the specification and source code provided by
+//           Korea Internet & Security Agency (KISA) website. Also
+//           see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
+//           and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
+
+// We are hitting some sort of GCC bug in the LSH AVX2 code path.
+// Clang is OK on the AVX2 code path. We believe it is GCC Issue
+// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
+// makes using zeroupper a little tricky.
+
+#include "pch.h"
+#include "config.h"
+
+#include "lsh.h"
+#include "misc.h"
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+# include <emmintrin.h>
+# include <tmmintrin.h>
+#endif
+
+#if defined(CRYPTOPP_XOP_AVAILABLE)
+# include <ammintrin.h>
+#endif
+
+#if defined(__GNUC__) && defined(__amd64__)
+# include <x86intrin.h>
+#endif
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+/* LSH Constants */
+
+const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256;
+// const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048;
+// const unsigned int LSH512_CV_BYTE_LEN = 128;
+const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64;
+
+// const unsigned int MSG_BLK_WORD_LEN = 32;
+const unsigned int CV_WORD_LEN = 16;
+const unsigned int CONST_WORD_LEN = 8;
+const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
+const unsigned int NUM_STEPS = 28;
+
+const unsigned int ROT_EVEN_ALPHA = 23;
+const unsigned int ROT_EVEN_BETA = 59;
+const unsigned int ROT_ODD_ALPHA = 7;
+const unsigned int ROT_ODD_BETA = 3;
+
+const unsigned int LSH_TYPE_512_512 = 0x0010040;
+const unsigned int LSH_TYPE_512_384 = 0x0010030;
+const unsigned int LSH_TYPE_512_256 = 0x0010020;
+const unsigned int LSH_TYPE_512_224 = 0x001001C;
+
+// const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384;
+// const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512;
+
+/* Error Code */
+
+const unsigned int LSH_SUCCESS = 0x0;
+// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
+// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
+const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
+const unsigned int LSH_ERR_INVALID_STATE = 0x2404;
+
+/* Index into our state array */
+
+const unsigned int AlgorithmType = 80;
+const unsigned int RemainingBits = 81;
+
+NAMESPACE_END
+
+NAMESPACE_BEGIN(CryptoPP)
+NAMESPACE_BEGIN(LSH)
+
+// lsh512.cpp
+extern const word64 LSH512_IV224[CV_WORD_LEN];
+extern const word64 LSH512_IV256[CV_WORD_LEN];
+extern const word64 LSH512_IV384[CV_WORD_LEN];
+extern const word64 LSH512_IV512[CV_WORD_LEN];
+extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS];
+
+NAMESPACE_END // LSH
+NAMESPACE_END // Crypto++
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::byte;
+using CryptoPP::word32;
+using CryptoPP::word64;
+using CryptoPP::rotlFixed;
+using CryptoPP::rotlConstant;
+
+using CryptoPP::GetBlock;
+using CryptoPP::LittleEndian;
+using CryptoPP::ConditionalByteReverse;
+using CryptoPP::LITTLE_ENDIAN_ORDER;
+
+using CryptoPP::LSH::LSH512_IV224;
+using CryptoPP::LSH::LSH512_IV256;
+using CryptoPP::LSH::LSH512_IV384;
+using CryptoPP::LSH::LSH512_IV512;
+using CryptoPP::LSH::LSH512_StepConstants;
+
+typedef byte lsh_u8;
+typedef word32 lsh_u32;
+typedef word64 lsh_u64;
+typedef word32 lsh_uint;
+typedef word32 lsh_err;
+typedef word32 lsh_type;
+
+struct LSH512_SSSE3_Context
+{
+	LSH512_SSSE3_Context(word64* state, word64 algType, word64& remainingBitLength) :
+		cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
+		last_block(reinterpret_cast<lsh_u8*>(state+48)),
+		remain_databitlen(remainingBitLength),
+		alg_type(static_cast<lsh_type>(algType)) {}
+
+	lsh_u64* cv_l;  // start of our state block
+	lsh_u64* cv_r;
+	lsh_u64* sub_msgs;
+	lsh_u8*  last_block;
+	lsh_u64& remain_databitlen;
+	lsh_type alg_type;
+};
+
+struct LSH512_SSSE3_Internal
+{
+	LSH512_SSSE3_Internal(word64* state) :
+		submsg_e_l(state+16), submsg_e_r(state+24),
+		submsg_o_l(state+32), submsg_o_r(state+40) { }
+
+	lsh_u64* submsg_e_l; /* even left sub-message */
+	lsh_u64* submsg_e_r; /* even right sub-message */
+	lsh_u64* submsg_o_l; /* odd left sub-message */
+	lsh_u64* submsg_o_r; /* odd right
sub-message */ +}; + +const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 }; + +/* LSH AlgType Macro */ + +inline bool LSH_IS_LSH512(lsh_uint val) { + return (val & 0xf0000) == 0x10000; +} + +inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { + return val >> 24; +} + +inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { + return val & 0xffff; +} + +inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { + return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); +} + +inline lsh_u64 loadLE64(lsh_u64 v) { + return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v); +} + +lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) { + return rotlFixed(x, r); +} + +// Original code relied upon unaligned lsh_u64 buffer +inline void load_msg_blk(LSH512_SSSE3_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN]) +{ + lsh_u64* submsg_e_l = i_state->submsg_e_l; + lsh_u64* submsg_e_r = i_state->submsg_e_r; + lsh_u64* submsg_o_l = i_state->submsg_o_l; + lsh_u64* submsg_o_r = i_state->submsg_o_r; + + _mm_storeu_si128(M128_CAST(submsg_e_l+0), + _mm_loadu_si128(CONST_M128_CAST(msgblk+0))); + _mm_storeu_si128(M128_CAST(submsg_e_l+2), + _mm_loadu_si128(CONST_M128_CAST(msgblk+16))); + _mm_storeu_si128(M128_CAST(submsg_e_l+4), + _mm_loadu_si128(CONST_M128_CAST(msgblk+32))); + _mm_storeu_si128(M128_CAST(submsg_e_l+6), + _mm_loadu_si128(CONST_M128_CAST(msgblk+48))); + + _mm_storeu_si128(M128_CAST(submsg_e_r+0), + _mm_loadu_si128(CONST_M128_CAST(msgblk+64))); + _mm_storeu_si128(M128_CAST(submsg_e_r+2), + _mm_loadu_si128(CONST_M128_CAST(msgblk+80))); + _mm_storeu_si128(M128_CAST(submsg_e_r+4), + _mm_loadu_si128(CONST_M128_CAST(msgblk+96))); + _mm_storeu_si128(M128_CAST(submsg_e_r+6), + _mm_loadu_si128(CONST_M128_CAST(msgblk+112))); + + _mm_storeu_si128(M128_CAST(submsg_o_l+0), + _mm_loadu_si128(CONST_M128_CAST(msgblk+128))); + _mm_storeu_si128(M128_CAST(submsg_o_l+2), + _mm_loadu_si128(CONST_M128_CAST(msgblk+144))); + _mm_storeu_si128(M128_CAST(submsg_o_l+4), + _mm_loadu_si128(CONST_M128_CAST(msgblk+160))); + _mm_storeu_si128(M128_CAST(submsg_o_l+6), + _mm_loadu_si128(CONST_M128_CAST(msgblk+176))); + + _mm_storeu_si128(M128_CAST(submsg_o_r+0), + _mm_loadu_si128(CONST_M128_CAST(msgblk+192))); + _mm_storeu_si128(M128_CAST(submsg_o_r+2), + _mm_loadu_si128(CONST_M128_CAST(msgblk+208))); + _mm_storeu_si128(M128_CAST(submsg_o_r+4), + _mm_loadu_si128(CONST_M128_CAST(msgblk+224))); + _mm_storeu_si128(M128_CAST(submsg_o_r+6), + _mm_loadu_si128(CONST_M128_CAST(msgblk+240))); +} + +inline void msg_exp_even(LSH512_SSSE3_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u64* submsg_e_l = i_state->submsg_e_l; + lsh_u64* submsg_e_r = i_state->submsg_e_r; + lsh_u64* submsg_o_l = i_state->submsg_o_l; + lsh_u64* submsg_o_r = i_state->submsg_o_r; + + __m128i temp; + _mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)), _MM_SHUFFLE(1,0,3,2))); + + temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)); + _mm_storeu_si128(M128_CAST(submsg_e_l+0), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2))); + _mm_storeu_si128(M128_CAST(submsg_e_l+2), temp); + _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), _MM_SHUFFLE(1,0,3,2))); + + temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)); + _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_unpacklo_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); + _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_unpackhi_epi64( 
+ temp, _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)))); + _mm_storeu_si128(M128_CAST(submsg_e_r+2), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)), _MM_SHUFFLE(1,0,3,2))); + + temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)); + _mm_storeu_si128(M128_CAST(submsg_e_r+0), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2))); + _mm_storeu_si128(M128_CAST(submsg_e_r+2), temp); + _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), _MM_SHUFFLE(1,0,3,2))); + + temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)); + _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_unpacklo_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); + _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_unpackhi_epi64( + temp, _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)))); + + _mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)))); + _mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)))); + _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); + _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)))); + + _mm_storeu_si128(M128_CAST(submsg_e_r+0), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)))); + _mm_storeu_si128(M128_CAST(submsg_e_r+2), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)))); + _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); + _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)))); +} + +inline void msg_exp_odd(LSH512_SSSE3_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u64* submsg_e_l = i_state->submsg_e_l; + lsh_u64* submsg_e_r = i_state->submsg_e_r; + lsh_u64* submsg_o_l = i_state->submsg_o_l; + lsh_u64* submsg_o_r = i_state->submsg_o_r; + + __m128i temp; + _mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)), _MM_SHUFFLE(1,0,3,2))); + + temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)); + _mm_storeu_si128(M128_CAST(submsg_o_l+0), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2))); + _mm_storeu_si128(M128_CAST(submsg_o_l+2), temp); + _mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), _MM_SHUFFLE(1,0,3,2))); + + temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)); + _mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_unpacklo_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)))); + _mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_unpackhi_epi64( + temp, _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)))); + _mm_storeu_si128(M128_CAST(submsg_o_r+2), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)), _MM_SHUFFLE(1,0,3,2))); + + temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)); + _mm_storeu_si128(M128_CAST(submsg_o_r+0), + 
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2))); + _mm_storeu_si128(M128_CAST(submsg_o_r+2), temp); + _mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_shuffle_epi32( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), _MM_SHUFFLE(1,0,3,2))); + + temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)); + _mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_unpacklo_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)))); + _mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_unpackhi_epi64( + temp, _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)))); + + _mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)))); + _mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)))); + _mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)))); + _mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)))); + + _mm_storeu_si128(M128_CAST(submsg_o_r+0), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)))); + _mm_storeu_si128(M128_CAST(submsg_o_r+2), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)))); + _mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)))); + _mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_add_epi64( + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), + _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)))); +} + +inline void load_sc(const lsh_u64** p_const_v, size_t i) +{ + *p_const_v = &LSH512_StepConstants[i]; +} + +inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_SSSE3_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u64* submsg_e_l = i_state->submsg_e_l; + lsh_u64* submsg_e_r = i_state->submsg_e_r; + + _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_l)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l)))); + _mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_r)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r)))); + _mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)))); + _mm_storeu_si128(M128_CAST(cv_r+2), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)))); + _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); + _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); + _mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)))); + _mm_storeu_si128(M128_CAST(cv_r+6), _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(cv_r+6)), + _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)))); +} + +inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_SSSE3_Internal* i_state) +{ + CRYPTOPP_ASSERT(i_state != NULLPTR); + + lsh_u64* 
submsg_o_l = i_state->submsg_o_l;
+	lsh_u64* submsg_o_r = i_state->submsg_o_r;
+
+	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
+		_mm_loadu_si128(CONST_M128_CAST(submsg_o_l))));
+	_mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_r)),
+		_mm_loadu_si128(CONST_M128_CAST(submsg_o_r))));
+	_mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
+		_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2))));
+	_mm_storeu_si128(M128_CAST(cv_r+2), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+2)),
+		_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2))));
+	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
+		_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4))));
+	_mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
+		_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4))));
+	_mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
+		_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6))));
+	_mm_storeu_si128(M128_CAST(cv_r+6), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+6)),
+		_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6))));
+}
+
+inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
+{
+	_mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi64(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
+		_mm_loadu_si128(CONST_M128_CAST(cv_r))));
+	_mm_storeu_si128(M128_CAST(cv_l+2), _mm_add_epi64(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+2))));
+	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi64(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+4))));
+	_mm_storeu_si128(M128_CAST(cv_l+6), _mm_add_epi64(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+6))));
+}
+
+template <unsigned int R>
+inline void rotate_blk(lsh_u64 cv[8])
+{
+#if defined(CRYPTOPP_XOP_AVAILABLE)
+	_mm_storeu_si128(M128_CAST(cv),
+		_mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R));
+	_mm_storeu_si128(M128_CAST(cv+2),
+		_mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), R));
+	_mm_storeu_si128(M128_CAST(cv+4),
+		_mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R));
+	_mm_storeu_si128(M128_CAST(cv+6),
+		_mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R));
+
+#else
+	_mm_storeu_si128(M128_CAST(cv), _mm_or_si128(
+		_mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R),
+		_mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), 64-R)));
+	_mm_storeu_si128(M128_CAST(cv+2), _mm_or_si128(
+		_mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), R),
+		_mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), 64-R)));
+	_mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128(
+		_mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R),
+		_mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), 64-R)));
+	_mm_storeu_si128(M128_CAST(cv+6), _mm_or_si128(
+		_mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R),
+		_mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), 64-R)));
+#endif
+}
+
+inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8])
+{
+	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
+		_mm_loadu_si128(CONST_M128_CAST(const_v))));
+	_mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
+		_mm_loadu_si128(CONST_M128_CAST(const_v+2))));
+	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
		_mm_loadu_si128(CONST_M128_CAST(const_v+4))));
+	_mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
+		_mm_loadu_si128(CONST_M128_CAST(const_v+6))));
+}
+
+inline void rotate_msg_gamma(lsh_u64 cv_r[8])
+{
+	// g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
+	_mm_storeu_si128(M128_CAST(cv_r+0),
+		_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
+			_mm_set_epi8(13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0)));
+	_mm_storeu_si128(M128_CAST(cv_r+2),
+		_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+2)),
+			_mm_set_epi8(9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4)));
+
+	_mm_storeu_si128(M128_CAST(cv_r+4),
+		_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
+			_mm_set_epi8(12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7)));
+	_mm_storeu_si128(M128_CAST(cv_r+6),
+		_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+6)),
+			_mm_set_epi8(8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3)));
+}
+
+inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
+{
+	__m128i temp[2];
+	temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
+	_mm_storeu_si128(M128_CAST(cv_l+0), _mm_unpacklo_epi64(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+0))));
+	_mm_storeu_si128(M128_CAST(cv_l+2), _mm_unpackhi_epi64(
+		temp[0], _mm_loadu_si128(CONST_M128_CAST(cv_l+2))));
+
+	temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+4));
+	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_unpacklo_epi64(
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+4))));
+	_mm_storeu_si128(M128_CAST(cv_l+6), _mm_unpackhi_epi64(
+		temp[0], _mm_loadu_si128(CONST_M128_CAST(cv_l+6))));
+	_mm_storeu_si128(M128_CAST(cv_r+2), _mm_shuffle_epi32(
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+2)), _MM_SHUFFLE(1,0,3,2)));
+
+	temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_r+0));
+	_mm_storeu_si128(M128_CAST(cv_r+0), _mm_unpacklo_epi64(
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+2))));
+	_mm_storeu_si128(M128_CAST(cv_r+2), _mm_unpackhi_epi64(
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+2)), temp[0]));
+	_mm_storeu_si128(M128_CAST(cv_r+6), _mm_shuffle_epi32(
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+6)), _MM_SHUFFLE(1,0,3,2)));
+
+	temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_r+4));
+	_mm_storeu_si128(M128_CAST(cv_r+4), _mm_unpacklo_epi64(
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+6))));
+	_mm_storeu_si128(M128_CAST(cv_r+6), _mm_unpackhi_epi64(
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+6)), temp[0]));
+
+	temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
+	temp[1] = _mm_loadu_si128(CONST_M128_CAST(cv_l+2));
+
+	_mm_storeu_si128(M128_CAST(cv_l+0),
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)));
+	_mm_storeu_si128(M128_CAST(cv_l+2),
+		_mm_loadu_si128(CONST_M128_CAST(cv_l+6)));
+	_mm_storeu_si128(M128_CAST(cv_l+4),
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+4)));
+	_mm_storeu_si128(M128_CAST(cv_l+6),
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+6)));
+	_mm_storeu_si128(M128_CAST(cv_r+4),
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+0)));
+	_mm_storeu_si128(M128_CAST(cv_r+6),
+		_mm_loadu_si128(CONST_M128_CAST(cv_r+2)));
+
+	_mm_storeu_si128(M128_CAST(cv_r+0), temp[0]);
+	_mm_storeu_si128(M128_CAST(cv_r+2), temp[1]);
+};
+
+/* -------------------------------------------------------- *
+* step function
+* -------------------------------------------------------- */
+
+template <unsigned int Alpha, unsigned int Beta>
+inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8])
+{
+	add_blk(cv_l, cv_r);
+	rotate_blk<Alpha>(cv_l);
+	xor_with_const(cv_l, const_v);
+	add_blk(cv_r, cv_l);
+	rotate_blk<Beta>(cv_r);
+	add_blk(cv_l, cv_r);
+	rotate_msg_gamma(cv_r);
+}
+
+/* -------------------------------------------------------- *
+* compression function
+* -------------------------------------------------------- */
+
+inline void compress(LSH512_SSSE3_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN])
+{
+	CRYPTOPP_ASSERT(ctx != NULLPTR);
+
+	LSH512_SSSE3_Internal s_state(ctx->cv_l);
+	LSH512_SSSE3_Internal* i_state = &s_state;
+
+	const lsh_u64* const_v = NULL;
+	lsh_u64 *cv_l = ctx->cv_l;
+	lsh_u64 *cv_r = ctx->cv_r;
+
+	load_msg_blk(i_state, pdMsgBlk);
+
+	msg_add_even(cv_l, cv_r, i_state);
+	load_sc(&const_v, 0);
+	mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
+	word_perm(cv_l, cv_r);
+
+	msg_add_odd(cv_l, cv_r, i_state);
+	load_sc(&const_v, 8);
+	mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
+	word_perm(cv_l, cv_r);
+
+	for (size_t i = 1; i < NUM_STEPS / 2; i++)
+	{
+		msg_exp_even(i_state);
+		msg_add_even(cv_l, cv_r, i_state);
+		load_sc(&const_v, 16 * i);
+		mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
+		word_perm(cv_l, cv_r);
+
+		msg_exp_odd(i_state);
+		msg_add_odd(cv_l, cv_r, i_state);
+		load_sc(&const_v, 16 * i + 8);
+		mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
+		word_perm(cv_l, cv_r);
+	}
+
+	msg_exp_even(i_state);
+	msg_add_even(cv_l, cv_r, i_state);
+}
+
+/* -------------------------------------------------------- */
+
+inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16])
+{
+	// The IV's are 32-byte aligned so we can use aligned loads.
+	_mm_storeu_si128(M128_CAST(cv_l+0),
+		_mm_load_si128(CONST_M128_CAST(iv+0)));
+	_mm_storeu_si128(M128_CAST(cv_l+2),
+		_mm_load_si128(CONST_M128_CAST(iv+2)));
+	_mm_storeu_si128(M128_CAST(cv_l+4),
+		_mm_load_si128(CONST_M128_CAST(iv+4)));
+	_mm_storeu_si128(M128_CAST(cv_l+6),
+		_mm_load_si128(CONST_M128_CAST(iv+6)));
+	_mm_storeu_si128(M128_CAST(cv_r+0),
+		_mm_load_si128(CONST_M128_CAST(iv+8)));
+	_mm_storeu_si128(M128_CAST(cv_r+2),
+		_mm_load_si128(CONST_M128_CAST(iv+10)));
+	_mm_storeu_si128(M128_CAST(cv_r+4),
+		_mm_load_si128(CONST_M128_CAST(iv+12)));
+	_mm_storeu_si128(M128_CAST(cv_r+6),
+		_mm_load_si128(CONST_M128_CAST(iv+14)));
+}
+
+inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
+{
+	_mm_storeu_si128(M128_CAST(cv_l+0), _mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(cv_l+2), _mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(cv_l+6), _mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(cv_r+0), _mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(cv_r+2), _mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(cv_r+4), _mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(cv_r+6), _mm_setzero_si128());
+}
+
+inline void zero_submsgs(LSH512_SSSE3_Context* ctx)
+{
+	lsh_u64* sub_msgs = ctx->sub_msgs;
+
+	_mm_storeu_si128(M128_CAST(sub_msgs+ 0),
+		_mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(sub_msgs+ 2),
+		_mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(sub_msgs+ 4),
+		_mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(sub_msgs+ 6),
+		_mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(sub_msgs+ 8),
+		_mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(sub_msgs+10),
+		_mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(sub_msgs+12),
+		_mm_setzero_si128());
+	_mm_storeu_si128(M128_CAST(sub_msgs+14),
+		_mm_setzero_si128());
+}
+
+inline void init224(LSH512_SSSE3_Context* ctx)
+{
+	CRYPTOPP_ASSERT(ctx != NULLPTR);
+
+	zero_submsgs(ctx);
+	load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224);
+}
+
+inline void init256(LSH512_SSSE3_Context* ctx)
+{
+	CRYPTOPP_ASSERT(ctx != NULLPTR);
+
+	zero_submsgs(ctx);
+	load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256);
+}
+
+inline void init384(LSH512_SSSE3_Context* ctx)
+{
+	CRYPTOPP_ASSERT(ctx != NULLPTR);
+
+	zero_submsgs(ctx);
+	load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384);
+}
+
+inline void init512(LSH512_SSSE3_Context* ctx)
+{
+	CRYPTOPP_ASSERT(ctx != NULLPTR);
+
+	zero_submsgs(ctx);
+	load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512);
+}
+
+/* -------------------------------------------------------- */
+
+inline void fin(LSH512_SSSE3_Context* ctx)
+{
+	CRYPTOPP_ASSERT(ctx != NULLPTR);
+
+	_mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)),
+		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0))));
+	_mm_storeu_si128(M128_CAST(ctx->cv_l+2), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+2)),
+		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+2))));
+	_mm_storeu_si128(M128_CAST(ctx->cv_l+4), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+4)),
+		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+4))));
+	_mm_storeu_si128(M128_CAST(ctx->cv_l+6), _mm_xor_si128(
+		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+6)),
+		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+6))));
+}
+
+/* -------------------------------------------------------- */
+
+inline void get_hash(LSH512_SSSE3_Context* ctx, lsh_u8* pbHashVal)
+{
+	CRYPTOPP_ASSERT(ctx != NULLPTR);
+	CRYPTOPP_ASSERT(ctx->alg_type != 0);
+	CRYPTOPP_ASSERT(pbHashVal != NULLPTR);
+
+	lsh_uint alg_type = ctx->alg_type;
+	lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
+	lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);
+
+	// Multiplying by sizeof(lsh_u8) looks odd...
+	memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
+	if (hash_val_bit_len){
+		pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
+	}
+}
+
+/* -------------------------------------------------------- */
+
+lsh_err lsh512_init_ssse3(LSH512_SSSE3_Context* ctx)
+{
+	CRYPTOPP_ASSERT(ctx != NULLPTR);
+	CRYPTOPP_ASSERT(ctx->alg_type != 0);
+
+	lsh_u32 alg_type = ctx->alg_type;
+	const lsh_u64* const_v = NULL;
+	ctx->remain_databitlen = 0;
+
+	switch (alg_type){
+	case LSH_TYPE_512_512:
+		init512(ctx);
+		return LSH_SUCCESS;
+	case LSH_TYPE_512_384:
+		init384(ctx);
+		return LSH_SUCCESS;
+	case LSH_TYPE_512_256:
+		init256(ctx);
+		return LSH_SUCCESS;
+	case LSH_TYPE_512_224:
+		init224(ctx);
+		return LSH_SUCCESS;
+	default:
+		break;
+	}
+
+	lsh_u64* cv_l = ctx->cv_l;
+	lsh_u64* cv_r = ctx->cv_r;
+
+	zero_iv(cv_l, cv_r);
+	cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN;
+	cv_l[1] = LSH_GET_HASHBIT(alg_type);
+
+	for (size_t i = 0; i < NUM_STEPS / 2; i++)
+	{
+		//Mix
+		load_sc(&const_v, i * 16);
+		mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
+		word_perm(cv_l, cv_r);
+
+		load_sc(&const_v, i * 16 + 8);
+		mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
+		word_perm(cv_l, cv_r);
+	}
+
+	return LSH_SUCCESS;
+}
+
+lsh_err lsh512_update_ssse3(LSH512_SSSE3_Context* ctx, const lsh_u8* data, size_t databitlen)
+{
+	CRYPTOPP_ASSERT(ctx != NULLPTR);
+	CRYPTOPP_ASSERT(data != NULLPTR);
+	CRYPTOPP_ASSERT(databitlen % 8 == 0);
+	CRYPTOPP_ASSERT(ctx->alg_type != 0);
+
+	if (databitlen == 0){
+		return LSH_SUCCESS;
+	}
+
+	// We are byte oriented. tail bits will always be 0.
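+	// Note for readers: update buffers partial input in ctx->last_block and only
+	// calls compress() on full 256-byte blocks. Illustrative numbers: with 100
+	// bytes already buffered and 300 new bytes arriving, the code first tops the
+	// buffer up to 256 bytes and compresses it, then stores the remaining 144
+	// bytes and records 144*8 remaining bits for the next call.
+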
+	size_t databytelen = databitlen >> 3;
+	// lsh_uint pos2 = databitlen & 0x7;
+	const size_t pos2 = 0;
+
+	size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
+	// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
+	const size_t remain_msg_bit = 0;
+
+	if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
+		return LSH_ERR_INVALID_STATE;
+	}
+	if (remain_msg_bit > 0){
+		return LSH_ERR_INVALID_DATABITLEN;
+	}
+
+	if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){
+		memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
+		ctx->remain_databitlen += (lsh_uint)databitlen;
+		remain_msg_byte += (lsh_uint)databytelen;
+		if (pos2){
+			ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
+		}
+		return LSH_SUCCESS;
+	}
+
+	if (remain_msg_byte > 0){
+		size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte;
+		memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
+		compress(ctx, ctx->last_block);
+		data += more_byte;
+		databytelen -= more_byte;
+		remain_msg_byte = 0;
+		ctx->remain_databitlen = 0;
+	}
+
+	while (databytelen >= LSH512_MSG_BLK_BYTE_LEN)
+	{
+		// This call to compress caused some trouble.
+		// The data pointer can become unaligned in the
+		// previous block.
+		compress(ctx, data);
+		data += LSH512_MSG_BLK_BYTE_LEN;
+		databytelen -= LSH512_MSG_BLK_BYTE_LEN;
+	}
+
+	if (databytelen > 0){
+		memcpy(ctx->last_block, data, databytelen);
+		ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
+	}
+
+	if (pos2){
+		ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
+		ctx->remain_databitlen += pos2;
+	}
+	return LSH_SUCCESS;
+}
+
+lsh_err lsh512_final_ssse3(LSH512_SSSE3_Context* ctx, lsh_u8* hashval)
+{
+	CRYPTOPP_ASSERT(ctx != NULLPTR);
+	CRYPTOPP_ASSERT(hashval != NULLPTR);
+
+	// We are byte oriented. tail bits will always be 0.
+	size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
+	// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
+	const size_t remain_msg_bit = 0;
+
+	if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
+		return LSH_ERR_INVALID_STATE;
+	}
+
+	if (remain_msg_bit){
+		ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
+	}
+	else{
+		ctx->last_block[remain_msg_byte] = 0x80;
+	}
+	memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
+
+	compress(ctx, ctx->last_block);
+
+	fin(ctx);
+	get_hash(ctx, hashval);
+
+	return LSH_SUCCESS;
+}
+
+ANONYMOUS_NAMESPACE_END
+
+NAMESPACE_BEGIN(CryptoPP)
+
+extern
+void LSH512_Base_Restart_SSSE3(word64* state)
+{
+	state[RemainingBits] = 0;
+	LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
+	lsh_err err = lsh512_init_ssse3(&ctx);
+
+	if (err != LSH_SUCCESS)
+		throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_ssse3 failed");
+}
+
+extern
+void LSH512_Base_Update_SSSE3(word64* state, const byte *input, size_t size)
+{
+	LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
+	lsh_err err = lsh512_update_ssse3(&ctx, input, 8*size);
+
+	if (err != LSH_SUCCESS)
+		throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_ssse3 failed");
+}
+
+extern
+void LSH512_Base_TruncatedFinal_SSSE3(word64* state, byte *hash, size_t)
+{
+	LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
+	lsh_err err = lsh512_final_ssse3(&ctx, hash);
+
+	if (err != LSH_SUCCESS)
+		throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_ssse3 failed");
+}
+
+NAMESPACE_END
+
+#endif // CRYPTOPP_SSSE3_AVAILABLE
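Editor's note, not part of the patch: the entry points above are normally driven by the portable LSH512_Base code in lsh512.cpp. The sketch below shows how a caller could exercise them directly, assuming a caller-managed state of at least 82 word64's laid out as the context in this file expects (cv_l | cv_r | sub_msgs | last_block | AlgorithmType at index 80 | RemainingBits at index 81). The declarations and state size are inferred from this file, not taken from lsh.h.

    #include <cstddef>
    #include "config.h"   // CryptoPP::byte, CryptoPP::word64

    namespace CryptoPP {
        // Declarations matching the extern definitions above.
        void LSH512_Base_Restart_SSSE3(word64* state);
        void LSH512_Base_Update_SSSE3(word64* state, const byte* input, size_t size);
        void LSH512_Base_TruncatedFinal_SSSE3(word64* state, byte* hash, size_t size);
    }

    int main()
    {
        using namespace CryptoPP;

        // 82 word64's covers cv_l(8) + cv_r(8) + sub_msgs(32) + last_block(32) + alg + bits.
        word64 state[82] = {0};
        state[80] = 0x0010040;  // AlgorithmType slot, LSH-512-512

        LSH512_Base_Restart_SSSE3(state);

        const byte msg[3] = {'a', 'b', 'c'};
        LSH512_Base_Update_SSSE3(state, msg, 3);     // size in bytes; converted to bits internally

        byte digest[64];
        LSH512_Base_TruncatedFinal_SSSE3(state, digest, 64);
        return 0;
    }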