Add LSH dynamic dispatch (PR #1032)

This commit adds dynamic dispatch to LSH. The implementation pivots on AVX2 and SSSE3: the AVX2 and SSSE3 code paths move into their own translation units (lsh256_avx.cpp, lsh256_sse.cpp, lsh512_avx.cpp, lsh512_sse.cpp), and the hash classes select the best available path at runtime based on CPU features.
Jeffrey Walton 2021-04-26 04:50:48 -04:00 committed by GitHub
parent 21a40abc5c
commit a0e21c77ae
16 changed files with 3667 additions and 1542 deletions
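
For orientation, the wiring is roughly as follows (a hypothetical sketch, not the verbatim lsh256.cpp, whose diff is suppressed below; LSH256_Base_Update_CXX is an assumed name for the straight C++ fallback). The base class keeps its working state in a flat array, with the algorithm type at index 80 and the remaining bit length at index 81, so a single pointer can be handed to whichever translation unit the CPU supports:

// Hypothetical dispatch sketch; the real routing lives in lsh256.cpp/lsh512.cpp.
// HasAVX2()/HasSSSE3() come from cpu.h; the *_AVX2/*_SSSE3 entry points are the
// extern functions defined in lsh256_avx.cpp and lsh256_sse.cpp below.
void LSH256_Base::Update(const byte *input, size_t size)
{
#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
    if (HasAVX2())
        LSH256_Base_Update_AVX2(m_state, input, size);
    else
#endif
#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
    if (HasSSSE3())
        LSH256_Base_Update_SSSE3(m_state, input, size);
    else
#endif
        LSH256_Base_Update_CXX(m_state, input, size);  // assumed name for the C++ fallback
}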


@ -205,7 +205,11 @@ lea.cpp
lea_simd.cpp
lea.h
lsh256.cpp
lsh256_sse.cpp
lsh256_avx.cpp
lsh512.cpp
lsh512_sse.cpp
lsh512_avx.cpp
lsh.h
luc.cpp
luc.h


@ -122,7 +122,9 @@ else ifeq ($(findstring clean,$(MAKECMDGOALS)),clean)
DETECT_FEATURES := 0
else ifeq ($(findstring distclean,$(MAKECMDGOALS)),distclean)
DETECT_FEATURES := 0
else ifeq ($(findstring distclean,$(MAKECMDGOALS)),trim)
else ifeq ($(findstring trim,$(MAKECMDGOALS)),trim)
DETECT_FEATURES := 0
else ifeq ($(findstring zip,$(MAKECMDGOALS)),zip)
DETECT_FEATURES := 0
endif
@ -230,7 +232,7 @@ endif # IS_MINGW
# Newlib needs _XOPEN_SOURCE=600 for signals
TPROG = TestPrograms/test_newlib.cpp
HAVE_OPT = $(shell $(CXX) $(TCXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | wc -w)
HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
ifeq ($(strip $(HAVE_OPT)),0)
ifeq ($(findstring -D_XOPEN_SOURCE,$(CXXFLAGS)),)
CRYPTOPP_CXXFLAGS += -D_XOPEN_SOURCE=600
@ -286,7 +288,9 @@ ifeq ($(DETECT_FEATURES),1)
CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_ASM
endif
# Need SSE2 or higher for these tests
ifneq ($(SSE2_FLAG),)
TPROG = TestPrograms/test_x86_ssse3.cpp
TOPT = $(SSSE3_FLAG)
HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
@ -295,6 +299,8 @@ ifeq ($(DETECT_FEATURES),1)
CHAM_FLAG = $(SSSE3_FLAG)
KECCAK_FLAG = $(SSSE3_FLAG)
LEA_FLAG = $(SSSE3_FLAG)
LSH256_FLAG = $(SSSE3_FLAG)
LSH512_FLAG = $(SSSE3_FLAG)
SIMON128_FLAG = $(SSSE3_FLAG)
SPECK128_FLAG = $(SSSE3_FLAG)
SUN_LDFLAGS += $(SSSE3_FLAG)
@ -302,6 +308,12 @@ ifeq ($(DETECT_FEATURES),1)
SSSE3_FLAG =
endif
# The first Apple MacBooks were Core2's with SSE4.1
ifneq ($(IS_DARWIN),0)
# Add SSE2 algo's here as required
# They get a free upgrade
endif
TPROG = TestPrograms/test_x86_sse41.cpp
TOPT = $(SSE41_FLAG)
HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
@ -360,6 +372,8 @@ ifeq ($(DETECT_FEATURES),1)
HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
ifeq ($(strip $(HAVE_OPT)),0)
CHACHA_AVX2_FLAG = $(AVX2_FLAG)
LSH256_AVX2_FLAG = $(AVX2_FLAG)
LSH512_AVX2_FLAG = $(AVX2_FLAG)
SUN_LDFLAGS += $(AVX2_FLAG)
else
AVX2_FLAG =
@ -420,7 +434,7 @@ ifeq ($(DETECT_FEATURES),1)
# CRYPTOPP_DISABLE_MIXED_ASM is now being added in config_asm.h for all
# Clang compilers. This test will need to be re-enabled if Clang fixes it.
#TPROG = TestPrograms/test_asm_mixed.cpp
#HAVE_OPT = $(shell $(CXX) $(TCXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | wc -w)
#HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
#ifneq ($(strip $(HAVE_OPT)),0)
# CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_MIXED_ASM
#endif
@ -1057,7 +1071,7 @@ endif # Valgrind
# Newlib test due to http://sourceware.org/bugzilla/show_bug.cgi?id=20268
ifneq ($(filter -DDEBUG -DDEBUG=1,$(CXXFLAGS)),)
TPROG = TestPrograms/test_cxx.cpp
USING_GLIBCXX := $(shell $(CXX)$(CXXFLAGS) -E $(TPROG) -o $(TOUT) 2>&1 | $(GREP) -i -c "__GLIBCXX__")
USING_GLIBCXX := $(shell $(CXX)$(CXXFLAGS) -E $(TPROG) -c 2>&1 | $(GREP) -i -c "__GLIBCXX__")
ifneq ($(USING_GLIBCXX),0)
ifeq ($(HAS_NEWLIB),0)
ifeq ($(findstring -D_GLIBCXX_DEBUG,$(CXXFLAGS)),)
@ -1621,6 +1635,22 @@ keccak_simd.o : keccak_simd.cpp
lea_simd.o : lea_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LEA_FLAG) -c) $<
# SSSE3 available
lsh256_sse.o : lsh256_sse.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_FLAG) -c) $<
# AVX2 available
lsh256_avx.o : lsh256_avx.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_AVX2_FLAG) -c) $<
# SSSE3 available
lsh512_sse.o : lsh512_sse.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_FLAG) -c) $<
# AVX2 available
lsh512_avx.o : lsh512_avx.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_AVX2_FLAG) -c) $<
# NEON available
neon_simd.o : neon_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(NEON_FLAG) -c) $<


@ -46,11 +46,11 @@ endif
IS_LINUX := $(shell echo $(MACHINEX) | $(GREP) -i -c "Linux")
# Can be used by Android and Embeeded cross-compiles. Disable by default because
# Can be used by Android and Embedded cross-compiles. Disable by default because
# Android and embedded users typically don't run this configuration.
HAS_SOLIB_VERSION ?= 0
# Formely adhoc.cpp was created from adhoc.cpp.proto when needed.
# Formerly adhoc.cpp was created from adhoc.cpp.proto when needed.
# This is now needed because ISA tests are performed using adhoc.cpp.
ifeq ($(wildcard adhoc.cpp),)
$(shell cp adhoc.cpp.proto adhoc.cpp)
@ -192,9 +192,9 @@ else ifeq ($(findstring clean,$(MAKECMDGOALS)),clean)
DETECT_FEATURES := 0
else ifeq ($(findstring distclean,$(MAKECMDGOALS)),distclean)
DETECT_FEATURES := 0
else ifeq ($(findstring distclean,$(MAKECMDGOALS)),trim)
else ifeq ($(findstring trim,$(MAKECMDGOALS)),trim)
DETECT_FEATURES := 0
else ifeq ($(IS_IOS),1)
else ifeq ($(findstring zip,$(MAKECMDGOALS)),zip)
DETECT_FEATURES := 0
endif
@ -249,6 +249,7 @@ ifeq ($(DETECT_FEATURES),1)
CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_ASM
endif
# Need SSE2 or higher for these tests
ifneq ($(SSE2_FLAG),)
TPROG = TestPrograms/test_x86_ssse3.cpp
TOPT = $(SSSE3_FLAG)
@ -258,20 +259,26 @@ ifeq ($(DETECT_FEATURES),1)
CHAM_FLAG = $(SSSE3_FLAG)
KECCAK_FLAG = $(SSSE3_FLAG)
LEA_FLAG = $(SSSE3_FLAG)
LSH256_FLAG = $(SSSE3_FLAG)
LSH512_FLAG = $(SSSE3_FLAG)
SIMON128_FLAG = $(SSSE3_FLAG)
SPECK128_FLAG = $(SSSE3_FLAG)
SUN_LDFLAGS += $(SSSE3_FLAG)
else
SSSE3_FLAG =
endif
# The first Apple MacBooks were Core2's with SSE4.1
ifneq ($(IS_DARWIN),0)
# Add SSE2 algo's here as required
# They get a free upgrade
endif
TPROG = TestPrograms/test_x86_sse41.cpp
TOPT = $(SSE41_FLAG)
HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
ifeq ($(strip $(HAVE_OPT)),0)
BLAKE2B_FLAG = $(SSE41_FLAG)
BLAKE2S_FLAG = $(SSE41_FLAG)
SUN_LDFLAGS += $(SSE41_FLAG)
else
SSE41_FLAG =
endif
@ -281,7 +288,6 @@ ifeq ($(DETECT_FEATURES),1)
HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
ifeq ($(strip $(HAVE_OPT)),0)
CRC_FLAG = $(SSE42_FLAG)
SUN_LDFLAGS += $(SSE42_FLAG)
else
SSE42_FLAG =
endif
@ -292,7 +298,6 @@ ifeq ($(DETECT_FEATURES),1)
ifeq ($(strip $(HAVE_OPT)),0)
GCM_FLAG = $(SSSE3_FLAG) $(CLMUL_FLAG)
GF2N_FLAG = $(CLMUL_FLAG)
SUN_LDFLAGS += $(CLMUL_FLAG)
else
CLMUL_FLAG =
endif
@ -303,7 +308,6 @@ ifeq ($(DETECT_FEATURES),1)
ifeq ($(strip $(HAVE_OPT)),0)
AES_FLAG = $(SSE41_FLAG) $(AESNI_FLAG)
SM4_FLAG = $(SSSE3_FLAG) $(AESNI_FLAG)
SUN_LDFLAGS += $(AESNI_FLAG)
else
AESNI_FLAG =
endif
@ -313,7 +317,6 @@ ifeq ($(DETECT_FEATURES),1)
HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
ifeq ($(strip $(HAVE_OPT)),0)
# XXX_FLAG = $(AVX_FLAG)
SUN_LDFLAGS += $(AVX_FLAG)
else
AVX_FLAG =
endif
@ -323,7 +326,8 @@ ifeq ($(DETECT_FEATURES),1)
HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
ifeq ($(strip $(HAVE_OPT)),0)
CHACHA_AVX2_FLAG = $(AVX2_FLAG)
SUN_LDFLAGS += $(AVX2_FLAG)
LSH256_AVX2_FLAG = $(AVX2_FLAG)
LSH512_AVX2_FLAG = $(AVX2_FLAG)
else
AVX2_FLAG =
endif
@ -333,15 +337,10 @@ ifeq ($(DETECT_FEATURES),1)
HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
ifeq ($(strip $(HAVE_OPT)),0)
SHA_FLAG = $(SSE42_FLAG) $(SHANI_FLAG)
SUN_LDFLAGS += $(SHANI_FLAG)
else
SHANI_FLAG =
endif
ifeq ($(SUN_COMPILER),1)
CRYPTOPP_LDFLAGS += $(SUN_LDFLAGS)
endif
ifeq ($(SSE3_FLAG),)
CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_SSE3
else ifeq ($(SSSE3_FLAG),)
@ -383,7 +382,7 @@ ifeq ($(DETECT_FEATURES),1)
# CRYPTOPP_DISABLE_MIXED_ASM is now being added in config_asm.h for all
# Clang compilers. This test will need to be re-enabled if Clang fixes it.
#TPROG = TestPrograms/test_asm_mixed.cpp
#HAVE_OPT = $(shell $(CXX) $(TCXXFLAGS) $(ZOPT) $(TPROG) -o $(TOUT) 2>&1 | wc -w)
#HAVE_OPT = $(shell $(TCOMMAND) 2>&1 | wc -w)
#ifneq ($(strip $(HAVE_OPT)),0)
# CRYPTOPP_CXXFLAGS += -DCRYPTOPP_DISABLE_MIXED_ASM
#endif
@ -989,6 +988,22 @@ keccak_simd.o : keccak_simd.cpp
lea_simd.o : lea_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LEA_FLAG) -c) $<
# SSSE3 available
lsh256_sse.o : lsh256_sse.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_FLAG) -c) $<
# AVX2 available
lsh256_avx.o : lsh256_avx.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH256_AVX2_FLAG) -c) $<
# SSSE3 available
lsh512_sse.o : lsh512_sse.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_FLAG) -c) $<
# AVX2 available
lsh512_avx.o : lsh512_avx.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(LSH512_AVX2_FLAG) -c) $<
# NEON available
neon_simd.o : neon_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(NEON_FLAG) -c) $<


@ -191,30 +191,4 @@
# pragma GCC diagnostic ignored "-Wunused-function"
#endif
// Requires ifunc support: GCC 4.8, Binutils 2.20.1 and libc 2.11.1.
// Should work for Clang 7 and above: https://stackoverflow.com/q/39958935,
// but fails with Clang 10: https://bugs.llvm.org/show_bug.cgi?id=50025.
// Should work with GCC 4.8.4 and 7.5.0 but does not:
// https://travis-ci.org/github/noloader/cryptopp-cmake/jobs/767701720 and
// https://travis-ci.org/github/noloader/cryptopp/jobs/767704226.
// Not available on Apple and Solaris platforms. Also see
// https://sourceware.org/glibc/wiki/GNU_IFUNC and
// https://gcc.gnu.org/onlinedocs/gcc/Function-Multiversioning.html.
#if !defined(CRYPTOPP_DISABLE_ASM)
# if defined(__linux__)
# if defined(__i386__) || defined(__i686__) || defined(__amd64__)
# if (CRYPTOPP_GCC_VERSION >= 80000) || (CRYPTOPP_LLVM_CLANG_VERSION >= 130000)
# include <x86intrin.h>
# define CRYPTOPP_HAVE_ATTRIBUTE_TARGET 1
# define CRYPTOPP_TARGET_DEFAULT __attribute__ ((target ("default")))
# define CRYPTOPP_TARGET_SSSE3 __attribute__ ((target ("ssse3")))
# endif
# endif
# endif
#endif
#ifndef CRYPTOPP_TARGET_DEFAULT
# define CRYPTOPP_TARGET_DEFAULT
#endif
#endif // CRYPTOPP_CONFIG_MISC_H


@ -70,20 +70,21 @@ LIB_SRCS = \
gfpcrypt.cpp gost.cpp gzip.cpp hc128.cpp hc256.cpp hex.cpp hight.cpp \
hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp \
kalynatab.cpp keccak.cpp keccak_core.cpp keccak_simd.cpp lea.cpp \
lea_simd.cpp lsh256.cpp lsh512.cpp luc.cpp mars.cpp marss.cpp md2.cpp \
md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp \
oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp \
polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabbit.cpp rabin.cpp \
randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp \
rijndael.cpp rijndael_simd.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp \
safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp sha.cpp \
sha3.cpp sha_simd.cpp shacal2.cpp shacal2_simd.cpp shake.cpp shark.cpp \
sharkbox.cpp simeck.cpp simon.cpp simon128_simd.cpp skipjack.cpp sm3.cpp \
sm4.cpp sm4_simd.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp \
square.cpp squaretb.cpp sse_simd.cpp strciphr.cpp tea.cpp tftables.cpp \
threefish.cpp tiger.cpp tigertab.cpp ttmac.cpp tweetnacl.cpp twofish.cpp \
vmac.cpp wake.cpp whrlpool.cpp xed25519.cpp xtr.cpp xtrcrypt.cpp xts.cpp \
zdeflate.cpp zinflate.cpp zlib.cpp
lea_simd.cpp lsh256.cpp lsh256_avx.cpp lsh256_sse.cpp lsh512.cpp \
lsh512_avx.cpp lsh512_sse.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp \
md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp oaep.cpp \
osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp \
pssr.cpp pubkey.cpp queue.cpp rabbit.cpp rabin.cpp randpool.cpp rc2.cpp \
rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp rijndael_simd.cpp \
ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp scrypt.cpp \
seal.cpp seed.cpp serpent.cpp sha.cpp sha3.cpp sha_simd.cpp shacal2.cpp \
shacal2_simd.cpp shake.cpp shark.cpp sharkbox.cpp simeck.cpp simon.cpp \
simon128_simd.cpp skipjack.cpp sm3.cpp sm4.cpp sm4_simd.cpp \
sosemanuk.cpp speck.cpp speck128_simd.cpp square.cpp squaretb.cpp \
sse_simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp \
tigertab.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wake.cpp \
whrlpool.cpp xed25519.cpp xtr.cpp xtrcrypt.cpp xts.cpp zdeflate.cpp \
zinflate.cpp zlib.cpp
LIB_OBJS = \
cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj \
@ -100,20 +101,21 @@ LIB_OBJS = \
gfpcrypt.obj gost.obj gzip.obj hc128.obj hc256.obj hex.obj hight.obj \
hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj \
kalynatab.obj keccak.obj keccak_core.obj keccak_simd.obj lea.obj \
lea_simd.obj lsh256.obj lsh512.obj luc.obj mars.obj marss.obj md2.obj \
md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj \
oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj \
polynomi.obj pssr.obj pubkey.obj queue.obj rabbit.obj rabin.obj \
randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj \
rijndael.obj rijndael_simd.obj ripemd.obj rng.obj rsa.obj rw.obj \
safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj sha.obj \
sha3.obj sha_simd.obj shacal2.obj shacal2_simd.obj shake.obj shark.obj \
sharkbox.obj simeck.obj simon.obj simon128_simd.obj skipjack.obj sm3.obj \
sm4.obj sm4_simd.obj sosemanuk.obj speck.obj speck128_simd.obj \
square.obj squaretb.obj sse_simd.obj strciphr.obj tea.obj tftables.obj \
threefish.obj tiger.obj tigertab.obj ttmac.obj tweetnacl.obj twofish.obj \
vmac.obj wake.obj whrlpool.obj xed25519.obj xtr.obj xtrcrypt.obj xts.obj \
zdeflate.obj zinflate.obj zlib.obj
lea_simd.obj lsh256.obj lsh256_avx.obj lsh256_sse.obj lsh512.obj \
lsh512_avx.obj lsh512_sse.obj luc.obj mars.obj marss.obj md2.obj md4.obj \
md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj oaep.obj \
osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj \
pssr.obj pubkey.obj queue.obj rabbit.obj rabin.obj randpool.obj rc2.obj \
rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael.obj rijndael_simd.obj \
ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj scrypt.obj \
seal.obj seed.obj serpent.obj sha.obj sha3.obj sha_simd.obj shacal2.obj \
shacal2_simd.obj shake.obj shark.obj sharkbox.obj simeck.obj simon.obj \
simon128_simd.obj skipjack.obj sm3.obj sm4.obj sm4_simd.obj \
sosemanuk.obj speck.obj speck128_simd.obj square.obj squaretb.obj \
sse_simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj \
tigertab.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wake.obj \
whrlpool.obj xed25519.obj xtr.obj xtrcrypt.obj xts.obj zdeflate.obj \
zinflate.obj zlib.obj
ASM_OBJS = \
rdrand-x86.obj rdrand-x64.obj rdseed-x86.obj rdseed-x64.obj x64masm.obj x64dll.obj
@ -311,6 +313,10 @@ x64dll.obj: x64dll.asm
!IF "$(PLATFORM)" == "x64" || "$(PLATFORM)" == "X64" || "$(PLATFORM)" == "amd64" || "$(PLATFORM)" == "x86" || "$(PLATFORM)" == "X86"
chacha_avx.obj:
$(CXX) $(CXXFLAGS) /arch:AVX /c chacha_avx.cpp
lsh256_avx.obj:
$(CXX) $(CXXFLAGS) /arch:AVX /c lsh256_avx.cpp
lsh512_avx.obj:
$(CXX) $(CXXFLAGS) /arch:AVX /c lsh512_avx.cpp
!endif
# For testing cryptopp.dll and CRYPTOPP_IMPORTS


@ -263,7 +263,11 @@
<ClCompile Include="lea.cpp" />
<ClCompile Include="lea_simd.cpp" />
<ClCompile Include="lsh256.cpp" />
<ClCompile Include="lsh256_sse.cpp" />
<ClCompile Include="lsh256_avx.cpp" />
<ClCompile Include="lsh512.cpp" />
<ClCompile Include="lsh512_sse.cpp" />
<ClCompile Include="lsh512_avx.cpp" />
<ClCompile Include="luc.cpp" />
<ClCompile Include="mars.cpp" />
<ClCompile Include="marss.cpp" />


@ -275,9 +275,21 @@
<ClCompile Include="lsh256.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="lsh256_sse.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="lsh256_avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="lsh512.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="lsh512_sse.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="lsh512_avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="luc.cpp">
<Filter>Source Files</Filter>
</ClCompile>


@ -15,7 +15,7 @@
// GCC inline assembly or the builtin will fail the compile.
// Inline assembler available in GCC 3.2 or above. For practical
// purposes we check for GCC 4.0 or above. GCC imposters claim
// purposes we check for GCC 4.0 or above. GCC impostors claim
// to be GCC 4.2.1 so it will capture them, too. We exclude the
// Apple machines because they are not Power9 and use a slightly
// different syntax in their assembler.


@ -241,15 +241,15 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo
}
else if (s1.substr(0, 2) == "0x")
{
std::string::size_type pos = s1.find(' ');
StringSource(s1.substr(2, pos), true, new HexDecoder(new StringSink(s2)));
s1 = s1.substr(STDMIN(pos, s1.length()));
std::string::size_type n = s1.find(' ');
StringSource(s1.substr(2, n), true, new HexDecoder(new StringSink(s2)));
s1 = s1.substr(STDMIN(n, s1.length()));
}
else
{
std::string::size_type pos = s1.find(' ');
StringSource(s1.substr(0, pos), true, new HexDecoder(new StringSink(s2)));
s1 = s1.substr(STDMIN(pos, s1.length()));
std::string::size_type n = s1.find(' ');
StringSource(s1.substr(0, n), true, new HexDecoder(new StringSink(s2)));
s1 = s1.substr(STDMIN(n, s1.length()));
}
while (repeat--)
@ -850,8 +850,8 @@ void TestSymmetricCipherWithFileSource(TestData &v, const NameValuePairs &overri
if (encrypted != ciphertext)
{
std::cout << "\nincorrectly encrypted: ";
StringSource ss(encrypted, false, new HexEncoder(new FileSink(std::cout)));
ss.Pump(2048); ss.Flush(false);
StringSource sss(encrypted, false, new HexEncoder(new FileSink(std::cout)));
sss.Pump(2048); sss.Flush(false);
std::cout << "\n";
SignalTestFailure();
}
@ -867,8 +867,8 @@ void TestSymmetricCipherWithFileSource(TestData &v, const NameValuePairs &overri
if (decrypted != plaintext)
{
std::cout << "\nincorrectly decrypted: ";
StringSource ss(decrypted, false, new HexEncoder(new FileSink(std::cout)));
ss.Pump(256); ss.Flush(false);
StringSource sss(decrypted, false, new HexEncoder(new FileSink(std::cout)));
sss.Pump(256); sss.Flush(false);
std::cout << "\n";
SignalTestFailure();
}

lsh.h (31 lines changed)

@ -4,6 +4,11 @@
// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
// We are hitting some sort of GCC bug in the LSH AVX2 code path.
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
// makes using zeroupper a little tricky.
/// \file lsh.h
/// \brief Classes for the LSH hash functions
/// \since Crypto++ 8.6
@ -15,6 +20,12 @@
#include "cryptlib.h"
#include "secblock.h"
// Enable SSE2 and AVX2 for 64-bit machines.
// 32-bit machines slow down with SSE2.
#if (CRYPTOPP_BOOL_X32) || (CRYPTOPP_BOOL_X64)
# define CRYPTOPP_ENABLE_64BIT_SSE 1
#endif
NAMESPACE_BEGIN(CryptoPP)
/// \brief LSH-224 and LSH-256 hash base class
@ -34,14 +45,14 @@ public:
unsigned int OptimalDataAlignment() const { return GetAlignmentOf<word32>(); }
void Restart();
void Update(const byte *input, size_t length);
void Update(const byte *input, size_t size);
void TruncatedFinal(byte *hash, size_t size);
std::string AlgorithmProvider() const;
protected:
LSH256_Base(unsigned int algType, unsigned int digestSize)
: m_algType(algType), m_digestSize(digestSize) {}
: m_digestSize(digestSize) { m_state[80] = algType; }
protected:
// Working state is:
@ -52,8 +63,10 @@ protected:
// * submsg_o_l = 8 32-bit words
// * submsg_o_r = 8 32-bit words
// * last_block = 32 32-bit words (128 bytes)
FixedSizeSecBlock<word32, 80> m_state;
word32 m_algType, m_remainingBitLength;
// * algType
// * remainingBitLength
FixedSizeSecBlock<word32, 80+2> m_state;
// word32 m_algType, m_remainingBitLength;
word32 m_digestSize;
};
@ -132,14 +145,14 @@ public:
unsigned int OptimalDataAlignment() const { return GetAlignmentOf<word64>(); }
void Restart();
void Update(const byte *input, size_t length);
void Update(const byte *input, size_t size);
void TruncatedFinal(byte *hash, size_t size);
std::string AlgorithmProvider() const;
protected:
LSH512_Base(unsigned int algType, unsigned int digestSize)
: m_algType(algType), m_digestSize(digestSize) {}
: m_digestSize(digestSize) { m_state[80] = algType; }
protected:
// Working state is:
@ -150,8 +163,10 @@ protected:
// * submsg_o_l = 8 64-bit words
// * submsg_o_r = 8 64-bit words
// * last_block = 32 64-bit words (256 bytes)
FixedSizeSecBlock<word64, 80> m_state;
word32 m_algType, m_remainingBitLength;
// * algType
// * remainingBitLength
FixedSizeSecBlock<word64, 80+2> m_state;
// word32 m_algType, m_remainingBitLength;
word32 m_digestSize;
};
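
Nothing changes for callers; the runtime selection is internal to the hash classes. A minimal usage sketch, assuming the LSH256 class declared in this header:

#include "lsh.h"
#include "filters.h"
#include "hex.h"
#include <iostream>
#include <string>

int main()
{
    CryptoPP::LSH256 hash;   // AVX2/SSSE3/C++ path is chosen internally at runtime
    std::string digest;
    // Hash a short message and hex-encode the result
    CryptoPP::StringSource ss("Yoda said, Do or do not. There is no try.", true,
        new CryptoPP::HashFilter(hash,
            new CryptoPP::HexEncoder(
                new CryptoPP::StringSink(digest))));
    std::cout << "LSH-256: " << digest << std::endl;
    return 0;
}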

File diff suppressed because it is too large.

lsh256_avx.cpp (new file, 647 lines)

@ -0,0 +1,647 @@
// lsh.cpp - written and placed in the public domain by Jeffrey Walton
// Based on the specification and source code provided by
// Korea Internet & Security Agency (KISA) website. Also
// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
// We are hitting some sort of GCC bug in the LSH AVX2 code path.
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
// makes using zeroupper a little tricky.
#include "pch.h"
#include "config.h"
#include "lsh.h"
#include "misc.h"
#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <emmintrin.h>
# include <immintrin.h>
#endif
#if defined(__GNUC__) && defined(__amd64__)
# include <x86intrin.h>
#endif
ANONYMOUS_NAMESPACE_BEGIN
/* LSH Constants */
const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128;
// const unsigned int LSH256_MSG_BLK_BIT_LEN = 1024;
// const unsigned int LSH256_CV_BYTE_LEN = 64;
const unsigned int LSH256_HASH_VAL_MAX_BYTE_LEN = 32;
// const unsigned int MSG_BLK_WORD_LEN = 32;
const unsigned int CV_WORD_LEN = 16;
const unsigned int CONST_WORD_LEN = 8;
const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
// const unsigned int WORD_BIT_LEN = 32;
const unsigned int NUM_STEPS = 26;
const unsigned int ROT_EVEN_ALPHA = 29;
const unsigned int ROT_EVEN_BETA = 1;
const unsigned int ROT_ODD_ALPHA = 5;
const unsigned int ROT_ODD_BETA = 17;
const unsigned int LSH_TYPE_256_256 = 0x0000020;
const unsigned int LSH_TYPE_256_224 = 0x000001C;
// const unsigned int LSH_TYPE_224 = LSH_TYPE_256_224;
// const unsigned int LSH_TYPE_256 = LSH_TYPE_256_256;
/* Error Code */
const unsigned int LSH_SUCCESS = 0x0;
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;
/* Index into our state array */
const unsigned int AlgorithmType = 80;
const unsigned int RemainingBits = 81;
NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(LSH)
// lsh256.cpp
extern const word32 LSH256_IV224[CV_WORD_LEN];
extern const word32 LSH256_IV256[CV_WORD_LEN];
extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS];
NAMESPACE_END // LSH
NAMESPACE_END // Crypto++
ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::rotlFixed;
using CryptoPP::rotlConstant;
using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;
using CryptoPP::ConditionalByteReverse;
using CryptoPP::LITTLE_ENDIAN_ORDER;
typedef byte lsh_u8;
typedef word32 lsh_u32;
typedef word32 lsh_uint;
typedef word32 lsh_err;
typedef word32 lsh_type;
using CryptoPP::LSH::LSH256_IV224;
using CryptoPP::LSH::LSH256_IV256;
using CryptoPP::LSH::LSH256_StepConstants;
struct LSH256_AVX2_Context
{
LSH256_AVX2_Context(word32* state, word32 algType, word32& remainingBitLength) :
cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
last_block(reinterpret_cast<byte*>(state+48)),
remain_databitlen(remainingBitLength),
alg_type(static_cast<lsh_type>(algType)) {}
lsh_u32* cv_l; // start of our state block
lsh_u32* cv_r;
lsh_u32* sub_msgs;
lsh_u8* last_block;
lsh_u32& remain_databitlen;
lsh_type alg_type;
};
struct LSH256_AVX2_Internal
{
LSH256_AVX2_Internal(word32* state) :
submsg_e_l(state+16), submsg_e_r(state+24),
submsg_o_l(state+32), submsg_o_r(state+40) { }
lsh_u32* submsg_e_l; /* even left sub-message */
lsh_u32* submsg_e_r; /* even right sub-message */
lsh_u32* submsg_o_l; /* odd left sub-message */
lsh_u32* submsg_o_r; /* odd right sub-message */
};
// Zero the upper 128 bits of all YMM registers on exit.
// It avoids AVX state transition penalties when saving state.
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735
// makes using zeroupper a little tricky.
struct AVX_Cleanup
{
~AVX_Cleanup() {
_mm256_zeroupper();
}
};
// const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };
/* LSH AlgType Macro */
inline bool LSH_IS_LSH512(lsh_uint val) {
return (val & 0xf0000) == 0;
}
inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
return val >> 24;
}
inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
return val & 0xffff;
}
inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
}
inline lsh_u32 loadLE32(lsh_u32 v) {
return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
}
lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) {
return rotlFixed(x, r);
}
// Original code relied upon unaligned lsh_u32 buffer
inline void load_msg_blk(LSH256_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN])
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u32* submsg_e_l = i_state->submsg_e_l;
lsh_u32* submsg_e_r = i_state->submsg_e_r;
lsh_u32* submsg_o_l = i_state->submsg_o_l;
lsh_u32* submsg_o_r = i_state->submsg_o_r;
_mm256_storeu_si256(M256_CAST(submsg_e_l+0),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+0)));
_mm256_storeu_si256(M256_CAST(submsg_e_r+0),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+32)));
_mm256_storeu_si256(M256_CAST(submsg_o_l+0),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+64)));
_mm256_storeu_si256(M256_CAST(submsg_o_r+0),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+96)));
}
inline void msg_exp_even(LSH256_AVX2_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u32* submsg_e_l = i_state->submsg_e_l;
lsh_u32* submsg_e_r = i_state->submsg_e_r;
lsh_u32* submsg_o_l = i_state->submsg_o_l;
lsh_u32* submsg_o_r = i_state->submsg_o_r;
const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514,
0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c);
_mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi32(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
_mm256_shuffle_epi8(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), mask)));
_mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi32(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
_mm256_shuffle_epi8(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), mask)));
}
inline void msg_exp_odd(LSH256_AVX2_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u32* submsg_e_l = i_state->submsg_e_l;
lsh_u32* submsg_e_r = i_state->submsg_e_r;
lsh_u32* submsg_o_l = i_state->submsg_o_l;
lsh_u32* submsg_o_r = i_state->submsg_o_r;
const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514,
0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c);
_mm256_storeu_si256(M256_CAST(submsg_o_l+0), _mm256_add_epi32(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
_mm256_shuffle_epi8(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), mask)));
_mm256_storeu_si256(M256_CAST(submsg_o_r+0), _mm256_add_epi32(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
_mm256_shuffle_epi8(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), mask)));
}
inline void load_sc(const lsh_u32** p_const_v, size_t i)
{
CRYPTOPP_ASSERT(p_const_v != NULLPTR);
*p_const_v = &LSH256_StepConstants[i];
}
inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_AVX2_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u32* submsg_e_l = i_state->submsg_e_l;
lsh_u32* submsg_e_r = i_state->submsg_e_r;
_mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_l+0)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0))));
_mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0))));
}
inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_AVX2_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u32* submsg_o_l = i_state->submsg_o_l;
lsh_u32* submsg_o_r = i_state->submsg_o_r;
_mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_l)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_l))));
_mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_r)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_r))));
}
inline void add_blk(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
_mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi32(
_mm256_loadu_si256(CONST_M256_CAST(cv_l)),
_mm256_loadu_si256(CONST_M256_CAST(cv_r))));
}
template <unsigned int R>
inline void rotate_blk(lsh_u32 cv[8])
{
_mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
_mm256_slli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
_mm256_srli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), 32-R)));
}
inline void xor_with_const(lsh_u32 cv_l[8], const lsh_u32 const_v[8])
{
_mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_l)),
_mm256_loadu_si256(CONST_M256_CAST(const_v))));
}
inline void rotate_msg_gamma(lsh_u32 cv_r[8])
{
// g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };
_mm256_storeu_si256(M256_CAST(cv_r+0),
_mm256_shuffle_epi8(_mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
_mm256_set_epi8(
/* hi lane */ 15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1,
/* lo lane */ 12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0)));
}
inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
__m256i temp = _mm256_shuffle_epi32(
_mm256_loadu_si256(CONST_M256_CAST(cv_l)), _MM_SHUFFLE(3,1,0,2));
_mm256_storeu_si256(M256_CAST(cv_r),
_mm256_shuffle_epi32(
_mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(1,2,3,0)));
_mm256_storeu_si256(M256_CAST(cv_l),
_mm256_permute2x128_si256(temp,
_mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,3,0,1)));
_mm256_storeu_si256(M256_CAST(cv_r),
_mm256_permute2x128_si256(temp,
_mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,2,0,0)));
};
/* -------------------------------------------------------- *
* step function
* -------------------------------------------------------- */
template <unsigned int Alpha, unsigned int Beta>
inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8])
{
add_blk(cv_l, cv_r);
rotate_blk<Alpha>(cv_l);
xor_with_const(cv_l, const_v);
add_blk(cv_r, cv_l);
rotate_blk<Beta>(cv_r);
add_blk(cv_l, cv_r);
rotate_msg_gamma(cv_r);
}
/* -------------------------------------------------------- *
* compression function
* -------------------------------------------------------- */
inline void compress(LSH256_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_BYTE_LEN])
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
LSH256_AVX2_Internal s_state(ctx->cv_l);
LSH256_AVX2_Internal* i_state = &s_state;
const lsh_u32* const_v = NULL;
lsh_u32* cv_l = ctx->cv_l;
lsh_u32* cv_r = ctx->cv_r;
load_msg_blk(i_state, pdMsgBlk);
msg_add_even(cv_l, cv_r, i_state);
load_sc(&const_v, 0);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
msg_add_odd(cv_l, cv_r, i_state);
load_sc(&const_v, 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
for (size_t i = 1; i < NUM_STEPS / 2; i++)
{
msg_exp_even(i_state);
msg_add_even(cv_l, cv_r, i_state);
load_sc(&const_v, 16 * i);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
msg_exp_odd(i_state);
msg_add_odd(cv_l, cv_r, i_state);
load_sc(&const_v, 16 * i + 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
}
msg_exp_even(i_state);
msg_add_even(cv_l, cv_r, i_state);
}
/* -------------------------------------------------------- */
inline void load_iv(word32 cv_l[8], word32 cv_r[8], const word32 iv[16])
{
// The IV's are 32-byte aligned so we can use aligned loads.
_mm256_storeu_si256(M256_CAST(cv_l+0),
_mm256_load_si256(CONST_M256_CAST(iv+0)));
_mm256_storeu_si256(M256_CAST(cv_r+0),
_mm256_load_si256(CONST_M256_CAST(iv+8)));
}
inline void zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
_mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256());
_mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256());
}
inline void zero_submsgs(LSH256_AVX2_Context* ctx)
{
lsh_u32* sub_msgs = ctx->sub_msgs;
_mm256_storeu_si256(M256_CAST(sub_msgs+ 0), _mm256_setzero_si256());
_mm256_storeu_si256(M256_CAST(sub_msgs+ 8), _mm256_setzero_si256());
_mm256_storeu_si256(M256_CAST(sub_msgs+16), _mm256_setzero_si256());
_mm256_storeu_si256(M256_CAST(sub_msgs+24), _mm256_setzero_si256());
}
inline void init224(LSH256_AVX2_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV224);
}
inline void init256(LSH256_AVX2_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV256);
}
/* -------------------------------------------------------- */
inline void fin(LSH256_AVX2_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
_mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)),
_mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0))));
}
/* -------------------------------------------------------- */
inline void get_hash(LSH256_AVX2_Context* ctx, lsh_u8* pbHashVal)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
CRYPTOPP_ASSERT(pbHashVal != NULLPTR);
lsh_uint alg_type = ctx->alg_type;
lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);
// Multiplying by sizeof(lsh_u8) looks odd...
memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
if (hash_val_bit_len){
pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
}
}
/* -------------------------------------------------------- */
lsh_err lsh256_init_avx2(LSH256_AVX2_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
lsh_u32 alg_type = ctx->alg_type;
const lsh_u32* const_v = NULL;
ctx->remain_databitlen = 0;
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
AVX_Cleanup cleanup;
switch (alg_type)
{
case LSH_TYPE_256_256:
init256(ctx);
return LSH_SUCCESS;
case LSH_TYPE_256_224:
init224(ctx);
return LSH_SUCCESS;
default:
break;
}
lsh_u32* cv_l = ctx->cv_l;
lsh_u32* cv_r = ctx->cv_r;
zero_iv(cv_l, cv_r);
cv_l[0] = LSH256_HASH_VAL_MAX_BYTE_LEN;
cv_l[1] = LSH_GET_HASHBIT(alg_type);
for (size_t i = 0; i < NUM_STEPS / 2; i++)
{
//Mix
load_sc(&const_v, i * 16);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
load_sc(&const_v, i * 16 + 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
}
return LSH_SUCCESS;
}
lsh_err lsh256_update_avx2(LSH256_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(data != NULLPTR);
CRYPTOPP_ASSERT(databitlen % 8 == 0);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
AVX_Cleanup cleanup;
if (databitlen == 0){
return LSH_SUCCESS;
}
// We are byte oriented. tail bits will always be 0.
size_t databytelen = databitlen >> 3;
// lsh_uint pos2 = databitlen & 0x7;
const size_t pos2 = 0;
size_t remain_msg_byte = ctx->remain_databitlen >> 3;
// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
const size_t remain_msg_bit = 0;
if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
return LSH_ERR_INVALID_STATE;
}
if (remain_msg_bit > 0){
return LSH_ERR_INVALID_DATABITLEN;
}
if (databytelen + remain_msg_byte < LSH256_MSG_BLK_BYTE_LEN)
{
memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
ctx->remain_databitlen += (lsh_uint)databitlen;
remain_msg_byte += (lsh_uint)databytelen;
if (pos2){
ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
}
return LSH_SUCCESS;
}
if (remain_msg_byte > 0){
size_t more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte;
memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
compress(ctx, ctx->last_block);
data += more_byte;
databytelen -= more_byte;
remain_msg_byte = 0;
ctx->remain_databitlen = 0;
}
while (databytelen >= LSH256_MSG_BLK_BYTE_LEN)
{
// This call to compress caused some trouble.
// The data pointer can become unaligned in the
// previous block.
compress(ctx, data);
data += LSH256_MSG_BLK_BYTE_LEN;
databytelen -= LSH256_MSG_BLK_BYTE_LEN;
}
if (databytelen > 0){
memcpy(ctx->last_block, data, databytelen);
ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
}
if (pos2){
ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
ctx->remain_databitlen += pos2;
}
return LSH_SUCCESS;
}
lsh_err lsh256_final_avx2(LSH256_AVX2_Context* ctx, lsh_u8* hashval)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(hashval != NULLPTR);
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
AVX_Cleanup cleanup;
// We are byte oriented. tail bits will always be 0.
size_t remain_msg_byte = ctx->remain_databitlen >> 3;
// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
const size_t remain_msg_bit = 0;
if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
return LSH_ERR_INVALID_STATE;
}
if (remain_msg_bit){
ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
}
else{
ctx->last_block[remain_msg_byte] = 0x80;
}
memset(ctx->last_block + remain_msg_byte + 1, 0, LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
compress(ctx, ctx->last_block);
fin(ctx);
get_hash(ctx, hashval);
return LSH_SUCCESS;
}
ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
extern
void LSH256_Base_Restart_AVX2(word32* state)
{
state[RemainingBits] = 0;
LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh256_init_avx2(&ctx);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_init_avx2 failed");
}
extern
void LSH256_Base_Update_AVX2(word32* state, const byte *input, size_t size)
{
LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh256_update_avx2(&ctx, input, 8*size);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_update_avx2 failed");
}
extern
void LSH256_Base_TruncatedFinal_AVX2(word32* state, byte *hash, size_t)
{
LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh256_final_avx2(&ctx, hash);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_final_avx2 failed");
}
NAMESPACE_END
#endif // CRYPTOPP_AVX2_AVAILABLE

lsh256_sse.cpp (new file, 709 lines)

@ -0,0 +1,709 @@
// lsh.cpp - written and placed in the public domain by Jeffrey Walton
// Based on the specification and source code provided by
// Korea Internet & Security Agency (KISA) website. Also
// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
// We are hitting some sort of GCC bug in the LSH AVX2 code path.
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
// makes using zeroupper a little tricky.
#include "pch.h"
#include "config.h"
#include "lsh.h"
#include "cpu.h"
#include "misc.h"
#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <tmmintrin.h>
#endif
#if defined(CRYPTOPP_XOP_AVAILABLE)
# include <ammintrin.h>
#endif
#if defined(__GNUC__) && defined(__amd64__)
# include <x86intrin.h>
#endif
ANONYMOUS_NAMESPACE_BEGIN
/* LSH Constants */
const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128;
// const unsigned int LSH256_MSG_BLK_BIT_LEN = 1024;
// const unsigned int LSH256_CV_BYTE_LEN = 64;
const unsigned int LSH256_HASH_VAL_MAX_BYTE_LEN = 32;
// const unsigned int MSG_BLK_WORD_LEN = 32;
const unsigned int CV_WORD_LEN = 16;
const unsigned int CONST_WORD_LEN = 8;
const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
// const unsigned int WORD_BIT_LEN = 32;
const unsigned int NUM_STEPS = 26;
const unsigned int ROT_EVEN_ALPHA = 29;
const unsigned int ROT_EVEN_BETA = 1;
const unsigned int ROT_ODD_ALPHA = 5;
const unsigned int ROT_ODD_BETA = 17;
const unsigned int LSH_TYPE_256_256 = 0x0000020;
const unsigned int LSH_TYPE_256_224 = 0x000001C;
// const unsigned int LSH_TYPE_224 = LSH_TYPE_256_224;
// const unsigned int LSH_TYPE_256 = LSH_TYPE_256_256;
/* Error Code */
const unsigned int LSH_SUCCESS = 0x0;
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;
/* Index into our state array */
const unsigned int AlgorithmType = 80;
const unsigned int RemainingBits = 81;
NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(LSH)
// lsh256.cpp
extern const word32 LSH256_IV224[CV_WORD_LEN];
extern const word32 LSH256_IV256[CV_WORD_LEN];
extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS];
NAMESPACE_END // LSH
NAMESPACE_END // Crypto++
ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::rotlFixed;
using CryptoPP::rotlConstant;
using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;
using CryptoPP::ConditionalByteReverse;
using CryptoPP::LITTLE_ENDIAN_ORDER;
typedef byte lsh_u8;
typedef word32 lsh_u32;
typedef word32 lsh_uint;
typedef word32 lsh_err;
typedef word32 lsh_type;
using CryptoPP::LSH::LSH256_IV224;
using CryptoPP::LSH::LSH256_IV256;
using CryptoPP::LSH::LSH256_StepConstants;
struct LSH256_SSSE3_Context
{
LSH256_SSSE3_Context(word32* state, word32 algType, word32& remainingBitLength) :
cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
last_block(reinterpret_cast<byte*>(state+48)),
remain_databitlen(remainingBitLength),
alg_type(static_cast<lsh_type>(algType)) {}
lsh_u32* cv_l; // start of our state block
lsh_u32* cv_r;
lsh_u32* sub_msgs;
lsh_u8* last_block;
lsh_u32& remain_databitlen;
lsh_type alg_type;
};
struct LSH256_SSSE3_Internal
{
LSH256_SSSE3_Internal(word32* state) :
submsg_e_l(state+16), submsg_e_r(state+24),
submsg_o_l(state+32), submsg_o_r(state+40) { }
lsh_u32* submsg_e_l; /* even left sub-message */
lsh_u32* submsg_e_r; /* even right sub-message */
lsh_u32* submsg_o_l; /* odd left sub-message */
lsh_u32* submsg_o_r; /* odd right sub-message */
};
const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };
/* LSH AlgType Macro */
inline bool LSH_IS_LSH512(lsh_uint val) {
return (val & 0xf0000) == 0;
}
inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
return val >> 24;
}
inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
return val & 0xffff;
}
inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
}
inline lsh_u32 loadLE32(lsh_u32 v) {
return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
}
lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) {
return rotlFixed(x, r);
}
// Original code relied upon unaligned lsh_u32 buffer
inline void load_msg_blk(LSH256_SSSE3_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN])
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u32* submsg_e_l = i_state->submsg_e_l;
lsh_u32* submsg_e_r = i_state->submsg_e_r;
lsh_u32* submsg_o_l = i_state->submsg_o_l;
lsh_u32* submsg_o_r = i_state->submsg_o_r;
_mm_storeu_si128(M128_CAST(submsg_e_l+0),
_mm_loadu_si128(CONST_M128_CAST(msgblk+0)));
_mm_storeu_si128(M128_CAST(submsg_e_l+4),
_mm_loadu_si128(CONST_M128_CAST(msgblk+16)));
_mm_storeu_si128(M128_CAST(submsg_e_r+0),
_mm_loadu_si128(CONST_M128_CAST(msgblk+32)));
_mm_storeu_si128(M128_CAST(submsg_e_r+4),
_mm_loadu_si128(CONST_M128_CAST(msgblk+48)));
_mm_storeu_si128(M128_CAST(submsg_o_l+0),
_mm_loadu_si128(CONST_M128_CAST(msgblk+64)));
_mm_storeu_si128(M128_CAST(submsg_o_l+4),
_mm_loadu_si128(CONST_M128_CAST(msgblk+80)));
_mm_storeu_si128(M128_CAST(submsg_o_r+0),
_mm_loadu_si128(CONST_M128_CAST(msgblk+96)));
_mm_storeu_si128(M128_CAST(submsg_o_r+4),
_mm_loadu_si128(CONST_M128_CAST(msgblk+112)));
}
inline void msg_exp_even(LSH256_SSSE3_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u32* submsg_e_l = i_state->submsg_e_l;
lsh_u32* submsg_e_r = i_state->submsg_e_r;
lsh_u32* submsg_o_l = i_state->submsg_o_l;
lsh_u32* submsg_o_r = i_state->submsg_o_r;
_mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi32(
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(3,2,1,0)),
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(1,0,2,3))));
_mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_add_epi32(
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), _MM_SHUFFLE(3,2,1,0)),
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), _MM_SHUFFLE(2,1,0,3))));
_mm_storeu_si128(M128_CAST(submsg_e_r+0), _mm_add_epi32(
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), _MM_SHUFFLE(3,2,1,0)),
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), _MM_SHUFFLE(1,0,2,3))));
_mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_add_epi32(
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), _MM_SHUFFLE(3,2,1,0)),
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), _MM_SHUFFLE(2,1,0,3))));
}
inline void msg_exp_odd(LSH256_SSSE3_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u32* submsg_e_l = i_state->submsg_e_l;
lsh_u32* submsg_e_r = i_state->submsg_e_r;
lsh_u32* submsg_o_l = i_state->submsg_o_l;
lsh_u32* submsg_o_r = i_state->submsg_o_r;
_mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi32(
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(3,2,1,0)),
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(1,0,2,3))));
_mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_add_epi32(
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), _MM_SHUFFLE(3,2,1,0)),
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), _MM_SHUFFLE(2,1,0,3))));
_mm_storeu_si128(M128_CAST(submsg_o_r+0), _mm_add_epi32(
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), _MM_SHUFFLE(3,2,1,0)),
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), _MM_SHUFFLE(1,0,2,3))));
_mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_add_epi32(
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), _MM_SHUFFLE(3,2,1,0)),
_mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), _MM_SHUFFLE(2,1,0,3))));
}
inline void load_sc(const lsh_u32** p_const_v, size_t i)
{
CRYPTOPP_ASSERT(p_const_v != NULLPTR);
*p_const_v = &LSH256_StepConstants[i];
}
inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_SSSE3_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u32* submsg_e_l = i_state->submsg_e_l;
lsh_u32* submsg_e_r = i_state->submsg_e_r;
_mm_storeu_si128(M128_CAST(cv_l+0), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+0)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0))));
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4))));
_mm_storeu_si128(M128_CAST(cv_r+0), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0))));
_mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4))));
}
inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_SSSE3_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u32* submsg_o_l = i_state->submsg_o_l;
lsh_u32* submsg_o_r = i_state->submsg_o_r;
_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l))));
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4))));
_mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r))));
_mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4))));
}
inline void add_blk(lsh_u32 cv_l[8], const lsh_u32 cv_r[8])
{
_mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi32(
_mm_loadu_si128(CONST_M128_CAST(cv_l)),
_mm_loadu_si128(CONST_M128_CAST(cv_r))));
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi32(
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
_mm_loadu_si128(CONST_M128_CAST(cv_r+4))));
}
template <unsigned int R>
inline void rotate_blk(lsh_u32 cv[8])
{
#if defined(CRYPTOPP_XOP_AVAILABLE)
_mm_storeu_si128(M128_CAST(cv),
_mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R));
_mm_storeu_si128(M128_CAST(cv+4),
_mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R));
#else
_mm_storeu_si128(M128_CAST(cv), _mm_or_si128(
_mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R),
_mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), 32-R)));
_mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128(
_mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R),
_mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), 32-R)));
#endif
}
inline void xor_with_const(lsh_u32* cv_l, const lsh_u32* const_v)
{
_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l)),
_mm_loadu_si128(CONST_M128_CAST(const_v))));
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
_mm_loadu_si128(CONST_M128_CAST(const_v+4))));
}
inline void rotate_msg_gamma(lsh_u32 cv_r[8])
{
// g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };
_mm_storeu_si128(M128_CAST(cv_r+0),
_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
_mm_set_epi8(12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0)));
_mm_storeu_si128(M128_CAST(cv_r+4),
_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
_mm_set_epi8(15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1)));
}
inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
_mm_storeu_si128(M128_CAST(cv_l+0), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2)));
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2)));
_mm_storeu_si128(M128_CAST(cv_r+0), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0)));
_mm_storeu_si128(M128_CAST(cv_r+4), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0)));
__m128i temp = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
_mm_storeu_si128(M128_CAST(cv_l+0),
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)));
_mm_storeu_si128(M128_CAST(cv_l+4),
_mm_loadu_si128(CONST_M128_CAST(cv_r+4)));
_mm_storeu_si128(M128_CAST(cv_r+4),
_mm_loadu_si128(CONST_M128_CAST(cv_r+0)));
_mm_storeu_si128(M128_CAST(cv_r+0), temp);
};
/* -------------------------------------------------------- *
* step function
* -------------------------------------------------------- */
template <unsigned int Alpha, unsigned int Beta>
inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8])
{
add_blk(cv_l, cv_r);
rotate_blk<Alpha>(cv_l);
xor_with_const(cv_l, const_v);
add_blk(cv_r, cv_l);
rotate_blk<Beta>(cv_r);
add_blk(cv_l, cv_r);
rotate_msg_gamma(cv_r);
}
/* -------------------------------------------------------- *
* compression function
* -------------------------------------------------------- */
inline void compress(LSH256_SSSE3_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_BYTE_LEN])
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
LSH256_SSSE3_Internal s_state(ctx->cv_l);
LSH256_SSSE3_Internal* i_state = &s_state;
const lsh_u32* const_v = NULL;
lsh_u32* cv_l = ctx->cv_l;
lsh_u32* cv_r = ctx->cv_r;
load_msg_blk(i_state, pdMsgBlk);
msg_add_even(cv_l, cv_r, i_state);
load_sc(&const_v, 0);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
msg_add_odd(cv_l, cv_r, i_state);
load_sc(&const_v, 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
for (size_t i = 1; i < NUM_STEPS / 2; i++)
{
msg_exp_even(i_state);
msg_add_even(cv_l, cv_r, i_state);
load_sc(&const_v, 16 * i);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
msg_exp_odd(i_state);
msg_add_odd(cv_l, cv_r, i_state);
load_sc(&const_v, 16 * i + 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
}
msg_exp_even(i_state);
msg_add_even(cv_l, cv_r, i_state);
}
/* -------------------------------------------------------- */
inline void load_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 iv[16])
{
_mm_storeu_si128(M128_CAST(cv_l+ 0),
_mm_load_si128(CONST_M128_CAST(iv+ 0)));
_mm_storeu_si128(M128_CAST(cv_l+ 4),
_mm_load_si128(CONST_M128_CAST(iv+ 4)));
_mm_storeu_si128(M128_CAST(cv_r+ 0),
_mm_load_si128(CONST_M128_CAST(iv+ 8)));
_mm_storeu_si128(M128_CAST(cv_r+ 4),
_mm_load_si128(CONST_M128_CAST(iv+12)));
}
inline void zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
_mm_storeu_si128(M128_CAST(cv_l+0), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(cv_r+0), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(cv_r+4), _mm_setzero_si128());
}
inline void zero_submsgs(LSH256_SSSE3_Context* ctx)
{
lsh_u32* sub_msgs = ctx->sub_msgs;
_mm_storeu_si128(M128_CAST(sub_msgs+ 0), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+ 4), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+ 8), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+12), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+16), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+20), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+24), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+28), _mm_setzero_si128());
}
inline void init224(LSH256_SSSE3_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV224);
}
inline void init256(LSH256_SSSE3_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV256);
}
/* -------------------------------------------------------- */
inline void fin(LSH256_SSSE3_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
_mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)),
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0))));
_mm_storeu_si128(M128_CAST(ctx->cv_l+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+4)),
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+4))));
}
/* -------------------------------------------------------- */
inline void get_hash(LSH256_SSSE3_Context* ctx, lsh_u8* pbHashVal)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
CRYPTOPP_ASSERT(pbHashVal != NULLPTR);
lsh_uint alg_type = ctx->alg_type;
lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);
// Multiplying by sizeof(lsh_u8) looks odd...
memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
if (hash_val_bit_len){
pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
}
}
/* -------------------------------------------------------- */
lsh_err lsh256_ssse3_init(LSH256_SSSE3_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
lsh_u32 alg_type = ctx->alg_type;
const lsh_u32* const_v = NULL;
ctx->remain_databitlen = 0;
switch (alg_type)
{
case LSH_TYPE_256_256:
init256(ctx);
return LSH_SUCCESS;
case LSH_TYPE_256_224:
init224(ctx);
return LSH_SUCCESS;
default:
break;
}
lsh_u32* cv_l = ctx->cv_l;
lsh_u32* cv_r = ctx->cv_r;
zero_iv(cv_l, cv_r);
cv_l[0] = LSH256_HASH_VAL_MAX_BYTE_LEN;
cv_l[1] = LSH_GET_HASHBIT(alg_type);
for (size_t i = 0; i < NUM_STEPS / 2; i++)
{
//Mix
load_sc(&const_v, i * 16);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
load_sc(&const_v, i * 16 + 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
}
return LSH_SUCCESS;
}
lsh_err lsh256_ssse3_update(LSH256_SSSE3_Context* ctx, const lsh_u8* data, size_t databitlen)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(data != NULLPTR);
CRYPTOPP_ASSERT(databitlen % 8 == 0);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
if (databitlen == 0){
return LSH_SUCCESS;
}
// We are byte oriented. tail bits will always be 0.
size_t databytelen = databitlen >> 3;
// lsh_uint pos2 = databitlen & 0x7;
const size_t pos2 = 0;
size_t remain_msg_byte = ctx->remain_databitlen >> 3;
// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
const size_t remain_msg_bit = 0;
if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
return LSH_ERR_INVALID_STATE;
}
if (remain_msg_bit > 0){
return LSH_ERR_INVALID_DATABITLEN;
}
if (databytelen + remain_msg_byte < LSH256_MSG_BLK_BYTE_LEN)
{
memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
ctx->remain_databitlen += (lsh_uint)databitlen;
remain_msg_byte += (lsh_uint)databytelen;
if (pos2){
ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
}
return LSH_SUCCESS;
}
if (remain_msg_byte > 0){
size_t more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte;
memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
compress(ctx, ctx->last_block);
data += more_byte;
databytelen -= more_byte;
remain_msg_byte = 0;
ctx->remain_databitlen = 0;
}
while (databytelen >= LSH256_MSG_BLK_BYTE_LEN)
{
// This call to compress caused some trouble.
// The data pointer can become unaligned in the
// previous block.
compress(ctx, data);
data += LSH256_MSG_BLK_BYTE_LEN;
databytelen -= LSH256_MSG_BLK_BYTE_LEN;
}
if (databytelen > 0){
memcpy(ctx->last_block, data, databytelen);
ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
}
if (pos2){
ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
ctx->remain_databitlen += pos2;
}
return LSH_SUCCESS;
}
lsh_err lsh256_ssse3_final(LSH256_SSSE3_Context* ctx, lsh_u8* hashval)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(hashval != NULLPTR);
	// We are byte-oriented. Tail bits will always be 0.
size_t remain_msg_byte = ctx->remain_databitlen >> 3;
// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
const size_t remain_msg_bit = 0;
if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
return LSH_ERR_INVALID_STATE;
}
if (remain_msg_bit){
ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
}
else{
ctx->last_block[remain_msg_byte] = 0x80;
}
memset(ctx->last_block + remain_msg_byte + 1, 0, LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
compress(ctx, ctx->last_block);
fin(ctx);
get_hash(ctx, hashval);
return LSH_SUCCESS;
}
ANONYMOUS_NAMESPACE_END // Anonymous
NAMESPACE_BEGIN(CryptoPP)
extern
void LSH256_Base_Restart_SSSE3(word32* state)
{
state[RemainingBits] = 0;
LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh256_ssse3_init(&ctx);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_init failed");
}
extern
void LSH256_Base_Update_SSSE3(word32* state, const byte *input, size_t size)
{
LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh256_ssse3_update(&ctx, input, 8*size);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_update failed");
}
extern
void LSH256_Base_TruncatedFinal_SSSE3(word32* state, byte *hash, size_t)
{
LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh256_ssse3_final(&ctx, hash);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_final failed");
}
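// Editor's note, not part of the original diff: lsh256.cpp is expected to
// select these SSSE3 wrappers at runtime. A minimal sketch of that dispatch,
// assuming the existing Crypto++ feature probe HasSSSE3() and a hypothetical
// C++ fallback named LSH256_Base_Update_CXX (illustrative name only):
//
//   if (HasSSSE3())
//       LSH256_Base_Update_SSSE3(state, input, size);
//   else
//       LSH256_Base_Update_CXX(state, input, size);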
NAMESPACE_END
#endif // CRYPTOPP_SSSE3_AVAILABLE

1100
lsh512.cpp

File diff suppressed because it is too large

762
lsh512_avx.cpp Normal file
View File

@ -0,0 +1,762 @@
// lsh512_avx.cpp - written and placed in the public domain by Jeffrey Walton
// Based on the specification and source code provided by
// Korea Internet & Security Agency (KISA) website. Also
// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
// We are hitting some sort of GCC bug in the LSH AVX2 code path.
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
// makes using zeroupper a little tricky.
#include "pch.h"
#include "config.h"
#include "lsh.h"
#include "misc.h"
#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
#endif
#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <immintrin.h>
#endif
#if defined(__GNUC__) && defined(__amd64__)
# include <x86intrin.h>
#endif
ANONYMOUS_NAMESPACE_BEGIN
/* LSH Constants */
const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256;
// const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048;
// const unsigned int LSH512_CV_BYTE_LEN = 128;
const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64;
// const unsigned int MSG_BLK_WORD_LEN = 32;
const unsigned int CV_WORD_LEN = 16;
const unsigned int CONST_WORD_LEN = 8;
const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
const unsigned int NUM_STEPS = 28;
const unsigned int ROT_EVEN_ALPHA = 23;
const unsigned int ROT_EVEN_BETA = 59;
const unsigned int ROT_ODD_ALPHA = 7;
const unsigned int ROT_ODD_BETA = 3;
const unsigned int LSH_TYPE_512_512 = 0x0010040;
const unsigned int LSH_TYPE_512_384 = 0x0010030;
const unsigned int LSH_TYPE_512_256 = 0x0010020;
const unsigned int LSH_TYPE_512_224 = 0x001001C;
// const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384;
// const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512;
/* Error Code */
const unsigned int LSH_SUCCESS = 0x0;
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;
/* Index into our state array */
const unsigned int AlgorithmType = 80;
const unsigned int RemainingBits = 81;
NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(LSH)
// lsh512.cpp
extern const word64 LSH512_IV224[CV_WORD_LEN];
extern const word64 LSH512_IV256[CV_WORD_LEN];
extern const word64 LSH512_IV384[CV_WORD_LEN];
extern const word64 LSH512_IV512[CV_WORD_LEN];
extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS];
NAMESPACE_END // LSH
NAMESPACE_END // Crypto++
ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::rotlFixed;
using CryptoPP::rotlConstant;
using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;
using CryptoPP::ConditionalByteReverse;
using CryptoPP::LITTLE_ENDIAN_ORDER;
using CryptoPP::LSH::LSH512_IV224;
using CryptoPP::LSH::LSH512_IV256;
using CryptoPP::LSH::LSH512_IV384;
using CryptoPP::LSH::LSH512_IV512;
using CryptoPP::LSH::LSH512_StepConstants;
typedef byte lsh_u8;
typedef word32 lsh_u32;
typedef word64 lsh_u64;
typedef word32 lsh_uint;
typedef word32 lsh_err;
typedef word32 lsh_type;
struct LSH512_AVX2_Context
{
LSH512_AVX2_Context(word64* state, word64 algType, word64& remainingBitLength) :
cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
last_block(reinterpret_cast<byte*>(state+48)),
remain_databitlen(remainingBitLength),
alg_type(static_cast<lsh_type>(algType)) {}
lsh_u64* cv_l; // start of our state block
lsh_u64* cv_r;
lsh_u64* sub_msgs;
lsh_u8* last_block;
lsh_u64& remain_databitlen;
lsh_type alg_type;
};
struct LSH512_AVX2_Internal
{
LSH512_AVX2_Internal(word64* state) :
submsg_e_l(state+16), submsg_e_r(state+24),
submsg_o_l(state+32), submsg_o_r(state+40) { }
lsh_u64* submsg_e_l; /* even left sub-message */
lsh_u64* submsg_e_r; /* even right sub-message */
lsh_u64* submsg_o_l; /* odd left sub-message */
lsh_u64* submsg_o_r; /* odd right sub-message */
};
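// Editor's note (layout recap derived from the two constructors above and the
// index constants AlgorithmType/RemainingBits): the word64 state array is
// used as
//   state[ 0.. 7]  cv_l
//   state[ 8..15]  cv_r
//   state[16..47]  sub_msgs (even/odd, left/right sub-messages, 8 words each)
//   state[48..79]  last_block (256 bytes, viewed as bytes)
//   state[80]      AlgorithmType
//   state[81]      RemainingBits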
// Zero the upper 128 bits of all YMM registers on exit.
// It avoids AVX state transition penalties when saving state.
// GCC Issue 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735,
// makes using zeroupper a little tricky.
struct AVX_Cleanup
{
~AVX_Cleanup() {
_mm256_zeroupper();
}
};
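// Editor's note, illustrative only: AVX_Cleanup is used as a local RAII
// object so _mm256_zeroupper() runs on every return path, including early
// returns. A sketch of the pattern used by the routines below, where
// "some_avx2_routine" is a placeholder name and not a function in this diff:
//
//   lsh_err some_avx2_routine(LSH512_AVX2_Context* ctx)
//   {
//       AVX_Cleanup cleanup;   // vzeroupper on scope exit
//       // ... AVX2 work on ymm registers ...
//       return LSH_SUCCESS;    // upper ymm lanes zeroed here
//   }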
// const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
/* LSH AlgType Macro */
inline bool LSH_IS_LSH512(lsh_uint val) {
return (val & 0xf0000) == 0x10000;
}
inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
return val >> 24;
}
inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
return val & 0xffff;
}
inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
}
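// Editor's worked example (derived from the constants and helpers above):
// decoding LSH_TYPE_512_224 = 0x001001C gives
//   LSH_IS_LSH512(0x001001C)          -> (0x001001C & 0xf0000) == 0x10000, true
//   LSH_GET_HASHBYTE(0x001001C)       -> 0x1C = 28 bytes
//   LSH_GET_SMALL_HASHBIT(0x001001C)  -> 0, no partial byte
//   LSH_GET_HASHBIT(0x001001C)        -> 28*8 - 0 = 224 bits
// which matches the LSH-512-224 digest length.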
inline lsh_u64 loadLE64(lsh_u64 v) {
return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
}
lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) {
return rotlFixed(x, r);
}
// Original code relied upon unaligned lsh_u64 buffer
inline void load_msg_blk(LSH512_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN])
{
lsh_u64* submsg_e_l = i_state->submsg_e_l;
lsh_u64* submsg_e_r = i_state->submsg_e_r;
lsh_u64* submsg_o_l = i_state->submsg_o_l;
lsh_u64* submsg_o_r = i_state->submsg_o_r;
_mm256_storeu_si256(M256_CAST(submsg_e_l+0),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+0)));
_mm256_storeu_si256(M256_CAST(submsg_e_l+4),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+32)));
_mm256_storeu_si256(M256_CAST(submsg_e_r+0),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+64)));
_mm256_storeu_si256(M256_CAST(submsg_e_r+4),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+96)));
_mm256_storeu_si256(M256_CAST(submsg_o_l+0),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+128)));
_mm256_storeu_si256(M256_CAST(submsg_o_l+4),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+160)));
_mm256_storeu_si256(M256_CAST(submsg_o_r+0),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+192)));
_mm256_storeu_si256(M256_CAST(submsg_o_r+4),
_mm256_loadu_si256(CONST_M256_CAST(msgblk+224)));
}
inline void msg_exp_even(LSH512_AVX2_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u64* submsg_e_l = i_state->submsg_e_l;
lsh_u64* submsg_e_r = i_state->submsg_e_r;
lsh_u64* submsg_o_l = i_state->submsg_o_l;
lsh_u64* submsg_o_r = i_state->submsg_o_r;
_mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
_mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
_MM_SHUFFLE(1,0,2,3))));
_mm256_storeu_si256(M256_CAST(submsg_e_l+4), _mm256_add_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
_mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
_MM_SHUFFLE(2,1,0,3))));
_mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
_mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
_MM_SHUFFLE(1,0,2,3))));
_mm256_storeu_si256(M256_CAST(submsg_e_r+4), _mm256_add_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
_mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
_MM_SHUFFLE(2,1,0,3))));
}
inline void msg_exp_odd(LSH512_AVX2_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u64* submsg_e_l = i_state->submsg_e_l;
lsh_u64* submsg_e_r = i_state->submsg_e_r;
lsh_u64* submsg_o_l = i_state->submsg_o_l;
lsh_u64* submsg_o_r = i_state->submsg_o_r;
_mm256_storeu_si256(M256_CAST(submsg_o_l+0),
_mm256_add_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
_mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
_MM_SHUFFLE(1,0,2,3))));
_mm256_storeu_si256(M256_CAST(submsg_o_l+4),
_mm256_add_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
_mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
_MM_SHUFFLE(2,1,0,3))));
_mm256_storeu_si256(M256_CAST(submsg_o_r+0),
_mm256_add_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
_mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
_MM_SHUFFLE(1,0,2,3))));
_mm256_storeu_si256(M256_CAST(submsg_o_r+4),
_mm256_add_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
_mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
_MM_SHUFFLE(2,1,0,3))));
}
inline void load_sc(const lsh_u64** p_const_v, size_t i)
{
*p_const_v = &LSH512_StepConstants[i];
}
inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u64* submsg_e_l = i_state->submsg_e_l;
lsh_u64* submsg_e_r = i_state->submsg_e_r;
_mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_l)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_l))));
_mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_r)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_r))));
_mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4))));
_mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4))));
}
inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u64* submsg_o_l = i_state->submsg_o_l;
lsh_u64* submsg_o_r = i_state->submsg_o_r;
_mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_l)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_l))));
_mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_r)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_r))));
_mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4))));
_mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
_mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4))));
}
inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
_mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi64(
_mm256_loadu_si256(CONST_M256_CAST(cv_l)),
_mm256_loadu_si256(CONST_M256_CAST(cv_r))));
_mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_add_epi64(
_mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
_mm256_loadu_si256(CONST_M256_CAST(cv_r+4))));
}
template <unsigned int R>
inline void rotate_blk(lsh_u64 cv[8])
{
_mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
_mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
_mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R)));
_mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256(
_mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R),
_mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R)));
}
inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8])
{
_mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_l)),
_mm256_loadu_si256(CONST_M256_CAST(const_v))));
_mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
_mm256_loadu_si256(CONST_M256_CAST(const_v+4))));
}
inline void rotate_msg_gamma(lsh_u64 cv_r[8])
{
// g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
_mm256_storeu_si256(M256_CAST(cv_r+0),
_mm256_shuffle_epi8(
_mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
_mm256_set_epi8(
/* hi lane */ 9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4,
/* lo lane */ 13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0)));
_mm256_storeu_si256(M256_CAST(cv_r+4),
_mm256_shuffle_epi8(
_mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
_mm256_set_epi8(
/* hi lane */ 8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3,
/* lo lane */ 12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7)));
}
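// Editor's note on the shuffle above (explanatory, not from the original
// diff): every entry of g_gamma512 is a multiple of 8, so each 64-bit
// rotate-left degenerates into a byte permutation, and a single
// _mm256_shuffle_epi8 can apply a different rotation to each word. For
// example, rotl64(x, 16) moves byte i of x to byte (i+2) mod 8, i.e. the
// qword (b7 b6 b5 b4 b3 b2 b1 b0) becomes (b5 b4 b3 b2 b1 b0 b7 b6).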
inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
__m256i temp[2];
_mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2)));
_mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2)));
_mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0)));
_mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_permute4x64_epi64(
_mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0)));
temp[0] = _mm256_loadu_si256(CONST_M256_CAST(cv_l+0));
temp[1] = _mm256_loadu_si256(CONST_M256_CAST(cv_r+0));
_mm256_storeu_si256(M256_CAST(cv_l+0),
_mm256_loadu_si256(CONST_M256_CAST(cv_l+4)));
_mm256_storeu_si256(M256_CAST(cv_l+4),
_mm256_loadu_si256(CONST_M256_CAST(cv_r+4)));
_mm256_storeu_si256(M256_CAST(cv_r+0), temp[0]);
_mm256_storeu_si256(M256_CAST(cv_r+4), temp[1]);
}
/* -------------------------------------------------------- *
* step function
* -------------------------------------------------------- */
template <unsigned int Alpha, unsigned int Beta>
inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8])
{
add_blk(cv_l, cv_r);
rotate_blk<Alpha>(cv_l);
xor_with_const(cv_l, const_v);
add_blk(cv_r, cv_l);
rotate_blk<Beta>(cv_r);
add_blk(cv_l, cv_r);
rotate_msg_gamma(cv_r);
}
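// Editor's note: a scalar sketch of one mix step, equivalent to the
// vectorized sequence above (per word j in 0..7, with rotl64 the 64-bit
// rotate-left and g_gamma512 the per-word byte rotation):
//
//   cv_l[j] = rotl64(cv_l[j] + cv_r[j], Alpha) ^ const_v[j];
//   cv_r[j] = rotl64(cv_r[j] + cv_l[j], Beta);
//   cv_l[j] = cv_l[j] + cv_r[j];
//   cv_r[j] = rotl64(cv_r[j], g_gamma512[j]);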
/* -------------------------------------------------------- *
* compression function
* -------------------------------------------------------- */
inline void compress(LSH512_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN])
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
LSH512_AVX2_Internal s_state(ctx->cv_l);
LSH512_AVX2_Internal* i_state = &s_state;
const lsh_u64* const_v = NULL;
lsh_u64 *cv_l = ctx->cv_l;
lsh_u64 *cv_r = ctx->cv_r;
load_msg_blk(i_state, pdMsgBlk);
msg_add_even(cv_l, cv_r, i_state);
load_sc(&const_v, 0);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
msg_add_odd(cv_l, cv_r, i_state);
load_sc(&const_v, 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
for (size_t i = 1; i < NUM_STEPS / 2; i++)
{
msg_exp_even(i_state);
msg_add_even(cv_l, cv_r, i_state);
load_sc(&const_v, 16 * i);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
msg_exp_odd(i_state);
msg_add_odd(cv_l, cv_r, i_state);
load_sc(&const_v, 16 * i + 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
}
msg_exp_even(i_state);
msg_add_even(cv_l, cv_r, i_state);
}
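// Editor's note on the step count: the two unrolled mix/word_perm pairs
// before the loop cover steps 0 and 1, and each of the 13 loop iterations
// (i = 1 .. NUM_STEPS/2 - 1) covers two more steps, giving the full
// NUM_STEPS = 28. The trailing msg_exp_even/msg_add_even is the final
// message addition after the last step.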
/* -------------------------------------------------------- */
inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16])
{
	// The IVs are 32-byte aligned so we can use aligned loads.
_mm256_storeu_si256(M256_CAST(cv_l+0),
_mm256_load_si256(CONST_M256_CAST(iv+0)));
_mm256_storeu_si256(M256_CAST(cv_l+4),
_mm256_load_si256(CONST_M256_CAST(iv+4)));
_mm256_storeu_si256(M256_CAST(cv_r+0),
_mm256_load_si256(CONST_M256_CAST(iv+8)));
_mm256_storeu_si256(M256_CAST(cv_r+4),
_mm256_load_si256(CONST_M256_CAST(iv+12)));
}
inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
_mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256());
_mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_setzero_si256());
_mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256());
_mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_setzero_si256());
}
inline void zero_submsgs(LSH512_AVX2_Context* ctx)
{
lsh_u64* sub_msgs = ctx->sub_msgs;
_mm256_storeu_si256(M256_CAST(sub_msgs+ 0),
_mm256_setzero_si256());
_mm256_storeu_si256(M256_CAST(sub_msgs+ 4),
_mm256_setzero_si256());
_mm256_storeu_si256(M256_CAST(sub_msgs+ 8),
_mm256_setzero_si256());
_mm256_storeu_si256(M256_CAST(sub_msgs+12),
_mm256_setzero_si256());
}
inline void init224(LSH512_AVX2_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224);
}
inline void init256(LSH512_AVX2_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256);
}
inline void init384(LSH512_AVX2_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384);
}
inline void init512(LSH512_AVX2_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512);
}
/* -------------------------------------------------------- */
inline void fin(LSH512_AVX2_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
_mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)),
_mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0))));
_mm256_storeu_si256(M256_CAST(ctx->cv_l+4), _mm256_xor_si256(
_mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+4)),
_mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+4))));
}
/* -------------------------------------------------------- */
inline void get_hash(LSH512_AVX2_Context* ctx, lsh_u8* pbHashVal)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
CRYPTOPP_ASSERT(pbHashVal != NULLPTR);
lsh_uint alg_type = ctx->alg_type;
lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);
// Multiplying by sizeof(lsh_u8) looks odd...
memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
if (hash_val_bit_len){
pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
}
}
/* -------------------------------------------------------- */
lsh_err lsh512_init_avx2(LSH512_AVX2_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
lsh_u32 alg_type = ctx->alg_type;
const lsh_u64* const_v = NULL;
ctx->remain_databitlen = 0;
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
AVX_Cleanup cleanup;
switch (alg_type){
case LSH_TYPE_512_512:
init512(ctx);
return LSH_SUCCESS;
case LSH_TYPE_512_384:
init384(ctx);
return LSH_SUCCESS;
case LSH_TYPE_512_256:
init256(ctx);
return LSH_SUCCESS;
case LSH_TYPE_512_224:
init224(ctx);
return LSH_SUCCESS;
default:
break;
}
lsh_u64* cv_l = ctx->cv_l;
lsh_u64* cv_r = ctx->cv_r;
zero_iv(cv_l, cv_r);
cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN;
cv_l[1] = LSH_GET_HASHBIT(alg_type);
for (size_t i = 0; i < NUM_STEPS / 2; i++)
{
//Mix
load_sc(&const_v, i * 16);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
load_sc(&const_v, i * 16 + 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
}
return LSH_SUCCESS;
}
lsh_err lsh512_update_avx2(LSH512_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(data != NULLPTR);
CRYPTOPP_ASSERT(databitlen % 8 == 0);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
AVX_Cleanup cleanup;
if (databitlen == 0){
return LSH_SUCCESS;
}
	// We are byte-oriented. Tail bits will always be 0.
size_t databytelen = databitlen >> 3;
// lsh_uint pos2 = databitlen & 0x7;
const size_t pos2 = 0;
size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
const size_t remain_msg_bit = 0;
if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
return LSH_ERR_INVALID_STATE;
}
if (remain_msg_bit > 0){
return LSH_ERR_INVALID_DATABITLEN;
}
if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){
memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
ctx->remain_databitlen += (lsh_uint)databitlen;
remain_msg_byte += (lsh_uint)databytelen;
if (pos2){
ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
}
return LSH_SUCCESS;
}
if (remain_msg_byte > 0){
size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte;
memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
compress(ctx, ctx->last_block);
data += more_byte;
databytelen -= more_byte;
remain_msg_byte = 0;
ctx->remain_databitlen = 0;
}
while (databytelen >= LSH512_MSG_BLK_BYTE_LEN)
{
// This call to compress caused some trouble.
// The data pointer can become unaligned in the
// previous block.
compress(ctx, data);
data += LSH512_MSG_BLK_BYTE_LEN;
databytelen -= LSH512_MSG_BLK_BYTE_LEN;
}
if (databytelen > 0){
memcpy(ctx->last_block, data, databytelen);
ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
}
if (pos2){
ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
ctx->remain_databitlen += pos2;
}
return LSH_SUCCESS;
}
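// Editor's worked example of the buffering above (numbers only, no new code):
// with 100 bytes already buffered (remain_databitlen = 800) and a 300-byte
// update, more_byte = 256 - 100 = 156, so last_block is completed and
// compressed once; the remaining 144 bytes are less than
// LSH512_MSG_BLK_BYTE_LEN, so they are copied into last_block and
// remain_databitlen becomes 144*8 = 1152.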
lsh_err lsh512_final_avx2(LSH512_AVX2_Context* ctx, lsh_u8* hashval)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(hashval != NULLPTR);
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
AVX_Cleanup cleanup;
	// We are byte-oriented. Tail bits will always be 0.
size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
const size_t remain_msg_bit = 0;
if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
return LSH_ERR_INVALID_STATE;
}
if (remain_msg_bit){
ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
}
else{
ctx->last_block[remain_msg_byte] = 0x80;
}
memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
compress(ctx, ctx->last_block);
fin(ctx);
get_hash(ctx, hashval);
return LSH_SUCCESS;
}
ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
extern
void LSH512_Base_Restart_AVX2(word64* state)
{
state[RemainingBits] = 0;
LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh512_init_avx2(&ctx);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_avx2 failed");
}
extern
void LSH512_Base_Update_AVX2(word64* state, const byte *input, size_t size)
{
LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh512_update_avx2(&ctx, input, 8*size);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_avx2 failed");
}
extern
void LSH512_Base_TruncatedFinal_AVX2(word64* state, byte *hash, size_t)
{
LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh512_final_avx2(&ctx, hash);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_avx2 failed");
}
NAMESPACE_END
#endif // CRYPTOPP_AVX2_AVAILABLE

937
lsh512_sse.cpp Normal file
View File

@ -0,0 +1,937 @@
// lsh512_sse.cpp - written and placed in the public domain by Jeffrey Walton
// Based on the specification and source code provided by
// Korea Internet & Security Agency (KISA) website. Also
// see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
// and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
// We are hitting some sort of GCC bug in the LSH AVX2 code path.
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
// makes using zeroupper a little tricky.
#include "pch.h"
#include "config.h"
#include "lsh.h"
#include "misc.h"
#if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <tmmintrin.h>
#endif
#if defined(CRYPTOPP_XOP_AVAILABLE)
# include <ammintrin.h>
#endif
#if defined(__GNUC__) && defined(__amd64__)
# include <x86intrin.h>
#endif
ANONYMOUS_NAMESPACE_BEGIN
/* LSH Constants */
const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256;
// const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048;
// const unsigned int LSH512_CV_BYTE_LEN = 128;
const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64;
// const unsigned int MSG_BLK_WORD_LEN = 32;
const unsigned int CV_WORD_LEN = 16;
const unsigned int CONST_WORD_LEN = 8;
const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
const unsigned int NUM_STEPS = 28;
const unsigned int ROT_EVEN_ALPHA = 23;
const unsigned int ROT_EVEN_BETA = 59;
const unsigned int ROT_ODD_ALPHA = 7;
const unsigned int ROT_ODD_BETA = 3;
const unsigned int LSH_TYPE_512_512 = 0x0010040;
const unsigned int LSH_TYPE_512_384 = 0x0010030;
const unsigned int LSH_TYPE_512_256 = 0x0010020;
const unsigned int LSH_TYPE_512_224 = 0x001001C;
// const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384;
// const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512;
/* Error Code */
const unsigned int LSH_SUCCESS = 0x0;
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;
/* Index into our state array */
const unsigned int AlgorithmType = 80;
const unsigned int RemainingBits = 81;
NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(LSH)
// lsh512.cpp
extern const word64 LSH512_IV224[CV_WORD_LEN];
extern const word64 LSH512_IV256[CV_WORD_LEN];
extern const word64 LSH512_IV384[CV_WORD_LEN];
extern const word64 LSH512_IV512[CV_WORD_LEN];
extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS];
NAMESPACE_END // LSH
NAMESPACE_END // Crypto++
ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::rotlFixed;
using CryptoPP::rotlConstant;
using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;
using CryptoPP::ConditionalByteReverse;
using CryptoPP::LITTLE_ENDIAN_ORDER;
using CryptoPP::LSH::LSH512_IV224;
using CryptoPP::LSH::LSH512_IV256;
using CryptoPP::LSH::LSH512_IV384;
using CryptoPP::LSH::LSH512_IV512;
using CryptoPP::LSH::LSH512_StepConstants;
typedef byte lsh_u8;
typedef word32 lsh_u32;
typedef word64 lsh_u64;
typedef word32 lsh_uint;
typedef word32 lsh_err;
typedef word32 lsh_type;
struct LSH512_SSSE3_Context
{
LSH512_SSSE3_Context(word64* state, word64 algType, word64& remainingBitLength) :
cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
last_block(reinterpret_cast<byte*>(state+48)),
remain_databitlen(remainingBitLength),
alg_type(static_cast<lsh_type>(algType)) {}
lsh_u64* cv_l; // start of our state block
lsh_u64* cv_r;
lsh_u64* sub_msgs;
lsh_u8* last_block;
lsh_u64& remain_databitlen;
lsh_type alg_type;
};
struct LSH512_SSSE3_Internal
{
LSH512_SSSE3_Internal(word64* state) :
submsg_e_l(state+16), submsg_e_r(state+24),
submsg_o_l(state+32), submsg_o_r(state+40) { }
lsh_u64* submsg_e_l; /* even left sub-message */
lsh_u64* submsg_e_r; /* even right sub-message */
lsh_u64* submsg_o_l; /* odd left sub-message */
lsh_u64* submsg_o_r; /* odd right sub-message */
};
const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
/* LSH AlgType Macro */
inline bool LSH_IS_LSH512(lsh_uint val) {
return (val & 0xf0000) == 0x10000;
}
inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
return val >> 24;
}
inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
return val & 0xffff;
}
inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
}
inline lsh_u64 loadLE64(lsh_u64 v) {
return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
}
lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) {
return rotlFixed(x, r);
}
// Original code relied upon unaligned lsh_u64 buffer
inline void load_msg_blk(LSH512_SSSE3_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN])
{
lsh_u64* submsg_e_l = i_state->submsg_e_l;
lsh_u64* submsg_e_r = i_state->submsg_e_r;
lsh_u64* submsg_o_l = i_state->submsg_o_l;
lsh_u64* submsg_o_r = i_state->submsg_o_r;
_mm_storeu_si128(M128_CAST(submsg_e_l+0),
_mm_loadu_si128(CONST_M128_CAST(msgblk+0)));
_mm_storeu_si128(M128_CAST(submsg_e_l+2),
_mm_loadu_si128(CONST_M128_CAST(msgblk+16)));
_mm_storeu_si128(M128_CAST(submsg_e_l+4),
_mm_loadu_si128(CONST_M128_CAST(msgblk+32)));
_mm_storeu_si128(M128_CAST(submsg_e_l+6),
_mm_loadu_si128(CONST_M128_CAST(msgblk+48)));
_mm_storeu_si128(M128_CAST(submsg_e_r+0),
_mm_loadu_si128(CONST_M128_CAST(msgblk+64)));
_mm_storeu_si128(M128_CAST(submsg_e_r+2),
_mm_loadu_si128(CONST_M128_CAST(msgblk+80)));
_mm_storeu_si128(M128_CAST(submsg_e_r+4),
_mm_loadu_si128(CONST_M128_CAST(msgblk+96)));
_mm_storeu_si128(M128_CAST(submsg_e_r+6),
_mm_loadu_si128(CONST_M128_CAST(msgblk+112)));
_mm_storeu_si128(M128_CAST(submsg_o_l+0),
_mm_loadu_si128(CONST_M128_CAST(msgblk+128)));
_mm_storeu_si128(M128_CAST(submsg_o_l+2),
_mm_loadu_si128(CONST_M128_CAST(msgblk+144)));
_mm_storeu_si128(M128_CAST(submsg_o_l+4),
_mm_loadu_si128(CONST_M128_CAST(msgblk+160)));
_mm_storeu_si128(M128_CAST(submsg_o_l+6),
_mm_loadu_si128(CONST_M128_CAST(msgblk+176)));
_mm_storeu_si128(M128_CAST(submsg_o_r+0),
_mm_loadu_si128(CONST_M128_CAST(msgblk+192)));
_mm_storeu_si128(M128_CAST(submsg_o_r+2),
_mm_loadu_si128(CONST_M128_CAST(msgblk+208)));
_mm_storeu_si128(M128_CAST(submsg_o_r+4),
_mm_loadu_si128(CONST_M128_CAST(msgblk+224)));
_mm_storeu_si128(M128_CAST(submsg_o_r+6),
_mm_loadu_si128(CONST_M128_CAST(msgblk+240)));
}
inline void msg_exp_even(LSH512_SSSE3_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u64* submsg_e_l = i_state->submsg_e_l;
lsh_u64* submsg_e_r = i_state->submsg_e_r;
lsh_u64* submsg_o_l = i_state->submsg_o_l;
lsh_u64* submsg_o_r = i_state->submsg_o_r;
__m128i temp;
_mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)), _MM_SHUFFLE(1,0,3,2)));
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0));
_mm_storeu_si128(M128_CAST(submsg_e_l+0),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)));
_mm_storeu_si128(M128_CAST(submsg_e_l+2), temp);
_mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), _MM_SHUFFLE(1,0,3,2)));
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4));
_mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_unpacklo_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4))));
_mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_unpackhi_epi64(
temp, _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6))));
_mm_storeu_si128(M128_CAST(submsg_e_r+2), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)), _MM_SHUFFLE(1,0,3,2)));
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0));
_mm_storeu_si128(M128_CAST(submsg_e_r+0),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)));
_mm_storeu_si128(M128_CAST(submsg_e_r+2), temp);
_mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), _MM_SHUFFLE(1,0,3,2)));
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4));
_mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_unpacklo_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4))));
_mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_unpackhi_epi64(
temp, _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6))));
_mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0))));
_mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2))));
_mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4))));
_mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6))));
_mm_storeu_si128(M128_CAST(submsg_e_r+0), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0))));
_mm_storeu_si128(M128_CAST(submsg_e_r+2), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2))));
_mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4))));
_mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6))));
}
inline void msg_exp_odd(LSH512_SSSE3_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u64* submsg_e_l = i_state->submsg_e_l;
lsh_u64* submsg_e_r = i_state->submsg_e_r;
lsh_u64* submsg_o_l = i_state->submsg_o_l;
lsh_u64* submsg_o_r = i_state->submsg_o_r;
__m128i temp;
_mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)), _MM_SHUFFLE(1,0,3,2)));
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0));
_mm_storeu_si128(M128_CAST(submsg_o_l+0),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)));
_mm_storeu_si128(M128_CAST(submsg_o_l+2), temp);
_mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), _MM_SHUFFLE(1,0,3,2)));
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4));
_mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_unpacklo_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4))));
_mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_unpackhi_epi64(
temp, _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6))));
_mm_storeu_si128(M128_CAST(submsg_o_r+2), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)), _MM_SHUFFLE(1,0,3,2)));
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0));
_mm_storeu_si128(M128_CAST(submsg_o_r+0),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)));
_mm_storeu_si128(M128_CAST(submsg_o_r+2), temp);
_mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), _MM_SHUFFLE(1,0,3,2)));
temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4));
_mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_unpacklo_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4))));
_mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_unpackhi_epi64(
temp, _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6))));
_mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0))));
_mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2))));
_mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4))));
_mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6))));
_mm_storeu_si128(M128_CAST(submsg_o_r+0), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0))));
_mm_storeu_si128(M128_CAST(submsg_o_r+2), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2))));
_mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4))));
_mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6))));
}
inline void load_sc(const lsh_u64** p_const_v, size_t i)
{
*p_const_v = &LSH512_StepConstants[i];
}
inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_SSSE3_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u64* submsg_e_l = i_state->submsg_e_l;
lsh_u64* submsg_e_r = i_state->submsg_e_r;
_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l))));
_mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r))));
_mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2))));
_mm_storeu_si128(M128_CAST(cv_r+2), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r+2)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2))));
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4))));
_mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4))));
_mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6))));
_mm_storeu_si128(M128_CAST(cv_r+6), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6))));
}
inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_SSSE3_Internal* i_state)
{
CRYPTOPP_ASSERT(i_state != NULLPTR);
lsh_u64* submsg_o_l = i_state->submsg_o_l;
lsh_u64* submsg_o_r = i_state->submsg_o_r;
_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l))));
_mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r))));
_mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2))));
_mm_storeu_si128(M128_CAST(cv_r+2), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r+2)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2))));
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4))));
_mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4))));
_mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6))));
_mm_storeu_si128(M128_CAST(cv_r+6), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_r+6)),
_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6))));
}
inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
_mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(cv_l)),
_mm_loadu_si128(CONST_M128_CAST(cv_r))));
_mm_storeu_si128(M128_CAST(cv_l+2), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
_mm_loadu_si128(CONST_M128_CAST(cv_r+2))));
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
_mm_loadu_si128(CONST_M128_CAST(cv_r+4))));
_mm_storeu_si128(M128_CAST(cv_l+6), _mm_add_epi64(
_mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
_mm_loadu_si128(CONST_M128_CAST(cv_r+6))));
}
template <unsigned int R>
inline void rotate_blk(lsh_u64 cv[8])
{
#if defined(CRYPTOPP_XOP_AVAILABLE)
_mm_storeu_si128(M128_CAST(cv),
_mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R));
_mm_storeu_si128(M128_CAST(cv+2),
_mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), R));
_mm_storeu_si128(M128_CAST(cv+4),
_mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R));
_mm_storeu_si128(M128_CAST(cv+6),
_mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R));
#else
_mm_storeu_si128(M128_CAST(cv), _mm_or_si128(
_mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R),
_mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), 64-R)));
_mm_storeu_si128(M128_CAST(cv+2), _mm_or_si128(
_mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), R),
_mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), 64-R)));
_mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128(
_mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R),
_mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), 64-R)));
_mm_storeu_si128(M128_CAST(cv+6), _mm_or_si128(
_mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R),
_mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), 64-R)));
#endif
}
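// Editor's note: both branches above compute the same 64-bit rotate-left,
//   rotl64(x, R) = (x << R) | (x >> (64 - R)),
// the XOP path with a single _mm_roti_epi64 per vector and the generic
// path with a shift/shift/or sequence.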
inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8])
{
_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l)),
_mm_loadu_si128(CONST_M128_CAST(const_v))));
_mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
_mm_loadu_si128(CONST_M128_CAST(const_v+2))));
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
_mm_loadu_si128(CONST_M128_CAST(const_v+4))));
_mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
_mm_loadu_si128(CONST_M128_CAST(const_v+6))));
}
inline void rotate_msg_gamma(lsh_u64 cv_r[8])
{
// g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
_mm_storeu_si128(M128_CAST(cv_r+0),
_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
_mm_set_epi8(13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0)));
_mm_storeu_si128(M128_CAST(cv_r+2),
_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+2)),
_mm_set_epi8(9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4)));
_mm_storeu_si128(M128_CAST(cv_r+4),
_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
_mm_set_epi8(12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7)));
_mm_storeu_si128(M128_CAST(cv_r+6),
_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+6)),
_mm_set_epi8(8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3)));
}
inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
__m128i temp[2];
temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
_mm_storeu_si128(M128_CAST(cv_l+0), _mm_unpacklo_epi64(
_mm_loadu_si128(CONST_M128_CAST(cv_l+2)),
_mm_loadu_si128(CONST_M128_CAST(cv_l+0))));
_mm_storeu_si128(M128_CAST(cv_l+2), _mm_unpackhi_epi64(
temp[0], _mm_loadu_si128(CONST_M128_CAST(cv_l+2))));
temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+4));
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_unpacklo_epi64(
_mm_loadu_si128(CONST_M128_CAST(cv_l+6)),
_mm_loadu_si128(CONST_M128_CAST(cv_l+4))));
_mm_storeu_si128(M128_CAST(cv_l+6), _mm_unpackhi_epi64(
temp[0], _mm_loadu_si128(CONST_M128_CAST(cv_l+6))));
_mm_storeu_si128(M128_CAST(cv_r+2), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(cv_r+2)), _MM_SHUFFLE(1,0,3,2)));
temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_r+0));
_mm_storeu_si128(M128_CAST(cv_r+0), _mm_unpacklo_epi64(
_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
_mm_loadu_si128(CONST_M128_CAST(cv_r+2))));
_mm_storeu_si128(M128_CAST(cv_r+2), _mm_unpackhi_epi64(
_mm_loadu_si128(CONST_M128_CAST(cv_r+2)), temp[0]));
_mm_storeu_si128(M128_CAST(cv_r+6), _mm_shuffle_epi32(
_mm_loadu_si128(CONST_M128_CAST(cv_r+6)), _MM_SHUFFLE(1,0,3,2)));
temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_r+4));
_mm_storeu_si128(M128_CAST(cv_r+4), _mm_unpacklo_epi64(
_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
_mm_loadu_si128(CONST_M128_CAST(cv_r+6))));
_mm_storeu_si128(M128_CAST(cv_r+6), _mm_unpackhi_epi64(
_mm_loadu_si128(CONST_M128_CAST(cv_r+6)), temp[0]));
temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
temp[1] = _mm_loadu_si128(CONST_M128_CAST(cv_l+2));
_mm_storeu_si128(M128_CAST(cv_l+0),
_mm_loadu_si128(CONST_M128_CAST(cv_l+4)));
_mm_storeu_si128(M128_CAST(cv_l+2),
_mm_loadu_si128(CONST_M128_CAST(cv_l+6)));
_mm_storeu_si128(M128_CAST(cv_l+4),
_mm_loadu_si128(CONST_M128_CAST(cv_r+4)));
_mm_storeu_si128(M128_CAST(cv_l+6),
_mm_loadu_si128(CONST_M128_CAST(cv_r+6)));
_mm_storeu_si128(M128_CAST(cv_r+4),
_mm_loadu_si128(CONST_M128_CAST(cv_r+0)));
_mm_storeu_si128(M128_CAST(cv_r+6),
_mm_loadu_si128(CONST_M128_CAST(cv_r+2)));
_mm_storeu_si128(M128_CAST(cv_r+0), temp[0]);
_mm_storeu_si128(M128_CAST(cv_r+2), temp[1]);
}
/* -------------------------------------------------------- *
* step function
* -------------------------------------------------------- */
template <unsigned int Alpha, unsigned int Beta>
inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8])
{
add_blk(cv_l, cv_r);
rotate_blk<Alpha>(cv_l);
xor_with_const(cv_l, const_v);
add_blk(cv_r, cv_l);
rotate_blk<Beta>(cv_r);
add_blk(cv_l, cv_r);
rotate_msg_gamma(cv_r);
}
/* -------------------------------------------------------- *
* compression function
* -------------------------------------------------------- */
inline void compress(LSH512_SSSE3_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN])
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
LSH512_SSSE3_Internal s_state(ctx->cv_l);
LSH512_SSSE3_Internal* i_state = &s_state;
const lsh_u64* const_v = NULL;
lsh_u64 *cv_l = ctx->cv_l;
lsh_u64 *cv_r = ctx->cv_r;
load_msg_blk(i_state, pdMsgBlk);
msg_add_even(cv_l, cv_r, i_state);
load_sc(&const_v, 0);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
msg_add_odd(cv_l, cv_r, i_state);
load_sc(&const_v, 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
for (size_t i = 1; i < NUM_STEPS / 2; i++)
{
msg_exp_even(i_state);
msg_add_even(cv_l, cv_r, i_state);
load_sc(&const_v, 16 * i);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
msg_exp_odd(i_state);
msg_add_odd(cv_l, cv_r, i_state);
load_sc(&const_v, 16 * i + 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
}
msg_exp_even(i_state);
msg_add_even(cv_l, cv_r, i_state);
}
/* -------------------------------------------------------- */
inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16])
{
	// The IVs are 32-byte aligned so we can use aligned loads.
_mm_storeu_si128(M128_CAST(cv_l+0),
_mm_load_si128(CONST_M128_CAST(iv+0)));
_mm_storeu_si128(M128_CAST(cv_l+2),
_mm_load_si128(CONST_M128_CAST(iv+2)));
_mm_storeu_si128(M128_CAST(cv_l+4),
_mm_load_si128(CONST_M128_CAST(iv+4)));
_mm_storeu_si128(M128_CAST(cv_l+6),
_mm_load_si128(CONST_M128_CAST(iv+6)));
_mm_storeu_si128(M128_CAST(cv_r+0),
_mm_load_si128(CONST_M128_CAST(iv+8)));
_mm_storeu_si128(M128_CAST(cv_r+2),
_mm_load_si128(CONST_M128_CAST(iv+10)));
_mm_storeu_si128(M128_CAST(cv_r+4),
_mm_load_si128(CONST_M128_CAST(iv+12)));
_mm_storeu_si128(M128_CAST(cv_r+6),
_mm_load_si128(CONST_M128_CAST(iv+14)));
}
inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
{
_mm_storeu_si128(M128_CAST(cv_l+0), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(cv_l+2), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(cv_l+4), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(cv_l+6), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(cv_r+0), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(cv_r+2), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(cv_r+4), _mm_setzero_si128());
_mm_storeu_si128(M128_CAST(cv_r+6), _mm_setzero_si128());
}
inline void zero_submsgs(LSH512_SSSE3_Context* ctx)
{
lsh_u64* sub_msgs = ctx->sub_msgs;
_mm_storeu_si128(M128_CAST(sub_msgs+ 0),
_mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+ 2),
_mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+ 4),
_mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+ 6),
_mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+ 8),
_mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+10),
_mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+12),
_mm_setzero_si128());
_mm_storeu_si128(M128_CAST(sub_msgs+14),
_mm_setzero_si128());
}
inline void init224(LSH512_SSSE3_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224);
}
inline void init256(LSH512_SSSE3_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256);
}
inline void init384(LSH512_SSSE3_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384);
}
inline void init512(LSH512_SSSE3_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
zero_submsgs(ctx);
load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512);
}
/* -------------------------------------------------------- */
inline void fin(LSH512_SSSE3_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
_mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)),
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0))));
_mm_storeu_si128(M128_CAST(ctx->cv_l+2), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+2)),
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+2))));
_mm_storeu_si128(M128_CAST(ctx->cv_l+4), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+4)),
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+4))));
_mm_storeu_si128(M128_CAST(ctx->cv_l+6), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+6)),
_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+6))));
}
/* -------------------------------------------------------- */
inline void get_hash(LSH512_SSSE3_Context* ctx, lsh_u8* pbHashVal)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
CRYPTOPP_ASSERT(pbHashVal != NULLPTR);
lsh_uint alg_type = ctx->alg_type;
lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);
// Multiplying by sizeof(lsh_u8) looks odd...
memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
if (hash_val_bit_len){
pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
}
}
/* -------------------------------------------------------- */
lsh_err lsh512_init_ssse3(LSH512_SSSE3_Context* ctx)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
lsh_u32 alg_type = ctx->alg_type;
const lsh_u64* const_v = NULL;
ctx->remain_databitlen = 0;
switch (alg_type){
case LSH_TYPE_512_512:
init512(ctx);
return LSH_SUCCESS;
case LSH_TYPE_512_384:
init384(ctx);
return LSH_SUCCESS;
case LSH_TYPE_512_256:
init256(ctx);
return LSH_SUCCESS;
case LSH_TYPE_512_224:
init224(ctx);
return LSH_SUCCESS;
default:
break;
}
lsh_u64* cv_l = ctx->cv_l;
lsh_u64* cv_r = ctx->cv_r;
zero_iv(cv_l, cv_r);
cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN;
cv_l[1] = LSH_GET_HASHBIT(alg_type);
for (size_t i = 0; i < NUM_STEPS / 2; i++)
{
//Mix
load_sc(&const_v, i * 16);
mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
load_sc(&const_v, i * 16 + 8);
mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
word_perm(cv_l, cv_r);
}
return LSH_SUCCESS;
}
lsh_err lsh512_update_ssse3(LSH512_SSSE3_Context* ctx, const lsh_u8* data, size_t databitlen)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(data != NULLPTR);
CRYPTOPP_ASSERT(databitlen % 8 == 0);
CRYPTOPP_ASSERT(ctx->alg_type != 0);
if (databitlen == 0){
return LSH_SUCCESS;
}
	// We are byte-oriented. Tail bits will always be 0.
size_t databytelen = databitlen >> 3;
// lsh_uint pos2 = databitlen & 0x7;
const size_t pos2 = 0;
size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
const size_t remain_msg_bit = 0;
if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
return LSH_ERR_INVALID_STATE;
}
if (remain_msg_bit > 0){
return LSH_ERR_INVALID_DATABITLEN;
}
if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){
memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
ctx->remain_databitlen += (lsh_uint)databitlen;
remain_msg_byte += (lsh_uint)databytelen;
if (pos2){
ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
}
return LSH_SUCCESS;
}
if (remain_msg_byte > 0){
size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte;
memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
compress(ctx, ctx->last_block);
data += more_byte;
databytelen -= more_byte;
remain_msg_byte = 0;
ctx->remain_databitlen = 0;
}
while (databytelen >= LSH512_MSG_BLK_BYTE_LEN)
{
// This call to compress caused some trouble.
// The data pointer can become unaligned in the
// previous block.
compress(ctx, data);
data += LSH512_MSG_BLK_BYTE_LEN;
databytelen -= LSH512_MSG_BLK_BYTE_LEN;
}
if (databytelen > 0){
memcpy(ctx->last_block, data, databytelen);
ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
}
if (pos2){
ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
ctx->remain_databitlen += pos2;
}
return LSH_SUCCESS;
}
lsh_err lsh512_final_ssse3(LSH512_SSSE3_Context* ctx, lsh_u8* hashval)
{
CRYPTOPP_ASSERT(ctx != NULLPTR);
CRYPTOPP_ASSERT(hashval != NULLPTR);
	// We are byte-oriented. Tail bits will always be 0.
size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
const size_t remain_msg_bit = 0;
if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
return LSH_ERR_INVALID_STATE;
}
if (remain_msg_bit){
ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
}
else{
ctx->last_block[remain_msg_byte] = 0x80;
}
memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
compress(ctx, ctx->last_block);
fin(ctx);
get_hash(ctx, hashval);
return LSH_SUCCESS;
}
ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
extern
void LSH512_Base_Restart_SSSE3(word64* state)
{
state[RemainingBits] = 0;
LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh512_init_ssse3(&ctx);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_ssse3 failed");
}
extern
void LSH512_Base_Update_SSSE3(word64* state, const byte *input, size_t size)
{
LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh512_update_ssse3(&ctx, input, 8*size);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_ssse3 failed");
}
extern
void LSH512_Base_TruncatedFinal_SSSE3(word64* state, byte *hash, size_t)
{
LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
lsh_err err = lsh512_final_ssse3(&ctx, hash);
if (err != LSH_SUCCESS)
throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_ssse3 failed");
}
NAMESPACE_END
#endif // CRYPTOPP_SSSE3_AVAILABLE