From dd7598e638bba536117de716bae3f738312d4c5a Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Tue, 7 Jul 2020 15:22:09 -0400 Subject: [PATCH] Remove 64-bit AdvancedProcessBlocks (GH #945) --- Filelist.txt | 3 - GNUmakefile | 24 - GNUmakefile-cross | 21 - adv_simd.h | 1116 +------------------------------------- cham.cpp | 34 +- cham.h | 19 +- cham_simd.cpp | 608 --------------------- cryptest.nmake | 12 +- cryptlib.vcxproj | 3 - cryptlib.vcxproj.filters | 9 - simeck.cpp | 40 -- simeck.h | 13 - simeck_simd.cpp | 342 ------------ simon.cpp | 121 +---- simon.h | 18 - simon64_simd.cpp | 864 ----------------------------- speck.cpp | 109 ---- speck.h | 18 - speck64_simd.cpp | 781 -------------------------- 19 files changed, 25 insertions(+), 4130 deletions(-) delete mode 100644 simeck_simd.cpp delete mode 100644 simon64_simd.cpp delete mode 100644 speck64_simd.cpp diff --git a/Filelist.txt b/Filelist.txt index 06cefca7..8f10b2a8 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -334,10 +334,8 @@ simple.cpp simple.h siphash.h simeck.cpp -simeck_simd.cpp simeck.h simon.cpp -simon64_simd.cpp simon128_simd.cpp simon.h skipjack.cpp @@ -351,7 +349,6 @@ smartptr.h sosemanuk.cpp sosemanuk.h speck.cpp -speck64_simd.cpp speck128_simd.cpp speck.h square.cpp diff --git a/GNUmakefile b/GNUmakefile index bd636c57..e52d3417 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -292,7 +292,6 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = $(SSSE3_FLAG) KECCAK_FLAG = $(SSSE3_FLAG) LEA_FLAG = $(SSSE3_FLAG) - SIMECK_FLAG = $(SSSE3_FLAG) SIMON128_FLAG = $(SSSE3_FLAG) SPECK128_FLAG = $(SSSE3_FLAG) SUN_LDFLAGS += $(SSSE3_FLAG) @@ -306,8 +305,6 @@ ifeq ($(DETECT_FEATURES),1) ifeq ($(strip $(HAVE_OPT)),0) BLAKE2B_FLAG = $(SSE41_FLAG) BLAKE2S_FLAG = $(SSE41_FLAG) - SIMON64_FLAG = $(SSE41_FLAG) - SPECK64_FLAG = $(SSE41_FLAG) SUN_LDFLAGS += $(SSE41_FLAG) else SSE41_FLAG = @@ -478,10 +475,7 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = -march=armv7-a -mfpu=neon LEA_FLAG = -march=armv7-a -mfpu=neon SHA_FLAG = -march=armv7-a -mfpu=neon - SIMECK_FLAG = -march=armv7-a -mfpu=neon - SIMON64_FLAG = -march=armv7-a -mfpu=neon SIMON128_FLAG = -march=armv7-a -mfpu=neon - SPECK64_FLAG = -march=armv7-a -mfpu=neon SPECK128_FLAG = -march=armv7-a -mfpu=neon SM4_FLAG = -march=armv7-a -mfpu=neon else @@ -521,10 +515,7 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = -march=armv8-a LEA_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a - SIMECK_FLAG = -march=armv8-a - SIMON64_FLAG = -march=armv8-a SIMON128_FLAG = -march=armv8-a - SPECK64_FLAG = -march=armv8-a SPECK128_FLAG = -march=armv8-a SM4_FLAG = -march=armv8-a else @@ -658,7 +649,6 @@ ifeq ($(DETECT_FEATURES),1) LEA_FLAG = $(POWER8_FLAG) SHA_FLAG = $(POWER8_FLAG) SHACAL2_FLAG = $(POWER8_FLAG) - SIMECK_FLAG = $(POWER8_FLAG) else POWER8_FLAG = endif @@ -724,8 +714,6 @@ ifeq ($(DETECT_FEATURES),1) ifneq ($(ALTIVEC_FLAG),) BLAKE2S_FLAG = $(ALTIVEC_FLAG) CHACHA_FLAG = $(ALTIVEC_FLAG) - SIMON64_FLAG = $(ALTIVEC_FLAG) - SPECK64_FLAG = $(ALTIVEC_FLAG) SPECK128_FLAG = $(ALTIVEC_FLAG) SIMON128_FLAG = $(ALTIVEC_FLAG) endif @@ -1612,22 +1600,10 @@ sha3_simd.o : sha3_simd.cpp shacal2_simd.o : shacal2_simd.cpp $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SHA_FLAG) -c) $< -# SSSE3 or NEON available -simeck_simd.o : simeck_simd.cpp - $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMECK_FLAG) -c) $< - -# SSE4.1, NEON or POWER7 available -simon64_simd.o : simon64_simd.cpp - $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMON64_FLAG) -c) $< - # SSSE3, NEON or POWER8 available simon128_simd.o : simon128_simd.cpp $(CXX) $(strip $(CPPFLAGS) 
$(CXXFLAGS) $(SIMON128_FLAG) -c) $< -# SSE4.1, NEON or POWER7 available -speck64_simd.o : speck64_simd.cpp - $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SPECK64_FLAG) -c) $< - # SSSE3, NEON or POWER8 available speck128_simd.o : speck128_simd.cpp $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SPECK128_FLAG) -c) $< diff --git a/GNUmakefile-cross b/GNUmakefile-cross index 97f92f41..13b8970c 100755 --- a/GNUmakefile-cross +++ b/GNUmakefile-cross @@ -241,7 +241,6 @@ ifeq ($(DETECT_FEATURES),1) ARIA_FLAG = $(SSSE3_FLAG) CHAM_FLAG = $(SSSE3_FLAG) LEA_FLAG = $(SSSE3_FLAG) - SIMECK_FLAG = $(SSSE3_FLAG) SIMON128_FLAG = $(SSSE3_FLAG) SPECK128_FLAG = $(SSSE3_FLAG) else @@ -254,8 +253,6 @@ ifeq ($(DETECT_FEATURES),1) ifeq ($(strip $(HAVE_OPT)),0) BLAKE2B_FLAG = $(SSE41_FLAG) BLAKE2S_FLAG = $(SSE41_FLAG) - SIMON64_FLAG = $(SSE41_FLAG) - SPECK64_FLAG = $(SSE41_FLAG) else SSE41_FLAG = endif @@ -400,10 +397,7 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = $(NEON_FLAG) LEA_FLAG = $(NEON_FLAG) SHA_FLAG = $(NEON_FLAG) - SIMECK_FLAG = $(NEON_FLAG) - SIMON64_FLAG = $(NEON_FLAG) SIMON128_FLAG = $(NEON_FLAG) - SPECK64_FLAG = $(NEON_FLAG) SPECK128_FLAG = $(NEON_FLAG) SM4_FLAG = $(NEON_FLAG) else @@ -457,10 +451,7 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = $(ASIMD_FLAG) LEA_FLAG = $(ASIMD_FLAG) NEON_FLAG = $(ASIMD_FLAG) - SIMECK_FLAG = $(ASIMD_FLAG) - SIMON64_FLAG = $(ASIMD_FLAG) SIMON128_FLAG = $(ASIMD_FLAG) - SPECK64_FLAG = $(ASIMD_FLAG) SPECK128_FLAG = $(ASIMD_FLAG) SM4_FLAG = $(ASIMD_FLAG) else @@ -933,22 +924,10 @@ sha512_armv4.o : sha512_armv4.S shacal2_simd.o : shacal2_simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $< -# SSSE3 or NEON available -simeck_simd.o : simeck_simd.cpp - $(CXX) $(strip $(CXXFLAGS) $(SIMECK_FLAG) -c) $< - -# SSE4.1, NEON or POWER7 available -simon64_simd.o : simon64_simd.cpp - $(CXX) $(strip $(CXXFLAGS) $(SIMON64_FLAG) -c) $< - # SSSE3, NEON or POWER8 available simon128_simd.o : simon128_simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SIMON128_FLAG) -c) $< -# SSE4.1, NEON or POWER7 available -speck64_simd.o : speck64_simd.cpp - $(CXX) $(strip $(CXXFLAGS) $(SPECK64_FLAG) -c) $< - # SSSE3, NEON or POWER8 available speck128_simd.o : speck128_simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SPECK128_FLAG) -c) $< diff --git a/adv_simd.h b/adv_simd.h index 27e4cd43..134a8210 100644 --- a/adv_simd.h +++ b/adv_simd.h @@ -9,27 +9,16 @@ // acceleration. After several implementations we noticed a lot of copy and // paste occurring. adv_simd.h provides a template to avoid the copy and paste. // -// There are 11 templates provided in this file. The number following the -// function name, 64 or 128, is the block size. The name following the block -// size is the arrangement and acceleration. For example 4x1_SSE means Intel -// SSE using two encrypt (or decrypt) functions: one that operates on 4 SIMD -// words, and one that operates on 1 SIMD words. +// There are 6 templates provided in this file. The number following the +// function name, 128, is the block size in bits. The name following the +// block size is the arrangement and acceleration. For example 4x1_SSE means +// Intel SSE using two encrypt (or decrypt) functions: one that operates on +// 4 SIMD words, and one that operates on 1 SIMD word. // -// The distinction between SIMD words versus cipher blocks is important -// because 64-bit ciphers use one SIMD word for two cipher blocks. For -// example, AdvancedProcessBlocks64_6x2_ALTIVEC operates on 6 and 2 SIMD -// words, which is 12 and 4 cipher blocks.
The function will do the right // thing even if there is only one 64-bit block to encrypt. -// -// * AdvancedProcessBlocks64_2x1_SSE -// * AdvancedProcessBlocks64_4x1_SSE // * AdvancedProcessBlocks128_4x1_SSE -// * AdvancedProcessBlocks64_6x2_SSE // * AdvancedProcessBlocks128_6x2_SSE -// * AdvancedProcessBlocks64_6x2_NEON // * AdvancedProcessBlocks128_4x1_NEON // * AdvancedProcessBlocks128_6x2_NEON -// * AdvancedProcessBlocks64_6x2_ALTIVEC // * AdvancedProcessBlocks128_4x1_ALTIVEC // * AdvancedProcessBlocks128_6x1_ALTIVEC // @@ -41,6 +30,10 @@ // The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions // of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually // results in a failed link due to the const/non-const mismatch. +// +// In July 2020 the library stopped using the 64-bit block versions of +// AdvancedProcessBlocks. Testing showed unreliable results and failed +// self tests on occasion. #ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES #define CRYPTOPP_ADVANCED_SIMD_TEMPLATES @@ -94,247 +87,6 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) -/// \brief AdvancedProcessBlocks for 2 and 6 blocks -/// \tparam F2 function to process 2 64-bit blocks -/// \tparam F6 function to process 6 64-bit blocks -/// \tparam W word type of the subkey table -/// \details AdvancedProcessBlocks64_6x2_NEON processes 6 and 2 NEON SIMD words -/// at a time. For a single block the template uses F2 with a zero block. -/// \details The subkey type is usually word32 or word64. F2 and F6 must use the -/// same word type. -template <typename F2, typename F6, typename W> -inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, - const W *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - - const unsigned int w_one[] = {0, 0<<24, 0, 1<<24}; - const unsigned int w_two[] = {0, 2<<24, 0, 2<<24}; - const uint32x4_t s_one = vld1q_u32(w_one); - const uint32x4_t s_two = vld1q_u32(w_two); - - const size_t blockSize = 8; - const size_t neonBlockSize = 16; - - size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize; - size_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0; - size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize; - - // Clang and Coverity are generating findings using xorBlocks as a flag. - const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); - const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); - - if (flags & BT_ReverseDirection) - { - inBlocks = PtrAdd(inBlocks, length - neonBlockSize); - xorBlocks = PtrAdd(xorBlocks, length - neonBlockSize); - outBlocks = PtrAdd(outBlocks, length - neonBlockSize); - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BT_AllowParallel) - { - while (length >= 6*neonBlockSize) - { - uint32x4_t block0, block1, block2, block3, block4, block5; - if (flags & BT_InBlockIsCounter) - { - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the NEON word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. - const uint8x8_t ctr = vld1_u8(inBlocks); - block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr))); - - // After initial increment of {0,1} remaining counters increment by {2,2}.
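// Aside (editor's sketch, not part of the patch): the staggered adds below
// implement, per 128-bit word, the scalar counter sequence
//
//   low half:  ctr+0, ctr+2, ctr+4, ctr+6, ctr+8,  ctr+10
//   high half: ctr+1, ctr+3, ctr+5, ctr+7, ctr+9,  ctr+11
//
// so one pass of the six-word loop consumes 12 counter values, matching the
// "12 and 4 cipher blocks" noted in the header comment.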
- block1 = vaddq_u32(s_two, block0); - block2 = vaddq_u32(s_two, block1); - block3 = vaddq_u32(s_two, block2); - block4 = vaddq_u32(s_two, block3); - block5 = vaddq_u32(s_two, block4); - - vst1_u8(const_cast(inBlocks), vget_low_u8( - vreinterpretq_u8_u32(vaddq_u32(s_two, block5)))); - } - else - { - block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5)); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 6*neonBlockSize; - } - - while (length >= 2*neonBlockSize) - { - uint32x4_t block0, block1; - if (flags & BT_InBlockIsCounter) - { - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the NEON word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. 
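// Aside (editor's sketch, not part of the patch): the "dup load" performed
// below, in isolation. vld1_u8 reads the 8-byte counter once and
// vcombine_u8(ctr,ctr) splats it into both halves of a 16-byte NEON word:
//
//   uint8x8_t  ctr  = vld1_u8(inBlocks);     // one 64-bit counter block
//   uint8x16_t pair = vcombine_u8(ctr, ctr); // {ctr, ctr} in one SIMD word
//   // s_one then adds {0,1} so the two copies become consecutive counters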
- const uint8x8_t ctr = vld1_u8(inBlocks); - block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr))); - - // After initial increment of {0,1} remaining counters increment by {2,2}. - block1 = vaddq_u32(s_two, block0); - - vst1_u8(const_cast(inBlocks), vget_low_u8( - vreinterpretq_u8_u32(vaddq_u32(s_two, block1)))); - } - else - { - block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func2(block0, block1, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1)); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 2*neonBlockSize; - } - } - - if (length) - { - // Adjust to real block size - if (flags & BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? blockSize : 0; - inBlocks = PtrSub(inBlocks, inIncrement); - xorBlocks = PtrSub(xorBlocks, xorIncrement); - outBlocks = PtrSub(outBlocks, outIncrement); - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? blockSize : 0; - } - - while (length >= blockSize) - { - uint32x4_t block, zero = {0}; - - const uint8x8_t v = vld1_u8(inBlocks); - block = vreinterpretq_u32_u8(vcombine_u8(v,v)); - - if (xorInput) - { - const uint8x8_t x = vld1_u8(xorBlocks); - block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x))); - } - - if (flags & BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func2(block, zero, subKeys, static_cast(rounds)); - - if (xorOutput) - { - const uint8x8_t x = vld1_u8(xorBlocks); - block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x))); - } - - vst1_u8(const_cast(outBlocks), - vget_low_u8(vreinterpretq_u8_u32(block))); - - inBlocks = PtrAdd(inBlocks, inIncrement); - outBlocks = PtrAdd(outBlocks, outIncrement); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - length -= blockSize; - } - } - - return length; -} - /// \brief AdvancedProcessBlocks for 1 and 6 blocks /// \tparam F1 function to process 1 128-bit block /// \tparam F6 function to process 6 128-bit blocks @@ -870,412 +622,6 @@ NAMESPACE_END // CryptoPP NAMESPACE_BEGIN(CryptoPP) -/// \brief AdvancedProcessBlocks for 1 and 2 blocks -/// \tparam F1 function to process 1 64-bit block -/// \tparam F2 function to process 2 64-bit blocks -/// \tparam W word type of the subkey table -/// \details AdvancedProcessBlocks64_2x1_SSE processes 2 and 1 SSE SIMD words -/// at a time. -/// \details The subkey type is usually word32 or word64. F1 and F2 must use the -/// same word type. 
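// Aside (editor's sketch, not part of the patch): the shape of a conforming
// F1/F2 pair for the template below, with hypothetical names; each __m128i
// carries two 8-byte cipher blocks:
//
//   inline void MyCipher64_Enc_Block(__m128i &block,
//       const word32 *subkeys, unsigned int rounds);
//   inline void MyCipher64_Enc_2_Blocks(__m128i &block0, __m128i &block1,
//       const word32 *subkeys, unsigned int rounds);
//
//   // dispatch (template arguments are deduced):
//   // AdvancedProcessBlocks64_2x1_SSE(MyCipher64_Enc_Block,
//   //     MyCipher64_Enc_2_Blocks, subkeys, rounds,
//   //     inBlocks, xorBlocks, outBlocks, length, flags);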
-template -inline size_t AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2, - MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - - const size_t blockSize = 8; - const size_t xmmBlockSize = 16; - - size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize; - size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0; - size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize; - - // Clang and Coverity are generating findings using xorBlocks as a flag. - const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); - const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); - - if (flags & BT_ReverseDirection) - { - inBlocks = PtrAdd(inBlocks, length - xmmBlockSize); - xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize); - outBlocks = PtrAdd(outBlocks, length - xmmBlockSize); - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BT_AllowParallel) - { - double temp[2]; - while (length >= 2*xmmBlockSize) - { - __m128i block0, block1; - if (flags & BT_InBlockIsCounter) - { - // Increment of 1 and 2 in big-endian compatible with the ctr byte array. - const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0); - const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0); - - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the XMM word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. - std::memcpy(temp, inBlocks, blockSize); - block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp))); - - // After initial increment of {0,1} remaining counters increment by {2,2}. - block1 = _mm_add_epi32(s_two, block0); - - // Store the next counter. When BT_InBlockIsCounter is set then - // inBlocks is backed by m_counterArray which is non-const. - _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1))); - std::memcpy(const_cast(inBlocks), temp, blockSize); - } - else - { - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func2(block0, block1, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 2*xmmBlockSize; - } - } - - if (length) - { - // Adjust to real block size - if (flags & BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? 
blockSize : 0; - inBlocks = PtrSub(inBlocks, inIncrement); - xorBlocks = PtrSub(xorBlocks, xorIncrement); - outBlocks = PtrSub(outBlocks, outIncrement); - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? blockSize : 0; - } - - while (length >= blockSize) - { - double temp[2]; - std::memcpy(temp, inBlocks, blockSize); - __m128i block = _mm_castpd_si128(_mm_load_sd(temp)); - - if (xorInput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp))); - } - - if (flags & BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func1(block, subKeys, static_cast(rounds)); - - if (xorOutput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp))); - } - - _mm_store_sd(temp, _mm_castsi128_pd(block)); - std::memcpy(outBlocks, temp, blockSize); - - inBlocks = PtrAdd(inBlocks, inIncrement); - outBlocks = PtrAdd(outBlocks, outIncrement); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - length -= blockSize; - } - } - - return length; -} - -/// \brief AdvancedProcessBlocks for 2 and 6 blocks -/// \tparam F2 function to process 2 64-bit blocks -/// \tparam F6 function to process 6 64-bit blocks -/// \tparam W word type of the subkey table -/// \details AdvancedProcessBlocks64_6x2_SSE processes 6 and 2 SSE SIMD words -/// at a time. For a single block the template uses F2 with a zero block. -/// \details The subkey type is usually word32 or word64. F2 and F6 must use the -/// same word type. -template -inline size_t AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, - MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - - const size_t blockSize = 8; - const size_t xmmBlockSize = 16; - - size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize; - size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0; - size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize; - - // Clang and Coverity are generating findings using xorBlocks as a flag. - const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); - const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); - - if (flags & BT_ReverseDirection) - { - inBlocks = PtrAdd(inBlocks, length - xmmBlockSize); - xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize); - outBlocks = PtrAdd(outBlocks, length - xmmBlockSize); - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BT_AllowParallel) - { - double temp[2]; - while (length >= 6*xmmBlockSize) - { - __m128i block0, block1, block2, block3, block4, block5; - if (flags & BT_InBlockIsCounter) - { - // Increment of 1 and 2 in big-endian compatible with the ctr byte array. - const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0); - const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0); - - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the XMM word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. 
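// Aside (editor's note, not part of the patch): the constants above encode
// byte-wise big-endian increments. Each 8-byte counter sits in one 64-bit
// lane with its bytes in big-endian order, so the counter's least
// significant byte is the top byte of the lane's high 32-bit element, and
// adding 1<<24 there bumps the counter by one, assuming the low byte does
// not carry. The memcpy/_mm_loaddup_pd pair below then splats the 8-byte
// counter into both lanes:
//
//   double tmp[2];
//   std::memcpy(tmp, inBlocks, 8);                        // 8-byte CTR block
//   __m128i two = _mm_castpd_si128(_mm_loaddup_pd(tmp));  // {ctr, ctr}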
- std::memcpy(temp, inBlocks, blockSize); - block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp))); - - // After initial increment of {0,1} remaining counters increment by {2,2}. - block1 = _mm_add_epi32(s_two, block0); - block2 = _mm_add_epi32(s_two, block1); - block3 = _mm_add_epi32(s_two, block2); - block4 = _mm_add_epi32(s_two, block3); - block5 = _mm_add_epi32(s_two, block4); - - // Store the next counter. When BT_InBlockIsCounter is set then - // inBlocks is backed by m_counterArray which is non-const. - _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi32(s_two, block5))); - std::memcpy(const_cast(inBlocks), temp, blockSize); - } - else - { - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block2); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block3); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block4); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block5); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 6*xmmBlockSize; - } - - while 
(length >= 2*xmmBlockSize) - { - __m128i block0, block1; - if (flags & BT_InBlockIsCounter) - { - // Increment of 1 and 2 in big-endian compatible with the ctr byte array. - const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0); - const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0); - - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the XMM word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. - std::memcpy(temp, inBlocks, blockSize); - block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp))); - - // After initial increment of {0,1} remaining counters increment by {2,2}. - block1 = _mm_add_epi32(s_two, block0); - - // Store the next counter. When BT_InBlockIsCounter is set then - // inBlocks is backed by m_counterArray which is non-const. - _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1))); - std::memcpy(const_cast(inBlocks), temp, blockSize); - } - else - { - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func2(block0, block1, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 2*xmmBlockSize; - } - } - - if (length) - { - // Adjust to real block size - if (flags & BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? blockSize : 0; - inBlocks = PtrSub(inBlocks, inIncrement); - xorBlocks = PtrSub(xorBlocks, xorIncrement); - outBlocks = PtrSub(outBlocks, outIncrement); - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? 
blockSize : 0; - } - - while (length >= blockSize) - { - double temp[2]; - __m128i block, zero = _mm_setzero_si128(); - std::memcpy(temp, inBlocks, blockSize); - block = _mm_castpd_si128(_mm_load_sd(temp)); - - if (xorInput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, - _mm_castpd_si128(_mm_load_sd(temp))); - } - - if (flags & BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func2(block, zero, subKeys, static_cast(rounds)); - - if (xorOutput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, - _mm_castpd_si128(_mm_load_sd(temp))); - } - - _mm_store_sd(temp, _mm_castsi128_pd(block)); - std::memcpy(outBlocks, temp, blockSize); - - inBlocks = PtrAdd(inBlocks, inIncrement); - outBlocks = PtrAdd(outBlocks, outIncrement); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - length -= blockSize; - } - } - - return length; -} - /// \brief AdvancedProcessBlocks for 2 and 6 blocks /// \tparam F2 function to process 2 128-bit blocks /// \tparam F6 function to process 6 128-bit blocks @@ -1602,179 +948,6 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, return length; } -/// \brief AdvancedProcessBlocks for 1 and 4 blocks -/// \tparam F1 function to process 1 64-bit block -/// \tparam F4 function to process 6 64-bit blocks -/// \tparam W word type of the subkey table -/// \details AdvancedProcessBlocks64_4x1_SSE processes 4 and 1 SSE SIMD words -/// at a time. -/// \details The subkey type is usually word32 or word64. F1 and F4 must use the -/// same word type. -template -inline size_t AdvancedProcessBlocks64_4x1_SSE(F1 func1, F4 func4, - MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - - const size_t blockSize = 8; - const size_t xmmBlockSize = 16; - - size_t inIncrement = (flags & (BT_InBlockIsCounter | BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize; - size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0; - size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize; - - // Clang and Coverity are generating findings using xorBlocks as a flag. - const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); - const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); - - if (flags & BT_ReverseDirection) - { - inBlocks = PtrAdd(inBlocks, length - xmmBlockSize); - xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize); - outBlocks = PtrAdd(outBlocks, length - xmmBlockSize); - inIncrement = 0 - inIncrement; - xorIncrement = 0 - xorIncrement; - outIncrement = 0 - outIncrement; - } - - if (flags & BT_AllowParallel) - { - while (length >= 4*xmmBlockSize) - { - __m128i block0, block1, block2, block3; - if (flags & BT_InBlockIsCounter) - { - // Increment of 1 and 2 in big-endian compatible with the ctr byte array. - const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0); - const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0); - double temp[2]; - - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the XMM word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. - std::memcpy(temp, inBlocks, blockSize); - block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp))); - - // After initial increment of {0,1} remaining counters increment by {2,2}. 
- block1 = _mm_add_epi32(s_two, block0); - block2 = _mm_add_epi32(s_two, block1); - block3 = _mm_add_epi32(s_two, block2); - - // Store the next counter. When BT_InBlockIsCounter is set then - // inBlocks is backed by m_counterArray which is non-const. - _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block3))); - std::memcpy(const_cast(inBlocks), temp, blockSize); - } - else - { - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func4(block0, block1, block2, block3, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block2); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block3); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 4*xmmBlockSize; - } - } - - if (length) - { - // Adjust to real block size - if (flags & BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? blockSize : 0; - inBlocks = PtrSub(inBlocks, inIncrement); - xorBlocks = PtrSub(xorBlocks, xorIncrement); - outBlocks = PtrSub(outBlocks, outIncrement); - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? 
blockSize : 0; - } - - while (length >= blockSize) - { - double temp[2]; - std::memcpy(temp, inBlocks, blockSize); - __m128i block = _mm_castpd_si128(_mm_load_sd(temp)); - - if (xorInput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp))); - } - - if (flags & BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func1(block, subKeys, static_cast(rounds)); - - if (xorOutput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp))); - } - - _mm_store_sd(temp, _mm_castsi128_pd(block)); - std::memcpy(outBlocks, temp, blockSize); - - inBlocks = PtrAdd(inBlocks, inIncrement); - outBlocks = PtrAdd(outBlocks, outIncrement); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - length -= blockSize; - } - } - - return length; -} - NAMESPACE_END // CryptoPP #endif // CRYPTOPP_SSSE3_AVAILABLE @@ -1785,277 +958,6 @@ NAMESPACE_END // CryptoPP NAMESPACE_BEGIN(CryptoPP) -/// \brief AdvancedProcessBlocks for 2 and 6 blocks -/// \tparam F2 function to process 2 128-bit blocks -/// \tparam F6 function to process 6 128-bit blocks -/// \tparam W word type of the subkey table -/// \details AdvancedProcessBlocks64_6x2_Altivec processes 6 and 2 Altivec SIMD words -/// at a time. For a single block the template uses F2 with a zero block. -/// \details The subkey type is usually word32 or word64. F2 and F6 must use the -/// same word type. -template -inline size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6, - const W *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - -#if (CRYPTOPP_LITTLE_ENDIAN) - enum {LowOffset=8, HighOffset=0}; - const uint32x4_p s_one = {1,0,0,0}; - const uint32x4_p s_two = {2,0,2,0}; -#else - enum {LowOffset=8, HighOffset=0}; - const uint32x4_p s_one = {0,0,0,1}; - const uint32x4_p s_two = {0,2,0,2}; -#endif - - const size_t blockSize = 8; - const size_t simdBlockSize = 16; - CRYPTOPP_ALIGN_DATA(16) uint8_t temp[16]; - - size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : simdBlockSize; - size_t xorIncrement = (xorBlocks != NULLPTR) ? simdBlockSize : 0; - size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : simdBlockSize; - - // Clang and Coverity are generating findings using xorBlocks as a flag. - const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); - const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); - - if (flags & BT_ReverseDirection) - { - inBlocks = PtrAdd(inBlocks, length - simdBlockSize); - xorBlocks = PtrAdd(xorBlocks, length - simdBlockSize); - outBlocks = PtrAdd(outBlocks, length - simdBlockSize); - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BT_AllowParallel) - { - while (length >= 6*simdBlockSize) - { - uint32x4_p block0, block1, block2, block3, block4, block5; - if (flags & BT_InBlockIsCounter) - { - // There is no easy way to load 8-bytes into a vector. It is - // even harder without POWER8 due to lack of 64-bit elements. - std::memcpy(temp+LowOffset, inBlocks, 8); - std::memcpy(temp+HighOffset, inBlocks, 8); - uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp); - - // For 64-bit block ciphers we need to load the CTR block, - // which is 8 bytes. 
After the dup load we have two counters - // in the Altivec word. Then we need to increment the low ctr - // by 0 and the high ctr by 1. - block0 = VecAdd(s_one, ctr); - - // After initial increment of {0,1} remaining counters - // increment by {2,2}. - block1 = VecAdd(s_two, block0); - block2 = VecAdd(s_two, block1); - block3 = VecAdd(s_two, block2); - block4 = VecAdd(s_two, block3); - block5 = VecAdd(s_two, block4); - - // Update the counter in the caller. - const_cast(inBlocks)[7] += 12; - } - else - { - block0 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block4 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block5 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = VecXor(block0, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VecXor(block1, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VecXor(block2, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VecXor(block3, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = VecXor(block4, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = VecXor(block5, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = VecXor(block0, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VecXor(block1, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VecXor(block2, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VecXor(block3, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = VecXor(block4, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = VecXor(block5, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - VecStoreBE(block0, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block1, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block2, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block3, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block4, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block5, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 6*simdBlockSize; - } - - while (length >= 2*simdBlockSize) - { - uint32x4_p block0, block1; - if (flags & BT_InBlockIsCounter) - { - // There is no easy way to load 8-bytes into a vector. It is - // even harder without POWER8 due to lack of 64-bit elements. - std::memcpy(temp+LowOffset, inBlocks, 8); - std::memcpy(temp+HighOffset, inBlocks, 8); - uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp); - - // For 64-bit block ciphers we need to load the CTR block, - // which is 8 bytes. After the dup load we have two counters - // in the Altivec word. Then we need to increment the low ctr - // by 0 and the high ctr by 1. - block0 = VecAdd(s_one, ctr); - - // After initial increment of {0,1} remaining counters - // increment by {2,2}. 
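// Aside (editor's note, not part of the patch): each Altivec word holds two
// 8-byte blocks, so the caller's byte-wise counter advances by two per SIMD
// word: "+= 12" above for the six-word loop, "+= 4" below for this two-word
// loop. Updating only byte [7] assumes the low counter byte does not wrap
// within a pass.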
- block1 = VecAdd(s_two, block0); - - // Update the counter in the caller. - const_cast(inBlocks)[7] += 4; - } - else - { - block0 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = VecXor(block0, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VecXor(block1, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func2(block0, block1, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = VecXor(block0, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VecXor(block1, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - VecStoreBE(block0, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block1, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 2*simdBlockSize; - } - } - - if (length) - { - // Adjust to real block size - if (flags & BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? blockSize : 0; - inBlocks = PtrSub(inBlocks, inIncrement); - xorBlocks = PtrSub(xorBlocks, xorIncrement); - outBlocks = PtrSub(outBlocks, outIncrement); - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? blockSize : 0; - } - - while (length >= blockSize) - { - uint32x4_p block, zero = {0}; - - // There is no easy way to load 8-bytes into a vector. It is - // even harder without POWER8 due to lack of 64-bit elements. - // The high 8 bytes are "don't care" but it if we don't - // initialize the block then it generates warnings. - std::memcpy(temp+LowOffset, inBlocks, 8); - std::memcpy(temp+HighOffset, inBlocks, 8); // don't care - block = (uint32x4_p)VecLoadBE(temp); - - if (xorInput) - { - std::memcpy(temp+LowOffset, xorBlocks, 8); - std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care - uint32x4_p x = (uint32x4_p)VecLoadBE(temp); - block = VecXor(block, x); - } - - // Update the counter in the caller. 
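// Aside (editor's note, not part of the patch): in this tail the remaining
// 8-byte block was splatted into both halves of one SIMD word above, func2
// then runs with a zero companion word, and only the low 8 bytes of the
// result are stored. The byte increment below advances the big-endian
// counter by one block, again assuming byte [7] does not wrap.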
- if (flags & BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func2(block, zero, subKeys, static_cast(rounds)); - - if (xorOutput) - { - std::memcpy(temp+LowOffset, xorBlocks, 8); - std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care - uint32x4_p x = (uint32x4_p)VecLoadBE(temp); - block = VecXor(block, x); - } - - VecStoreBE(block, temp); - std::memcpy(outBlocks, temp+LowOffset, 8); - - inBlocks = PtrAdd(inBlocks, inIncrement); - outBlocks = PtrAdd(outBlocks, outIncrement); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - length -= blockSize; - } - } - - return length; -} - /// \brief AdvancedProcessBlocks for 1 and 4 blocks /// \tparam F1 function to process 1 128-bit block /// \tparam F4 function to process 4 128-bit blocks diff --git a/cham.cpp b/cham.cpp index 620b7953..1a05721d 100644 --- a/cham.cpp +++ b/cham.cpp @@ -96,7 +96,7 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS # if (CRYPTOPP_SSSE3_AVAILABLE) extern size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); @@ -110,11 +110,11 @@ extern size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, siz extern size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); # endif // CRYPTOPP_SSSE3_AVAILABLE -#endif // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS std::string CHAM64::Base::AlgorithmProvider() const { -#if (CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS) +#if (CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS) # if defined(CRYPTOPP_SSSE3_AVAILABLE) if (HasSSSE3()) return "SSSE3"; @@ -336,31 +336,7 @@ void CHAM128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, oblock(m_x[0])(m_x[1])(m_x[2])(m_x[3]); } -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS -size_t CHAM64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, - byte *outBlocks, size_t length, word32 flags) const -{ -# if (CRYPTOPP_SSSE3_AVAILABLE) - if (HasSSSE3()) { - return CHAM64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, 80, - inBlocks, xorBlocks, outBlocks, length, flags); - } -# endif // CRYPTOPP_SSSE3_AVAILABLE - return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); -} - -size_t CHAM64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, - byte *outBlocks, size_t length, word32 flags) const -{ -# if (CRYPTOPP_SSSE3_AVAILABLE) - if (HasSSSE3()) { - return CHAM64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, 80, - inBlocks, xorBlocks, outBlocks, length, flags); - } -# endif // CRYPTOPP_SSSE3_AVAILABLE - return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); -} - +#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS size_t CHAM128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { @@ -386,6 +362,6 @@ size_t CHAM128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor # endif // CRYPTOPP_SSSE3_AVAILABLE return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } -#endif // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS NAMESPACE_END diff --git a/cham.h b/cham.h index 9408e407..30a84b4e 100644 --- 
a/cham.h +++ b/cham.h @@ -16,18 +16,15 @@ #include "algparam.h" #if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86) -# define CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS 1 +# define CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS 1 #endif // Yet another SunStudio/SunCC workaround. Failed self tests // in SSE code paths on i386 for SunStudio 12.3 and below. #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120) -# undef CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +# undef CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS #endif -// https://github.com/weidai11/cryptopp/issues/945 -#undef CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS - NAMESPACE_BEGIN(CryptoPP) /// \brief CHAM block cipher information @@ -92,10 +89,6 @@ public: { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; - -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS - size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; -#endif }; /// \brief Decryption transformation @@ -106,10 +99,6 @@ public: { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; - -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS - size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; -#endif }; /// \brief CHAM64 encryption @@ -156,7 +145,7 @@ public: public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif }; @@ -170,7 +159,7 @@ public: public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif }; diff --git a/cham_simd.cpp b/cham_simd.cpp index 42a76c6e..b848ba10 100644 --- a/cham_simd.cpp +++ b/cham_simd.cpp @@ -45,600 +45,6 @@ using CryptoPP::word32; ////////////////////////////////////////////////////////////////////////// -NAMESPACE_BEGIN(W16) // CHAM64, 16-bit word size - -template -inline __m128i RotateLeft16(const __m128i& val) -{ -#if defined(__XOP__) - return _mm_roti_epi16(val, R); -#else - return _mm_or_si128( - _mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R)); -#endif -} - -template -inline __m128i RotateRight16(const __m128i& val) -{ -#if defined(__XOP__) - return _mm_roti_epi16(val, 16-R); -#else - return _mm_or_si128( - _mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R)); -#endif -} - -template <> -inline __m128i RotateLeft16<8>(const __m128i& val) -{ -#if defined(__XOP__) - return _mm_roti_epi16(val, 8); -#else - const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1); - return _mm_shuffle_epi8(val, mask); -#endif -} - -template <> -inline __m128i RotateRight16<8>(const __m128i& val) -{ -#if defined(__XOP__) - return _mm_roti_epi16(val, 16-8); -#else - const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1); - return _mm_shuffle_epi8(val, mask); -#endif -} - -template -inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // Should not be instantiated - 
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b); - CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d); - CRYPTOPP_UNUSED(e); CRYPTOPP_UNUSED(f); - CRYPTOPP_UNUSED(g); CRYPTOPP_UNUSED(h); - CRYPTOPP_ASSERT(0); - return _mm_setzero_si128(); -} - -template <> -inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpacklo_epi16(a, b); - const __m128i r2 = _mm_unpacklo_epi16(c, d); - const __m128i r3 = _mm_unpacklo_epi16(e, f); - const __m128i r4 = _mm_unpacklo_epi16(g, h); - - const __m128i r5 = _mm_unpacklo_epi32(r1, r2); - const __m128i r6 = _mm_unpacklo_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpacklo_epi16(a, b); - const __m128i r2 = _mm_unpacklo_epi16(c, d); - const __m128i r3 = _mm_unpacklo_epi16(e, f); - const __m128i r4 = _mm_unpacklo_epi16(g, h); - - const __m128i r5 = _mm_unpacklo_epi32(r1, r2); - const __m128i r6 = _mm_unpacklo_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpacklo_epi16(a, b); - const __m128i r2 = _mm_unpacklo_epi16(c, d); - const __m128i r3 = _mm_unpacklo_epi16(e, f); - const __m128i r4 = _mm_unpacklo_epi16(g, h); - - const __m128i r5 = _mm_unpackhi_epi32(r1, r2); - const __m128i r6 = _mm_unpackhi_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. 
- const __m128i r1 = _mm_unpacklo_epi16(a, b); - const __m128i r2 = _mm_unpacklo_epi16(c, d); - const __m128i r3 = _mm_unpacklo_epi16(e, f); - const __m128i r4 = _mm_unpacklo_epi16(g, h); - - const __m128i r5 = _mm_unpackhi_epi32(r1, r2); - const __m128i r6 = _mm_unpackhi_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpackhi_epi16(a, b); - const __m128i r2 = _mm_unpackhi_epi16(c, d); - const __m128i r3 = _mm_unpackhi_epi16(e, f); - const __m128i r4 = _mm_unpackhi_epi16(g, h); - - const __m128i r5 = _mm_unpacklo_epi32(r1, r2); - const __m128i r6 = _mm_unpacklo_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpackhi_epi16(a, b); - const __m128i r2 = _mm_unpackhi_epi16(c, d); - const __m128i r3 = _mm_unpackhi_epi16(e, f); - const __m128i r4 = _mm_unpackhi_epi16(g, h); - - const __m128i r5 = _mm_unpacklo_epi32(r1, r2); - const __m128i r6 = _mm_unpacklo_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpackhi_epi16(a, b); - const __m128i r2 = _mm_unpackhi_epi16(c, d); - const __m128i r3 = _mm_unpackhi_epi16(e, f); - const __m128i r4 = _mm_unpackhi_epi16(g, h); - - const __m128i r5 = _mm_unpackhi_epi32(r1, r2); - const __m128i r6 = _mm_unpackhi_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. 
-    const __m128i r1 = _mm_unpackhi_epi16(a, b);
-    const __m128i r2 = _mm_unpackhi_epi16(c, d);
-    const __m128i r3 = _mm_unpackhi_epi16(e, f);
-    const __m128i r4 = _mm_unpackhi_epi16(g, h);
-
-    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
-    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
-    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
-        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
-}
-
-template <unsigned int IDX>
-inline __m128i UnpackXMM(const __m128i& v)
-{
-    // Should not be instantiated
-    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
-
-    return _mm_setzero_si128();
-}
-
-template <>
-inline __m128i UnpackXMM<0>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
-}
-
-template <>
-inline __m128i UnpackXMM<1>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
-}
-
-template <>
-inline __m128i UnpackXMM<2>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
-}
-
-template <>
-inline __m128i UnpackXMM<3>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
-}
-
-template <>
-inline __m128i UnpackXMM<4>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
-}
-
-template <>
-inline __m128i UnpackXMM<5>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
-}
-
-template <>
-inline __m128i UnpackXMM<6>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
-}
-
-template <>
-inline __m128i UnpackXMM<7>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
-}
-
-template <unsigned int IDX>
-inline __m128i UnpackXMM(const __m128i& a, const __m128i& b)
-{
-    const __m128i& z = _mm_setzero_si128();
-    return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
-}
-
-template <unsigned int IDX>
-inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
-    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
-{
-    return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
-}
-
-template <unsigned int IDX>
-inline __m128i RepackXMM(const __m128i& v)
-{
-    return UnpackXMM<IDX>(v);
-}
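For readers auditing the deleted transpose, the following standalone sketch (not part of the patch; all names are illustrative) models what the UnpackXMM specializations above compute: 16-bit lane IDX is gathered from each of the eight inputs, with the little-endian byte swap folded in, just as the _mm_shuffle_epi8 mask does.

#include <cstdint>
#include <cstddef>

// Scalar model of the deleted UnpackXMM<IDX>: gather 16-bit lane 'idx' of
// each of the eight inputs, then byte-swap each word (the little-endian
// fixup performed by _mm_shuffle_epi8 in the SSE code above).
static void unpack_lane16(const uint16_t in[8][8], unsigned int idx, uint16_t out[8])
{
    for (size_t r = 0; r < 8; ++r) {
        const uint16_t w = in[r][idx];
        out[r] = static_cast<uint16_t>((w << 8) | (w >> 8));  // bswap16
    }
}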
-
-inline void CHAM64_Enc_Block(__m128i &block0,
-    const word16 *subkeys, unsigned int /*rounds*/)
-{
-    // Rearrange the data for vectorization. UnpackXMM includes a
-    // little-endian swap for SSE. Thanks to Peter Cordes for help
-    // with packing and unpacking.
-    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
-    __m128i a = UnpackXMM<0>(block0);
-    __m128i b = UnpackXMM<1>(block0);
-    __m128i c = UnpackXMM<2>(block0);
-    __m128i d = UnpackXMM<3>(block0);
-    __m128i e = UnpackXMM<4>(block0);
-    __m128i f = UnpackXMM<5>(block0);
-    __m128i g = UnpackXMM<6>(block0);
-    __m128i h = UnpackXMM<7>(block0);
-
-    const unsigned int rounds = 80;
-    __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
-    __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
-
-    const unsigned int MASK = 15;
-    for (int i=0; i < static_cast<int>(rounds); i+=4)
-    {
-        __m128i k, kr, t1, t2, t3, t4;
-        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
-
-        // Shuffle out key
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
-
-        t1 = _mm_xor_si128(a, counter);
-        t3 = _mm_xor_si128(e, counter);
-        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
-        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
-        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
-
-        t1 = _mm_xor_si128(b, counter);
-        t3 = _mm_xor_si128(f, counter);
-        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
-        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
-        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
-
-        t1 = _mm_xor_si128(c, counter);
-        t3 = _mm_xor_si128(g, counter);
-        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
-        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
-        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
-
-        t1 = _mm_xor_si128(d, counter);
-        t3 = _mm_xor_si128(h, counter);
-        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
-        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
-        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-    }
-
-    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
-    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
-}
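The four sub-rounds in the loop above implement the CHAM round function with alternating rotation amounts. As a cross-check of the vector arithmetic, here is a scalar sketch of one round (illustrative only, not library code):

#include <cstdint>

static inline uint16_t rol16(uint16_t v, unsigned int r)
{
    return static_cast<uint16_t>((v << r) | (v >> (16 - r)));
}

// Round i (even): x0' = ROL8((x0 ^ i) + (ROL1(x1) ^ rk))
// Round i (odd):  x0' = ROL1((x0 ^ i) + (ROL8(x1) ^ rk))
static uint16_t cham64_round(uint16_t x0, uint16_t x1, uint16_t rk, uint16_t ctr, bool even)
{
    return even ? rol16(static_cast<uint16_t>((x0 ^ ctr) + (rol16(x1, 1) ^ rk)), 8)
                : rol16(static_cast<uint16_t>((x0 ^ ctr) + (rol16(x1, 8) ^ rk)), 1);
}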
-
-inline void CHAM64_Dec_Block(__m128i &block0,
-    const word16 *subkeys, unsigned int /*rounds*/)
-{
-    // Rearrange the data for vectorization. UnpackXMM includes a
-    // little-endian swap for SSE. Thanks to Peter Cordes for help
-    // with packing and unpacking.
-    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
-    __m128i a = UnpackXMM<0>(block0);
-    __m128i b = UnpackXMM<1>(block0);
-    __m128i c = UnpackXMM<2>(block0);
-    __m128i d = UnpackXMM<3>(block0);
-    __m128i e = UnpackXMM<4>(block0);
-    __m128i f = UnpackXMM<5>(block0);
-    __m128i g = UnpackXMM<6>(block0);
-    __m128i h = UnpackXMM<7>(block0);
-
-    const unsigned int rounds = 80;
-    __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
-    __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
-
-    const unsigned int MASK = 15;
-    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
-    {
-        __m128i k, kr, t1, t2, t3, t4;
-        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
-
-        // Shuffle out key
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
-
-        // Odd round
-        t1 = RotateRight16<1>(d);
-        t3 = RotateRight16<1>(h);
-        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
-        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
-
-        // Even round
-        t1 = RotateRight16<8>(c);
-        t3 = RotateRight16<8>(g);
-        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
-        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
-
-        // Odd round
-        t1 = RotateRight16<1>(b);
-        t3 = RotateRight16<1>(f);
-        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
-        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
-
-        // Even round
-        t1 = RotateRight16<8>(a);
-        t3 = RotateRight16<8>(e);
-        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
-        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-    }
-
-    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
-    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
-}
-
-inline void CHAM64_Enc_2_Blocks(__m128i &block0,
-    __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
-{
-    // Rearrange the data for vectorization. UnpackXMM includes a
-    // little-endian swap for SSE. Thanks to Peter Cordes for help
-    // with packing and unpacking.
-    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
-    __m128i a = UnpackXMM<0>(block0, block1);
-    __m128i b = UnpackXMM<1>(block0, block1);
-    __m128i c = UnpackXMM<2>(block0, block1);
-    __m128i d = UnpackXMM<3>(block0, block1);
-    __m128i e = UnpackXMM<4>(block0, block1);
-    __m128i f = UnpackXMM<5>(block0, block1);
-    __m128i g = UnpackXMM<6>(block0, block1);
-    __m128i h = UnpackXMM<7>(block0, block1);
-
-    const unsigned int rounds = 80;
-    __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
-    __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
-
-    const unsigned int MASK = 15;
-    for (int i=0; i < static_cast<int>(rounds); i+=4)
-    {
-        __m128i k, kr, t1, t2, t3, t4;
-        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
-
-        // Shuffle out key
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
-
-        t1 = _mm_xor_si128(a, counter);
-        t3 = _mm_xor_si128(e, counter);
-        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
-        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
-        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
-
-        t1 = _mm_xor_si128(b, counter);
-        t3 = _mm_xor_si128(f, counter);
-        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
-        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
-        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
-
-        t1 = _mm_xor_si128(c, counter);
-        t3 = _mm_xor_si128(g, counter);
-        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
-        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
-        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
-
-        t1 = _mm_xor_si128(d, counter);
-        t3 = _mm_xor_si128(h, counter);
-        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
-        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
-        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-    }
-
-    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
-    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
-    block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
-}
-
-inline void CHAM64_Dec_2_Blocks(__m128i &block0,
-    __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
-{
-    // Rearrange the data for vectorization. UnpackXMM includes a
-    // little-endian swap for SSE. Thanks to Peter Cordes for help
-    // with packing and unpacking.
-    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
-    __m128i a = UnpackXMM<0>(block0, block1);
-    __m128i b = UnpackXMM<1>(block0, block1);
-    __m128i c = UnpackXMM<2>(block0, block1);
-    __m128i d = UnpackXMM<3>(block0, block1);
-    __m128i e = UnpackXMM<4>(block0, block1);
-    __m128i f = UnpackXMM<5>(block0, block1);
-    __m128i g = UnpackXMM<6>(block0, block1);
-    __m128i h = UnpackXMM<7>(block0, block1);
-
-    const unsigned int rounds = 80;
-    __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
-    __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
-
-    const unsigned int MASK = 15;
-    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
-    {
-        __m128i k, kr, t1, t2, t3, t4;
-        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
-
-        // Shuffle out key
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
-
-        // Odd round
-        t1 = RotateRight16<1>(d);
-        t3 = RotateRight16<1>(h);
-        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
-        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
-
-        // Even round
-        t1 = RotateRight16<8>(c);
-        t3 = RotateRight16<8>(g);
-        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
-        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
-
-        // Odd round
-        t1 = RotateRight16<1>(b);
-        t3 = RotateRight16<1>(f);
-        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
-        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
-
-        // Even round
-        t1 = RotateRight16<8>(a);
-        t3 = RotateRight16<8>(e);
-        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
-        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-    }
-
-    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
-    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
-    block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
-}
-
-NAMESPACE_END // W16
-
-//////////////////////////////////////////////////////////////////////////
-
 NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size
 
 template <unsigned int R>
@@ -1054,20 +460,6 @@ ANONYMOUS_NAMESPACE_END
 NAMESPACE_BEGIN(CryptoPP)
 
 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
-size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
-    return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
-        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
-    return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
-        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
 size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
diff --git a/simeck.cpp b/simeck.cpp
index 982f5f4a..a1a5316e 100644
--- a/simeck.cpp
+++ b/simeck.cpp
@@ -33,16 +33,6 @@ ANONYMOUS_NAMESPACE_END
 
 NAMESPACE_BEGIN(CryptoPP)
 
-#if CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-# if (CRYPTOPP_SSSE3_AVAILABLE)
-extern size_t SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-
-extern size_t SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-# endif // CRYPTOPP_SSSE3_AVAILABLE
-#endif // CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-
 std::string SIMECK32::Base::AlgorithmProvider() const
 {
     return "C++";
@@ -104,10 +94,6 @@ void SIMECK32::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 
 std::string SIMECK64::Base::AlgorithmProvider() const
 {
-#if (CRYPTOPP_SSSE3_AVAILABLE)
-    if (HasSSSE3())
-        return "SSSE3";
-#endif
     return "C++";
 }
 
@@ -165,30 +151,4 @@ void SIMECK64::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
     oblock(m_t[0])(m_t[1]);
 }
 
-#if CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-size_t SIMECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
-    byte *outBlocks, size_t length, word32 flags) const
-{
-# if (CRYPTOPP_SSSE3_AVAILABLE)
-    if (HasSSSE3()) {
-        return SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
-            inBlocks, xorBlocks, outBlocks, length, flags);
-    }
-# endif // CRYPTOPP_SSSE3_AVAILABLE
-    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t SIMECK64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
-    byte *outBlocks, size_t length, word32 flags) const
-{
-# if (CRYPTOPP_SSSE3_AVAILABLE)
-    if (HasSSSE3()) {
-        return SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
-            inBlocks, xorBlocks, outBlocks, length, flags);
-    }
-# endif // CRYPTOPP_SSSE3_AVAILABLE
-    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
-}
-#endif // CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-
 NAMESPACE_END
diff --git a/simeck.h b/simeck.h
index 918400e2..398198d7 100644
--- a/simeck.h
+++ b/simeck.h
@@ -17,19 +17,6 @@
 #include "secblock.h"
 #include "algparam.h"
 
-#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
-# define CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS 1
-#endif
-
-// Yet another SunStudio/SunCC workaround. Failed self tests
-// in SSE code paths on i386 for SunStudio 12.3 and below.
-#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
-# undef CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-#endif
-
-// https://github.com/weidai11/cryptopp/issues/945
-#undef CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-
 NAMESPACE_BEGIN(CryptoPP)
 
 /// \brief SIMECK block cipher information
diff --git a/simeck_simd.cpp b/simeck_simd.cpp
deleted file mode 100644
index 03263b1d..00000000
--- a/simeck_simd.cpp
+++ /dev/null
@@ -1,342 +0,0 @@
-// simeck_simd.cpp - written and placed in the public domain by Gangqiang Yang and Jeffrey Walton.
-//
-// This source file uses intrinsics and built-ins to gain access to
-// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
-// source file is needed because additional CXXFLAGS are required to enable
-// the appropriate instructions sets in some build configurations.
-
-#include "pch.h"
-#include "config.h"
-
-#include "simeck.h"
-#include "misc.h"
-
-// Uncomment for benchmarking C++ against SSE or NEON.
-// Do so in both simon.cpp and simon_simd.cpp.
-// #undef CRYPTOPP_SSSE3_AVAILABLE
-// #undef CRYPTOPP_ARM_NEON_AVAILABLE
-
-#if (CRYPTOPP_SSSE3_AVAILABLE)
-# include "adv_simd.h"
-# include <pmmintrin.h>
-# include <tmmintrin.h>
-#endif
-
-#if defined(__XOP__)
-# include <ammintrin.h>
-# if defined(__GNUC__)
-#  include <x86intrin.h>
-# endif
-#endif
-
-// Squash MS LNK4221 and libtool warnings
-extern const char SIMECK_SIMD_FNAME[] = __FILE__;
-
-// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
-#define M128_CAST(x) ((__m128i *)(void *)(x))
-#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
-
-ANONYMOUS_NAMESPACE_BEGIN
-
-using CryptoPP::word16;
-using CryptoPP::word32;
-
-#if (CRYPTOPP_SSSE3_AVAILABLE)
-
-//////////////////////////////////////////////////////////////////////////
-
-template <unsigned int R>
-inline __m128i RotateLeft32(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, R);
-#else
-    return _mm_or_si128(
-        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
-#endif
-}
-
-template <unsigned int R>
-inline __m128i RotateRight32(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 32-R);
-#else
-    return _mm_or_si128(
-        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline __m128i RotateLeft32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 8);
-#else
-    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
-    return _mm_shuffle_epi8(val, mask);
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline __m128i RotateRight32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 32-8);
-#else
-    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
-    return _mm_shuffle_epi8(val, mask);
-#endif
-}
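The two specializations above replace the generic two-shifts-and-an-OR rotate with a single byte shuffle. A self-contained sketch of that equivalence (illustrative, not part of the patch):

#include <tmmintrin.h>  // _mm_shuffle_epi8 (SSSE3); pulls in SSE2 headers

// Rotate each 32-bit lane left by 8 via a byte permutation: within every
// little-endian 4-byte lane, result byte k is source byte (k + 3) % 4,
// which is exactly the mask used above.
static __m128i rotl8_by_shuffle(__m128i v)
{
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(v, mask);
}

// Reference implementation: two shifts and an OR per lane.
static __m128i rotl8_by_shifts(__m128i v)
{
    return _mm_or_si128(_mm_slli_epi32(v, 8), _mm_srli_epi32(v, 24));
}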
-
-/// \brief Unpack XMM words
-/// \tparam IDX the element from each XMM word
-/// \param a the first XMM word
-/// \param b the second XMM word
-/// \param c the third XMM word
-/// \param d the fourth XMM word
-/// \details UnpackXMM selects the IDX element from a, b, c, d and returns a concatenation
-/// equivalent to a[IDX] || b[IDX] || c[IDX] || d[IDX].
-template <unsigned int IDX>
-inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    // Should not be instantiated
-    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
-    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
-    CRYPTOPP_ASSERT(0);
-    return _mm_setzero_si128();
-}
-
-template <>
-inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    const __m128i r1 = _mm_unpacklo_epi32(a, b);
-    const __m128i r2 = _mm_unpacklo_epi32(c, d);
-    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
-        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
-}
-
-template <>
-inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    const __m128i r1 = _mm_unpacklo_epi32(a, b);
-    const __m128i r2 = _mm_unpacklo_epi32(c, d);
-    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
-        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
-}
-
-template <>
-inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    const __m128i r1 = _mm_unpackhi_epi32(a, b);
-    const __m128i r2 = _mm_unpackhi_epi32(c, d);
-    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
-        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
-}
-
-template <>
-inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    const __m128i r1 = _mm_unpackhi_epi32(a, b);
-    const __m128i r2 = _mm_unpackhi_epi32(c, d);
-    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
-        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
-}
-
-/// \brief Unpack a XMM word
-/// \tparam IDX the element from each XMM word
-/// \param v the first XMM word
-/// \details UnpackXMM selects the IDX element from v and returns a concatenation
-/// equivalent to v[IDX] || v[IDX] || v[IDX] || v[IDX].
-template <unsigned int IDX>
-inline __m128i UnpackXMM(const __m128i& v)
-{
-    // Should not be instantiated
-    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
-    return _mm_setzero_si128();
-}
-
-template <>
-inline __m128i UnpackXMM<0>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
-}
-
-template <>
-inline __m128i UnpackXMM<1>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
-}
-
-template <>
-inline __m128i UnpackXMM<2>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
-}
-
-template <>
-inline __m128i UnpackXMM<3>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
-}
-
-template <unsigned int IDX>
-inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    return UnpackXMM<IDX>(a, b, c, d);
-}
-
-template <unsigned int IDX>
-inline __m128i RepackXMM(const __m128i& v)
-{
-    return UnpackXMM<IDX>(v);
-}
-
-inline void SIMECK64_Encrypt(__m128i &a, __m128i &b, __m128i &c, __m128i &d, const __m128i key)
-{
-    // SunStudio 12.3 workaround
-    __m128i s, t; s = a; t = c;
-    a = _mm_xor_si128(_mm_and_si128(a, RotateLeft32<5>(a)), RotateLeft32<1>(a));
-    c = _mm_xor_si128(_mm_and_si128(c, RotateLeft32<5>(c)), RotateLeft32<1>(c));
-    a = _mm_xor_si128(a, _mm_xor_si128(b, key));
-    c = _mm_xor_si128(c, _mm_xor_si128(d, key));
-    b = s; d = t;
-}
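Each call to SIMECK64_Encrypt above advances two interleaved lane groups by one round. In scalar form the round is as follows (a sketch with illustrative names, mirroring the vector code above):

#include <cstdint>

static inline uint32_t rol32(uint32_t v, unsigned int r)
{
    return (v << r) | (v >> (32 - r));
}

// One SIMECK-64 round on the state (x, y) with round key k:
//   x' = (x & ROL5(x)) ^ ROL1(x) ^ y ^ k,  y' = x
static void simeck64_round(uint32_t &x, uint32_t &y, uint32_t k)
{
    const uint32_t t = x;
    x = (x & rol32(x, 5)) ^ rol32(x, 1) ^ y ^ k;
    y = t;
}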
-
-inline void SIMECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned int /*rounds*/)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
-    __m128i a = UnpackXMM<0>(block0);
-    __m128i b = UnpackXMM<1>(block0);
-    __m128i c = UnpackXMM<2>(block0);
-    __m128i d = UnpackXMM<3>(block0);
-
-    const unsigned int rounds = 44;
-    for (int i = 0; i < static_cast<int>(rounds); i += 4)
-    {
-        const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
-    }
-
-    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
-    block0 = RepackXMM<0>(a,b,c,d);
-}
-
-inline void SIMECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned int /*rounds*/)
-{
-    // SIMECK requires a word swap for the decryption transform
-    __m128i w = _mm_shuffle_epi32(block0, _MM_SHUFFLE(2, 3, 0, 1));
-
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
-    __m128i a = UnpackXMM<0>(w);
-    __m128i b = UnpackXMM<1>(w);
-    __m128i c = UnpackXMM<2>(w);
-    __m128i d = UnpackXMM<3>(w);
-
-    const unsigned int rounds = 44;
-    for (int i = static_cast<int>(rounds)-1; i >= 0; i -= 4)
-    {
-        const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i - 3));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
-    }
-
-    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
-    w = RepackXMM<0>(a,b,c,d);
-
-    block0 = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
-}
-
-inline void SIMECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1,
-    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
-    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
-    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
-    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
-    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
-
-    const unsigned int rounds = 44;
-    for (int i = 0; i < static_cast<int>(rounds); i += 4)
-    {
-        const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
-    }
-
-    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
-    block0 = RepackXMM<0>(a, b, c, d);
-    block1 = RepackXMM<1>(a, b, c, d);
-    block2 = RepackXMM<2>(a, b, c, d);
-    block3 = RepackXMM<3>(a, b, c, d);
-}
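A note on the _MM_SHUFFLE(2, 3, 0, 1) shuffles in the decryption paths above: SIMECK decryption is the encryption round applied with the two 32-bit halves of each 64-bit block exchanged, so the code swaps halves once before and once after the rounds. A minimal illustration:

#include <emmintrin.h>  // _mm_shuffle_epi32 (SSE2)

// Exchange the two 32-bit halves of each 64-bit block held in an XMM word:
// lanes (0,1,2,3) -> (1,0,3,2), i.e. _MM_SHUFFLE(2, 3, 0, 1).
static __m128i swap_block_halves(__m128i v)
{
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1));
}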
-
-inline void SIMECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1,
-    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
-{
-    // SIMECK requires a word swap for the decryption transform
-    __m128i w = _mm_shuffle_epi32(block0, _MM_SHUFFLE(2, 3, 0, 1));
-    __m128i x = _mm_shuffle_epi32(block1, _MM_SHUFFLE(2, 3, 0, 1));
-    __m128i y = _mm_shuffle_epi32(block2, _MM_SHUFFLE(2, 3, 0, 1));
-    __m128i z = _mm_shuffle_epi32(block3, _MM_SHUFFLE(2, 3, 0, 1));
-
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
-    __m128i a = UnpackXMM<0>(w, x, y, z);
-    __m128i b = UnpackXMM<1>(w, x, y, z);
-    __m128i c = UnpackXMM<2>(w, x, y, z);
-    __m128i d = UnpackXMM<3>(w, x, y, z);
-
-    const unsigned int rounds = 44;
-    for (int i = static_cast<int>(rounds)-1; i >= 0; i -= 4)
-    {
-        const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i - 3));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
-    }
-
-    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
-    w = RepackXMM<0>(a, b, c, d);
-    x = RepackXMM<1>(a, b, c, d);
-    y = RepackXMM<2>(a, b, c, d);
-    z = RepackXMM<3>(a, b, c, d);
-
-    block0 = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
-    block1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
-    block2 = _mm_shuffle_epi32(y, _MM_SHUFFLE(2, 3, 0, 1));
-    block3 = _mm_shuffle_epi32(z, _MM_SHUFFLE(2, 3, 0, 1));
-}
-
-#endif // CRYPTOPP_SSSE3_AVAILABLE
-
-ANONYMOUS_NAMESPACE_END
-
-NAMESPACE_BEGIN(CryptoPP)
-
-#if defined(CRYPTOPP_SSSE3_AVAILABLE)
-size_t SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
-    return AdvancedProcessBlocks64_4x1_SSE(SIMECK64_Enc_Block, SIMECK64_Enc_4_Blocks,
-        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
-    return AdvancedProcessBlocks64_4x1_SSE(SIMECK64_Dec_Block, SIMECK64_Dec_4_Blocks,
-        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-#endif // CRYPTOPP_SSSE3_AVAILABLE
-
-NAMESPACE_END
diff --git a/simon.cpp b/simon.cpp
index f00ba3a0..a0eacea4 100644
--- a/simon.cpp
+++ b/simon.cpp
@@ -196,14 +196,6 @@ ANONYMOUS_NAMESPACE_END
 
 NAMESPACE_BEGIN(CryptoPP)
 
-#if (CRYPTOPP_ARM_NEON_AVAILABLE)
-extern size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-
-extern size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-#endif
-
 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
 extern size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -212,14 +204,6 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, siz
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 #endif
 
-#if (CRYPTOPP_SSE41_AVAILABLE)
-extern size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-
-extern size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-#endif
-
 #if (CRYPTOPP_SSSE3_AVAILABLE)
 extern size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -228,14 +212,6 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 #endif
 
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-extern size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-
-extern size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-#endif
-
 #if (CRYPTOPP_ALTIVEC_AVAILABLE)
 extern size_t SIMON128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -246,39 +222,11 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys,
 
 std::string SIMON64::Base::AlgorithmProvider() const
 {
-#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
-# if (CRYPTOPP_SSE41_AVAILABLE)
-    if (HasSSE41())
-        return "SSE4.1";
-# endif
-# if (CRYPTOPP_ARM_NEON_AVAILABLE)
-    if (HasNEON())
-        return "NEON";
-# endif
-# if (CRYPTOPP_ALTIVEC_AVAILABLE)
-    if (HasAltivec())
-        return "Altivec";
-# endif
-#endif
     return "C++";
 }
 
 unsigned int SIMON64::Base::OptimalDataAlignment() const
 {
-#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
-# if (CRYPTOPP_SSE41_AVAILABLE)
-    if (HasSSE41())
-        return 16;  // load __m128i
-# endif
-# if (CRYPTOPP_ARM_NEON_AVAILABLE)
-    if (HasNEON())
-        return 4;  // load uint32x4_t
-# endif
-# if (CRYPTOPP_ALTIVEC_AVAILABLE)
-    if (HasAltivec())
-        return 16;  // load uint32x4_p
-# endif
-#endif
     return GetAlignmentOf<word32>();
 }
@@ -311,29 +259,6 @@ void SIMON64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
     default:
         CRYPTOPP_ASSERT(0);
     }
-
-#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
-
-    // Pre-splat the round keys for Altivec forward transformation
-#if CRYPTOPP_ALTIVEC_AVAILABLE
-    if (IsForwardTransformation() && HasAltivec())
-    {
-        AlignedSecBlock presplat(m_rkeys.size()*4);
-        for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
-            presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
-        m_rkeys.swap(presplat);
-    }
-#endif
-#endif  // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
 typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
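The deleted pre-splat step in UncheckedSetKey above traded key-schedule memory for speed: each 32-bit round key was stored four times so the forward SIMD paths could fetch it with one aligned 128-bit load rather than a per-round scalar broadcast. A hedged sketch of the idea (illustrative types, not library code):

#include <cstdint>
#include <cstddef>
#include <vector>

// Replicate each round key into a 4-lane group so vector code can read
// key i with a single aligned 128-bit load instead of splatting a scalar.
static std::vector<uint32_t> presplat_keys(const std::vector<uint32_t> &rk)
{
    std::vector<uint32_t> out(rk.size() * 4);
    for (size_t i = 0, j = 0; i < rk.size(); ++i, j += 4)
        out[j+0] = out[j+1] = out[j+2] = out[j+3] = rk[i];
    return out;
}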
diff --git a/simon64_simd.cpp b/simon64_simd.cpp
deleted file mode 100644
index 0702d782..00000000
--- a/simon64_simd.cpp
+++ /dev/null
@@ -1,864 +0,0 @@
-// simon_simd.cpp - written and placed in the public domain by Jeffrey Walton
-//
-// This source file uses intrinsics and built-ins to gain access to
-// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
-// source file is needed because additional CXXFLAGS are required to enable
-// the appropriate instructions sets in some build configurations.
-
-#include "pch.h"
-#include "config.h"
-
-#include "simon.h"
-#include "misc.h"
-
-// Uncomment for benchmarking C++ against SSE or NEON.
-// Do so in both simon.cpp and simon_simd.cpp.
-// #undef CRYPTOPP_SSE41_AVAILABLE
-// #undef CRYPTOPP_ARM_NEON_AVAILABLE
-
-#if (CRYPTOPP_SSE41_AVAILABLE)
-# include "adv_simd.h"
-# include <emmintrin.h>
-# include <smmintrin.h>
-# include <tmmintrin.h>
-#endif
-
-#if defined(__XOP__)
-# include <ammintrin.h>
-# if defined(__GNUC__)
-#  include <x86intrin.h>
-# endif
-#endif
-
-#if (CRYPTOPP_ARM_NEON_HEADER)
-# include "adv_simd.h"
-# include <arm_neon.h>
-#endif
-
-#if (CRYPTOPP_ARM_ACLE_HEADER)
-# include <stdint.h>
-# include <arm_acle.h>
-#endif
-
-#if defined(_M_ARM64)
-# include "adv_simd.h"
-#endif
-
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-# include "adv_simd.h"
-# include "ppc_simd.h"
-#endif
-
-// Squash MS LNK4221 and libtool warnings
-extern const char SIMON64_SIMD_FNAME[] = __FILE__;
-
-ANONYMOUS_NAMESPACE_BEGIN
-
-using CryptoPP::byte;
-using CryptoPP::word32;
-using CryptoPP::word64;
-using CryptoPP::vec_swap;  // SunCC
-
-// *************************** ARM NEON ************************** //
-
-#if (CRYPTOPP_ARM_NEON_AVAILABLE)
-
-template <class T>
-inline T UnpackHigh32(const T& a, const T& b)
-{
-    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
-    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
-    const uint32x2x2_t r = vzip_u32(x, y);
-    return (T)vcombine_u32(r.val[0], r.val[1]);
-}
-
-template <class T>
-inline T UnpackLow32(const T& a, const T& b)
-{
-    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
-    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
-    const uint32x2x2_t r = vzip_u32(x, y);
-    return (T)vcombine_u32(r.val[0], r.val[1]);
-}
-
-template <unsigned int R>
-inline uint32x4_t RotateLeft32(const uint32x4_t& val)
-{
-    const uint32x4_t a(vshlq_n_u32(val, R));
-    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
-    return vorrq_u32(a, b);
-}
-
-template <unsigned int R>
-inline uint32x4_t RotateRight32(const uint32x4_t& val)
-{
-    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
-    const uint32x4_t b(vshrq_n_u32(val, R));
-    return vorrq_u32(a, b);
-}
-
-#if defined(__aarch32__) || defined(__aarch64__)
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
-{
-    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
-    const uint8x16_t mask = vld1q_u8(maskb);
-
-    return vreinterpretq_u32_u8(
-        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
-{
-    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
-    const uint8x16_t mask = vld1q_u8(maskb);
-
-    return vreinterpretq_u32_u8(
-        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
-}
-#endif
-
-inline uint32x4_t SIMON64_f(const uint32x4_t& val)
-{
-    return veorq_u32(RotateLeft32<2>(val),
-        vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
-}
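SIMON64_f above is the SIMON mixing function f(x) = ROL2(x) XOR (ROL1(x) AND ROL8(x)), and the loops below apply it in two-round pairs. A scalar sketch (illustrative names):

#include <cstdint>

static inline uint32_t rol32(uint32_t v, unsigned int r)
{
    return (v << r) | (v >> (32 - r));
}

static inline uint32_t simon64_f(uint32_t x)
{
    return (rol32(x, 1) & rol32(x, 8)) ^ rol32(x, 2);
}

// One two-round SIMON-64 step, matching the vector loop bodies below:
//   y ^= f(x) ^ rk1;  x ^= f(y) ^ rk2;
static void simon64_round_pair(uint32_t &x, uint32_t &y, uint32_t rk1, uint32_t rk2)
{
    y ^= simon64_f(x) ^ rk1;
    x ^= simon64_f(y) ^ rk2;
}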
-
-inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-
-    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
-    {
-        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
-
-        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
-        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
-    }
-
-    if (rounds & 1)
-    {
-        const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);
-
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
-        std::swap(x1, y1);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = UnpackLow32(y1, x1);
-    block1 = UnpackHigh32(y1, x1);
-}
-
-inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-
-    if (rounds & 1)
-    {
-        std::swap(x1, y1);
-        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
-
-        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
-        rounds--;
-    }
-
-    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
-    {
-        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
-        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
-
-        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = UnpackLow32(y1, x1);
-    block1 = UnpackHigh32(y1, x1);
-}
-
-inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
-    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
-
-    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
-    {
-        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
-        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
-        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);
-
-        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
-        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
-        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
-        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
-    }
-
-    if (rounds & 1)
-    {
-        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
-
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
-        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
-        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
-        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = UnpackLow32(y1, x1);
-    block1 = UnpackHigh32(y1, x1);
-    block2 = UnpackLow32(y2, x2);
-    block3 = UnpackHigh32(y2, x2);
-    block4 = UnpackLow32(y3, x3);
-    block5 = UnpackHigh32(y3, x3);
-}
-
-inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
-    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
-
-    if (rounds & 1)
-    {
-        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
-        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
-
-        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
-        y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
-        y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
-        rounds--;
-    }
-
-    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
-    {
-        const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
-        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
-        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
-        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);
-
-        const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
-        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
-        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = UnpackLow32(y1, x1);
-    block1 = UnpackHigh32(y1, x1);
-    block2 = UnpackLow32(y2, x2);
-    block3 = UnpackHigh32(y2, x2);
-    block4 = UnpackLow32(y3, x3);
-    block5 = UnpackHigh32(y3, x3);
-}
-
-#endif // CRYPTOPP_ARM_NEON_AVAILABLE
-
-// ***************************** IA-32 ***************************** //
-
-#if (CRYPTOPP_SSE41_AVAILABLE)
-
-// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
-#ifndef M128_CAST
-# define M128_CAST(x) ((__m128i *)(void *)(x))
-#endif
-#ifndef CONST_M128_CAST
-# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
-#endif
-
-inline void Swap128(__m128i& a,__m128i& b)
-{
-#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
-    // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
-    // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
-    vec_swap(a, b);
-#else
-    std::swap(a, b);
-#endif
-}
-
-template <unsigned int R>
-inline __m128i RotateLeft32(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, R);
-#else
-    return _mm_or_si128(
-        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
-#endif
-}
-
-template <unsigned int R>
-inline __m128i RotateRight32(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 32-R);
-#else
-    return _mm_or_si128(
-        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-__m128i RotateLeft32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 8);
-#else
-    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
-    return _mm_shuffle_epi8(val, mask);
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-__m128i RotateRight32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 32-8);
-#else
-    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
-    return _mm_shuffle_epi8(val, mask);
-#endif
-}
-
-inline __m128i SIMON64_f(const __m128i& v)
-{
-    return _mm_xor_si128(RotateLeft32<2>(v),
-        _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
-}
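The SSE4.1 routines below deinterleave two blocks into x (odd 32-bit lanes) and y (even lanes) using float shuffles, since _mm_shuffle_ps can gather lanes from two different source registers where the integer unpacks cannot. A minimal illustration of that step (names are mine):

#include <emmintrin.h>
#include <xmmintrin.h>

// Deinterleave two XMM words into odd lanes (x) and even lanes (y):
// x = [a1 a3 b1 b3], y = [a0 a2 b0 b2].
static void deinterleave_odd_even(__m128i a, __m128i b, __m128i &x, __m128i &y)
{
    const __m128 fa = _mm_castsi128_ps(a);
    const __m128 fb = _mm_castsi128_ps(b);
    x = _mm_castps_si128(_mm_shuffle_ps(fa, fb, _MM_SHUFFLE(3, 1, 3, 1)));
    y = _mm_castps_si128(_mm_shuffle_ps(fa, fb, _MM_SHUFFLE(2, 0, 2, 0)));
}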
-
-inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const __m128 t0 = _mm_castsi128_ps(block0);
-    const __m128 t1 = _mm_castsi128_ps(block1);
-    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
-    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
-    {
-        // Round keys are pre-splated in forward direction
-        const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
-
-        const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*4));
-        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
-    }
-
-    if (rounds & 1)
-    {
-        // Round keys are pre-splated in forward direction
-        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*4));
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
-        Swap128(x1, y1);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = _mm_unpacklo_epi32(y1, x1);
-    block1 = _mm_unpackhi_epi32(y1, x1);
-}
-
-inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const __m128 t0 = _mm_castsi128_ps(block0);
-    const __m128 t1 = _mm_castsi128_ps(block1);
-    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
-    if (rounds & 1)
-    {
-        Swap128(x1, y1);
-        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
-        rounds--;
-    }
-
-    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
-    {
-        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
-        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
-
-        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = _mm_unpacklo_epi32(y1, x1);
-    block1 = _mm_unpackhi_epi32(y1, x1);
-}
-
-inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
-    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const __m128 t0 = _mm_castsi128_ps(block0);
-    const __m128 t1 = _mm_castsi128_ps(block1);
-    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
-    const __m128 t2 = _mm_castsi128_ps(block2);
-    const __m128 t3 = _mm_castsi128_ps(block3);
-    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
-
-    const __m128 t4 = _mm_castsi128_ps(block4);
-    const __m128 t5 = _mm_castsi128_ps(block5);
-    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
-
-    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
-    {
-        // Round keys are pre-splated in forward direction
-        const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
-        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
-        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);
-
-        const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*4));
-        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
-        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
-        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
-    }
-
-    if (rounds & 1)
-    {
-        // Round keys are pre-splated in forward direction
-        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*4));
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
-        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
-        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
-        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = _mm_unpacklo_epi32(y1, x1);
-    block1 = _mm_unpackhi_epi32(y1, x1);
-    block2 = _mm_unpacklo_epi32(y2, x2);
-    block3 = _mm_unpackhi_epi32(y2, x2);
-    block4 = _mm_unpacklo_epi32(y3, x3);
-    block5 = _mm_unpackhi_epi32(y3, x3);
-}
-
-inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
-    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const __m128 t0 = _mm_castsi128_ps(block0);
-    const __m128 t1 = _mm_castsi128_ps(block1);
-    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
-    const __m128 t2 = _mm_castsi128_ps(block2);
-    const __m128 t3 = _mm_castsi128_ps(block3);
-    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
-
-    const __m128 t4 = _mm_castsi128_ps(block4);
-    const __m128 t5 = _mm_castsi128_ps(block5);
-    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
-
-    if (rounds & 1)
-    {
-        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
-        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
-        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
-        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
-        rounds--;
-    }
-
-    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
-    {
-        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
-        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
-        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
-        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);
-
-        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
-        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
-        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = _mm_unpacklo_epi32(y1, x1);
-    block1 = _mm_unpackhi_epi32(y1, x1);
-    block2 = _mm_unpacklo_epi32(y2, x2);
-    block3 = _mm_unpackhi_epi32(y2, x2);
-    block4 = _mm_unpacklo_epi32(y3, x3);
-    block5 = _mm_unpackhi_epi32(y3, x3);
-}
-
-#endif // CRYPTOPP_SSE41_AVAILABLE
-
-// ***************************** Altivec ***************************** //
-
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-
-using CryptoPP::uint8x16_p;
-using CryptoPP::uint32x4_p;
-
-using CryptoPP::VecAnd;
-using CryptoPP::VecXor;
-using CryptoPP::VecLoad;
-using CryptoPP::VecLoadAligned;
-using CryptoPP::VecPermute;
-
-// Rotate left by bit count
-template <unsigned int C>
-inline uint32x4_p RotateLeft32(const uint32x4_p val)
-{
-    const uint32x4_p m = {C, C, C, C};
-    return vec_rl(val, m);
-}
-
-// Rotate right by bit count
-template <unsigned int C>
-inline uint32x4_p RotateRight32(const uint32x4_p val)
-{
-    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
-    return vec_rl(val, m);
-}
-
-inline uint32x4_p SIMON64_f(const uint32x4_p val)
-{
-    return VecXor(RotateLeft32<2>(val),
-        VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
-}
-
-inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
-    const word32 *subkeys, unsigned int rounds)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
-    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
-    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
-#else
- uint32x4_p x1 = VecPermute(block0, block1, m1); - uint32x4_p y1 = VecPermute(block0, block1, m2); - - for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) - { - // Round keys are pre-splated in forward direction - const uint32x4_p rk1 = VecLoadAligned(subkeys+i*4); - const uint32x4_p rk2 = VecLoadAligned(subkeys+(i+1)*4); - - y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1); - x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2); - } - - if (rounds & 1) - { - // Round keys are pre-splated in forward direction - const uint32x4_p rk = VecLoadAligned(subkeys+(rounds-1)*4); - - y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk); - std::swap(x1, y1); - } - -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4}; - const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12}; -#else - const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20}; - const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28}; -#endif - - // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] - block0 = (uint32x4_p)VecPermute(x1, y1, m3); - block1 = (uint32x4_p)VecPermute(x1, y1, m4); -} - -inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, - const word32 *subkeys, unsigned int rounds) -{ -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; - const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24}; -#else - const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24}; - const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; -#endif - - // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... - uint32x4_p x1 = VecPermute(block0, block1, m1); - uint32x4_p y1 = VecPermute(block0, block1, m2); - - if (rounds & 1) - { - std::swap(x1, y1); -#if defined(_ARCH_PWR7) - const uint32x4_p rk = vec_splats(subkeys[rounds-1]); -#else - const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk = VecLoad(subkeys+rounds-1); - rk = VecPermute(rk, rk, m); -#endif - y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1)); - rounds--; - } - - for (int i = static_cast(rounds-2); i >= 0; i -= 2) - { -#if defined(_ARCH_PWR7) - const uint32x4_p rk1 = vec_splats(subkeys[i+1]); - const uint32x4_p rk2 = vec_splats(subkeys[i]); -#else - const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk1 = VecLoad(subkeys+i+1); - uint32x4_p rk2 = VecLoad(subkeys+i); - rk1 = VecPermute(rk1, rk1, m); - rk2 = VecPermute(rk2, rk2, m); -#endif - x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1); - y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2); - } - -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4}; - const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12}; -#else - const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20}; - const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28}; -#endif - - // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] - block0 = (uint32x4_p)VecPermute(x1, y1, m3); - block1 = (uint32x4_p)VecPermute(x1, y1, m4); -} - -inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, - uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4, - uint32x4_p &block5, const word32 *subkeys, unsigned int rounds) -{ -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; - const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24}; -#else - const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 
27,26,25,24}; - const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; -#endif - - // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1); - uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2); - uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1); - uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2); - uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1); - uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2); - - for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) - { - // Round keys are pre-splated in forward direction - const uint32x4_p rk1 = VecLoadAligned(subkeys+i*4); - const uint32x4_p rk2 = VecLoadAligned(subkeys+(i+1)*4); - - y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1); - y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1); - y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1); - - x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2); - x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2); - x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2); - } - - if (rounds & 1) - { - // Round keys are pre-splated in forward direction - const uint32x4_p rk = VecLoadAligned(subkeys+(rounds-1)*4); - - y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk); - y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk); - y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk); - std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); - } - -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4}; - const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12}; -#else - const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20}; - const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28}; -#endif - - // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... - block0 = (uint32x4_p)VecPermute(x1, y1, m3); - block1 = (uint32x4_p)VecPermute(x1, y1, m4); - block2 = (uint32x4_p)VecPermute(x2, y2, m3); - block3 = (uint32x4_p)VecPermute(x2, y2, m4); - block4 = (uint32x4_p)VecPermute(x3, y3, m3); - block5 = (uint32x4_p)VecPermute(x3, y3, m4); -} - -inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, - uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4, - uint32x4_p &block5, const word32 *subkeys, unsigned int rounds) -{ -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; - const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24}; -#else - const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24}; - const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; -#endif - - // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
- uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
- uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
- uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
- uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
- uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
- uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
-
- if (rounds & 1)
- {
- std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
-#if defined(_ARCH_PWR7)
- const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
-#else
- const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
- uint32x4_p rk = VecLoad(subkeys+rounds-1);
- rk = VecPermute(rk, rk, m);
-#endif
- y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
- y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
- y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
- rounds--;
- }
-
- for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
- {
-#if defined(_ARCH_PWR7)
- const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
- const uint32x4_p rk2 = vec_splats(subkeys[i]);
-#else
- const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
- uint32x4_p rk1 = VecLoad(subkeys+i+1);
- uint32x4_p rk2 = VecLoad(subkeys+i);
- rk1 = VecPermute(rk1, rk1, m);
- rk2 = VecPermute(rk2, rk2, m);
-#endif
- x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
- x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
- x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);
-
- y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
- y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
- y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
- }
-
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
- const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
-#else
- const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
- const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
-#endif
-
- // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
- block0 = (uint32x4_p)VecPermute(x1, y1, m3); - block1 = (uint32x4_p)VecPermute(x1, y1, m4); - block2 = (uint32x4_p)VecPermute(x2, y2, m3); - block3 = (uint32x4_p)VecPermute(x2, y2, m4); - block4 = (uint32x4_p)VecPermute(x3, y3, m3); - block5 = (uint32x4_p)VecPermute(x3, y3, m4); -} - -#endif // CRYPTOPP_ALTIVEC_AVAILABLE - -ANONYMOUS_NAMESPACE_END - -/////////////////////////////////////////////////////////////////////// - -NAMESPACE_BEGIN(CryptoPP) - -// *************************** ARM NEON **************************** // - -#if (CRYPTOPP_ARM_NEON_AVAILABLE) -size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} - -size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} -#endif // CRYPTOPP_ARM_NEON_AVAILABLE - -// ***************************** IA-32 ***************************** // - -#if (CRYPTOPP_SSE41_AVAILABLE) -size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} - -size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} -#endif - -// ***************************** Altivec ***************************** // - -#if (CRYPTOPP_ALTIVEC_AVAILABLE) -size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} - -size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} -#endif - -NAMESPACE_END diff --git a/speck.cpp b/speck.cpp index 879dfc16..00b960d7 100644 --- a/speck.cpp +++ b/speck.cpp @@ -171,12 +171,6 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) #if (CRYPTOPP_ARM_NEON_AVAILABLE) -extern size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); - -extern size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); - extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t 
rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -200,14 +194,6 @@ extern size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 #endif

-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-extern size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-
-extern size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-#endif
-
 #if (CRYPTOPP_ALTIVEC_AVAILABLE)
 extern size_t SPECK128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -218,39 +204,11 @@ extern size_t SPECK128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys,

 std::string SPECK64::Base::AlgorithmProvider() const
 {
-#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
-# if (CRYPTOPP_SSE41_AVAILABLE)
- if (HasSSE41())
- return "SSE4.1";
-# endif
-# if (CRYPTOPP_ARM_NEON_AVAILABLE)
- if (HasNEON())
- return "NEON";
-# endif
-# if (CRYPTOPP_ALTIVEC_AVAILABLE)
- if (HasAltivec())
- return "Altivec";
-# endif
-#endif
 return "C++";
 }

 unsigned int SPECK64::Base::OptimalDataAlignment() const
 {
-#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
-# if (CRYPTOPP_SSE41_AVAILABLE)
- if (HasSSE41())
- return 16; // load __m128i
-# endif
-# if (CRYPTOPP_ARM_NEON_AVAILABLE)
- if (HasNEON())
- return 4; // load uint32x4_t
-# endif
-# if (CRYPTOPP_ALTIVEC_AVAILABLE)
- if (HasAltivec())
- return 16; // load uint32x4_p
-# endif
-#endif
 return GetAlignmentOf<word32>();
 }

@@ -283,29 +241,6 @@ void SPECK64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
 default:
 CRYPTOPP_ASSERT(0);
 }
-
-#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
-
- // Pre-splat the round keys for Altivec forward transformation
-#if CRYPTOPP_ALTIVEC_AVAILABLE
- if (IsForwardTransformation() && HasAltivec())
- {
- AlignedSecBlock presplat(m_rkeys.size()*4);
- for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
- presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
- m_rkeys.swap(presplat);
- }
-#endif
-#endif // CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
 typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
diff --git a/speck64_simd.cpp b/speck64_simd.cpp
deleted file mode 100644
index 204c0dca..00000000
--- a/speck64_simd.cpp
+++ /dev/null
@@ -1,781 +0,0 @@
-// speck64_simd.cpp - written and placed in the public domain by Jeffrey Walton
-//
-// This source file uses intrinsics and built-ins to gain access to
-// SSE4.1, ARM NEON and ARMv8a, and Altivec instructions. A separate
-// source file is needed because additional CXXFLAGS are required to enable
-// the appropriate instruction sets in some build configurations.
-
-#include "pch.h"
-#include "config.h"
-
-#include "speck.h"
-#include "misc.h"
-
-// Uncomment for benchmarking C++ against SSE or NEON.
-// Do so in both speck.cpp and speck64_simd.cpp.
-// #undef CRYPTOPP_SSE41_AVAILABLE
-// #undef CRYPTOPP_ARM_NEON_AVAILABLE
-
-#if (CRYPTOPP_SSE41_AVAILABLE)
-# include "adv_simd.h"
-# include <emmintrin.h>
-# include <tmmintrin.h>
-# include <smmintrin.h>
-#endif
-
-#if defined(__XOP__)
-# include <ammintrin.h>
-# if defined(__GNUC__)
-# include <x86intrin.h>
-# endif
-#endif
-
-#if (CRYPTOPP_ARM_NEON_HEADER)
-# include "adv_simd.h"
-# include <arm_neon.h>
-#endif
-
-#if (CRYPTOPP_ARM_ACLE_HEADER)
-# include <stdint.h>
-# include <arm_acle.h>
-#endif
-
-#if defined(_M_ARM64)
-# include "adv_simd.h"
-#endif
-
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-# include "adv_simd.h"
-# include "ppc_simd.h"
-#endif
-
-// Squash MS LNK4221 and libtool warnings
-extern const char SPECK64_SIMD_FNAME[] = __FILE__;
-
-ANONYMOUS_NAMESPACE_BEGIN
-
-using CryptoPP::byte;
-using CryptoPP::word32;
-using CryptoPP::word64;
-
-// *************************** ARM NEON ************************** //
-
-#if (CRYPTOPP_ARM_NEON_AVAILABLE)
-
-template <class T>
-inline T UnpackHigh32(const T& a, const T& b)
-{
- const uint32x2_t x(vget_high_u32((uint32x4_t)a));
- const uint32x2_t y(vget_high_u32((uint32x4_t)b));
- const uint32x2x2_t r = vzip_u32(x, y);
- return (T)vcombine_u32(r.val[0], r.val[1]);
-}
-
-template <class T>
-inline T UnpackLow32(const T& a, const T& b)
-{
- const uint32x2_t x(vget_low_u32((uint32x4_t)a));
- const uint32x2_t y(vget_low_u32((uint32x4_t)b));
- const uint32x2x2_t r = vzip_u32(x, y);
- return (T)vcombine_u32(r.val[0], r.val[1]);
-}
-
-template <unsigned int R>
-inline uint32x4_t RotateLeft32(const uint32x4_t& val)
-{
- const uint32x4_t a(vshlq_n_u32(val, R));
- const uint32x4_t b(vshrq_n_u32(val, 32 - R));
- return vorrq_u32(a, b);
-}
-
-template <unsigned int R>
-inline uint32x4_t RotateRight32(const uint32x4_t& val)
-{
- const uint32x4_t a(vshlq_n_u32(val, 32 - R));
- const uint32x4_t b(vshrq_n_u32(val, R));
- return vorrq_u32(a, b);
-}
-
-#if defined(__aarch32__) || defined(__aarch64__)
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
-{
- const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
- const uint8x16_t mask = vld1q_u8(maskb);
-
- return vreinterpretq_u32_u8(
- vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
-{
- const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
- const uint8x16_t mask = vld1q_u8(maskb);
-
- return vreinterpretq_u32_u8(
- vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
-}
-#endif // Aarch32 or Aarch64
-
-inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
- uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
-
- x1 = RotateRight32<8>(x1);
- x1 = vaddq_u32(x1, y1);
- x1 = veorq_u32(x1, rk);
- y1 = RotateLeft32<3>(y1);
- y1 = veorq_u32(y1, x1);
- }
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = UnpackLow32(y1, x1);
- block1 = UnpackHigh32(y1, x1);
-}
-
-inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
- uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
- const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
-
- y1 = veorq_u32(y1, x1);
- y1 = RotateRight32<3>(y1);
- x1 = veorq_u32(x1, rk);
- x1 = vsubq_u32(x1, y1);
- x1 = RotateLeft32<8>(x1);
- }
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = UnpackLow32(y1, x1);
- block1 = UnpackHigh32(y1, x1);
-}
-
-inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
- uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
- uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
- uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
- uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
- uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
- uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
-
- x1 = RotateRight32<8>(x1);
- x2 = RotateRight32<8>(x2);
- x3 = RotateRight32<8>(x3);
- x1 = vaddq_u32(x1, y1);
- x2 = vaddq_u32(x2, y2);
- x3 = vaddq_u32(x3, y3);
- x1 = veorq_u32(x1, rk);
- x2 = veorq_u32(x2, rk);
- x3 = veorq_u32(x3, rk);
- y1 = RotateLeft32<3>(y1);
- y2 = RotateLeft32<3>(y2);
- y3 = RotateLeft32<3>(y3);
- y1 = veorq_u32(y1, x1);
- y2 = veorq_u32(y2, x2);
- y3 = veorq_u32(y3, x3);
- }
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = UnpackLow32(y1, x1);
- block1 = UnpackHigh32(y1, x1);
- block2 = UnpackLow32(y2, x2);
- block3 = UnpackHigh32(y2, x2);
- block4 = UnpackLow32(y3, x3);
- block5 = UnpackHigh32(y3, x3);
-}
-
-inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
- uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
- uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
- uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
- uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
- uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
- uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
- const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
-
- y1 = veorq_u32(y1, x1);
- y2 = veorq_u32(y2, x2);
- y3 = veorq_u32(y3, x3);
- y1 = RotateRight32<3>(y1);
- y2 = RotateRight32<3>(y2);
- y3 = RotateRight32<3>(y3);
- x1 = veorq_u32(x1, rk);
- x2 = veorq_u32(x2, rk);
- x3 = veorq_u32(x3, rk);
- x1 = vsubq_u32(x1, y1);
- x2 = vsubq_u32(x2, y2);
- x3 = vsubq_u32(x3, y3);
- x1 = RotateLeft32<8>(x1);
- x2 = RotateLeft32<8>(x2);
- x3 = RotateLeft32<8>(x3);
- }
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = UnpackLow32(y1, x1);
- block1 = UnpackHigh32(y1, x1);
- block2 = UnpackLow32(y2, x2);
- block3 = UnpackHigh32(y2, x2);
- block4 = UnpackLow32(y3, x3);
- block5 = UnpackHigh32(y3, x3);
-}
-
-#endif // CRYPTOPP_ARM_NEON_AVAILABLE
-
-// ***************************** IA-32 ***************************** //
-
-#if (CRYPTOPP_SSE41_AVAILABLE)
-
-#ifndef M128_CAST
-# define M128_CAST(x) ((__m128i *)(void *)(x))
-#endif
-#ifndef CONST_M128_CAST
-# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
-#endif
-
-template <unsigned int R>
-inline __m128i RotateLeft32(const __m128i& val)
-{
-#if defined(__XOP__)
- return _mm_roti_epi32(val, R);
-#else
- return _mm_or_si128(
- _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
-#endif
-}
-
-template <unsigned int R>
-inline __m128i RotateRight32(const __m128i& val)
-{
-#if defined(__XOP__)
- return _mm_roti_epi32(val, 32-R);
-#else
- return _mm_or_si128(
- _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-__m128i RotateLeft32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
- return _mm_roti_epi32(val, 8);
-#else
- const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
- return _mm_shuffle_epi8(val, mask);
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-__m128i RotateRight32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
- return _mm_roti_epi32(val, 32-8);
-#else
- const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
- return _mm_shuffle_epi8(val, mask);
-#endif
-}
-
-inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- const __m128 t0 = _mm_castsi128_ps(block0);
- const __m128 t1 = _mm_castsi128_ps(block1);
- __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
- __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- // Round keys are pre-splated in forward direction
- const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
-
- x1 = RotateRight32<8>(x1);
- x1 = _mm_add_epi32(x1, y1);
- x1 = _mm_xor_si128(x1, rk);
- y1 = RotateLeft32<3>(y1);
- y1 = _mm_xor_si128(y1, x1);
- }
-
- // This is roughly the SSE equivalent to ARM vzp32
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = _mm_unpacklo_epi32(y1, x1);
- block1 = _mm_unpackhi_epi32(y1, x1);
-}
-
-inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- const __m128 t0 = _mm_castsi128_ps(block0);
- const __m128 t1 = _mm_castsi128_ps(block1);
- __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
- __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
- const __m128i rk = _mm_set1_epi32(subkeys[i]);
-
- y1 = _mm_xor_si128(y1, x1);
- y1 = RotateRight32<3>(y1);
- x1 = _mm_xor_si128(x1, rk);
- x1 = _mm_sub_epi32(x1, y1);
- x1 = RotateLeft32<8>(x1);
- }
-
- // This is roughly the SSE equivalent to ARM vzp32
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = _mm_unpacklo_epi32(y1, x1);
- block1 = _mm_unpackhi_epi32(y1, x1);
-}
-
-inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
- __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- const __m128 t0 = _mm_castsi128_ps(block0);
- const __m128 t1 = _mm_castsi128_ps(block1);
- __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
- __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
- const __m128 t2 = _mm_castsi128_ps(block2);
- const __m128 t3 = _mm_castsi128_ps(block3);
- __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
- __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
-
- const __m128 t4 = _mm_castsi128_ps(block4);
- const __m128 t5 = _mm_castsi128_ps(block5);
- __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
- __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- // Round keys are pre-splated in forward direction
- const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
-
- x1 = RotateRight32<8>(x1);
- x2 = RotateRight32<8>(x2);
- x3 = RotateRight32<8>(x3);
- x1 = _mm_add_epi32(x1, y1);
- x2 = _mm_add_epi32(x2, y2);
- x3 = _mm_add_epi32(x3, y3);
- x1 = _mm_xor_si128(x1, rk);
- x2 = _mm_xor_si128(x2, rk);
- x3 = _mm_xor_si128(x3, rk);
- y1 = RotateLeft32<3>(y1);
- y2 = RotateLeft32<3>(y2);
- y3 = RotateLeft32<3>(y3);
- y1 = _mm_xor_si128(y1, x1);
- y2 = _mm_xor_si128(y2, x2);
- y3 = _mm_xor_si128(y3, x3);
- }
-
- // This is roughly the SSE equivalent to ARM vzp32
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = _mm_unpacklo_epi32(y1, x1);
- block1 = _mm_unpackhi_epi32(y1, x1);
- block2 = _mm_unpacklo_epi32(y2, x2);
- block3 = _mm_unpackhi_epi32(y2, x2);
- block4 = _mm_unpacklo_epi32(y3, x3);
- block5 = _mm_unpackhi_epi32(y3, x3);
-}
-
-inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
- __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- const __m128 t0 = _mm_castsi128_ps(block0);
- const __m128 t1 = _mm_castsi128_ps(block1);
- __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
- __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
- const __m128 t2 = _mm_castsi128_ps(block2);
- const __m128 t3 = _mm_castsi128_ps(block3);
- __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
- __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
-
- const __m128 t4 = _mm_castsi128_ps(block4);
- const __m128 t5 = _mm_castsi128_ps(block5);
- __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
- __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
- const __m128i rk = _mm_set1_epi32(subkeys[i]);
-
- y1 = _mm_xor_si128(y1, x1);
- y2 = _mm_xor_si128(y2, x2);
- y3 = _mm_xor_si128(y3, x3);
- y1 = RotateRight32<3>(y1);
- y2 = RotateRight32<3>(y2);
- y3 = RotateRight32<3>(y3);
- x1 = _mm_xor_si128(x1, rk);
- x2 = _mm_xor_si128(x2, rk);
- x3 = _mm_xor_si128(x3, rk);
- x1 = _mm_sub_epi32(x1, y1);
- x2 = _mm_sub_epi32(x2, y2);
- x3 = _mm_sub_epi32(x3, y3);
- x1 = RotateLeft32<8>(x1);
- x2 = RotateLeft32<8>(x2);
- x3 = RotateLeft32<8>(x3);
- }
-
- // This is roughly the SSE equivalent to ARM vzp32
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = _mm_unpacklo_epi32(y1, x1);
- block1 = _mm_unpackhi_epi32(y1, x1);
- block2 = _mm_unpacklo_epi32(y2, x2);
- block3 = _mm_unpackhi_epi32(y2, x2);
- block4 = _mm_unpacklo_epi32(y3, x3);
- block5 = _mm_unpackhi_epi32(y3, x3);
-}
-
-#endif // CRYPTOPP_SSE41_AVAILABLE
-
-// ***************************** Altivec ***************************** //
-
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-using CryptoPP::uint8x16_p;
-using CryptoPP::uint32x4_p;
-
-using CryptoPP::VecAdd;
-using CryptoPP::VecSub;
-using CryptoPP::VecXor;
-using CryptoPP::VecLoad;
-using CryptoPP::VecLoadAligned;
-using CryptoPP::VecPermute;
-
-// Rotate left by bit count
-template <unsigned int C>
-inline uint32x4_p RotateLeft32(const uint32x4_p val)
-{
- const uint32x4_p m = {C, C, C, C};
- return vec_rl(val, m);
-}
-
-// Rotate right by bit count
-template <unsigned int C>
-inline uint32x4_p RotateRight32(const uint32x4_p val)
-{
- const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
- return vec_rl(val, m);
-}
-
-void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
- const word32 *subkeys, unsigned int rounds)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
- const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
-#else
- const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
- const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
-#endif
-
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_p x1 = VecPermute(block0, block1, m1);
- uint32x4_p y1 = VecPermute(block0, block1, m2);
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- // Round keys are pre-splated in forward direction
- const uint32x4_p rk = VecLoadAligned(subkeys+i*4);
-
- x1 = RotateRight32<8>(x1);
- x1 = VecAdd(x1, y1);
- x1 = VecXor(x1, rk);
-
- y1 = RotateLeft32<3>(y1);
- y1 = VecXor(y1, x1);
- }
-
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
- const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
-#else
- const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
- const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
-#endif
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = (uint32x4_p)VecPermute(x1, y1, m3);
- block1 = (uint32x4_p)VecPermute(x1, y1, m4);
-}
-
-void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
- const word32 *subkeys, unsigned int rounds)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
- const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
-#else
- const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
- const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
-#endif
-
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_p x1 = VecPermute(block0, block1, m1);
- uint32x4_p y1 = VecPermute(block0, block1, m2);
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
-#if defined(_ARCH_PWR7)
- const uint32x4_p rk = vec_splats(subkeys[i]);
-#else
- // subkeys has extra elements so memory backs the last subkey
- const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
- uint32x4_p rk = VecLoad(subkeys+i);
- rk = VecPermute(rk, rk, m);
-#endif
-
- y1 = VecXor(y1, x1);
- y1 = RotateRight32<3>(y1);
-
- x1 = VecXor(x1, rk);
- x1 = VecSub(x1, y1);
- x1 = RotateLeft32<8>(x1);
- }
-
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
- const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
-#else
- const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
- const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
-#endif
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = (uint32x4_p)VecPermute(x1, y1, m3);
- block1 = (uint32x4_p)VecPermute(x1, y1, m4);
-}
-
-void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
- uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
- uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
- const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
-#else
- const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
- const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
-#endif
-
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
- uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
- uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
- uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
- uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
- uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- // Round keys are pre-splated in forward direction
- const uint32x4_p rk = VecLoadAligned(subkeys+i*4);
-
- x1 = RotateRight32<8>(x1);
- x2 = RotateRight32<8>(x2);
- x3 = RotateRight32<8>(x3);
-
- x1 = VecAdd(x1, y1);
- x2 = VecAdd(x2, y2);
- x3 = VecAdd(x3, y3);
-
- x1 = VecXor(x1, rk);
- x2 = VecXor(x2, rk);
- x3 = VecXor(x3, rk);
-
- y1 = RotateLeft32<3>(y1);
- y2 = RotateLeft32<3>(y2);
- y3 = RotateLeft32<3>(y3);
-
- y1 = VecXor(y1, x1);
- y2 = VecXor(y2, x2);
- y3 = VecXor(y3, x3);
- }
-
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
- const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
-#else
- const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
- const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
-#endif
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = (uint32x4_p)VecPermute(x1, y1, m3);
- block1 = (uint32x4_p)VecPermute(x1, y1, m4);
- block2 = (uint32x4_p)VecPermute(x2, y2, m3);
- block3 = (uint32x4_p)VecPermute(x2, y2, m4);
- block4 = (uint32x4_p)VecPermute(x3, y3, m3);
- block5 = (uint32x4_p)VecPermute(x3, y3, m4);
-}
-
-void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
- uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
- uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
- const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
-#else
- const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
- const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
-#endif
-
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
- uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
- uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
- uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
- uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
- uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
-#if defined(_ARCH_PWR7)
- const uint32x4_p rk = vec_splats(subkeys[i]);
-#else
- // subkeys has extra elements so memory backs the last subkey
- const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
- uint32x4_p rk = VecLoad(subkeys+i);
- rk = VecPermute(rk, rk, m);
-#endif
-
- y1 = VecXor(y1, x1);
- y2 = VecXor(y2, x2);
- y3 = VecXor(y3, x3);
-
- y1 = RotateRight32<3>(y1);
- y2 = RotateRight32<3>(y2);
- y3 = RotateRight32<3>(y3);
-
- x1 = VecXor(x1, rk);
- x2 = VecXor(x2, rk);
- x3 = VecXor(x3, rk);
-
- x1 = VecSub(x1, y1);
- x2 = VecSub(x2, y2);
- x3 = VecSub(x3, y3);
-
- x1 = RotateLeft32<8>(x1);
- x2 = RotateLeft32<8>(x2);
- x3 = RotateLeft32<8>(x3);
- }
-
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
- const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
-#else
- const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
- const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
-#endif
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = (uint32x4_p)VecPermute(x1, y1, m3);
- block1 = (uint32x4_p)VecPermute(x1, y1, m4);
- block2 = (uint32x4_p)VecPermute(x2, y2, m3);
- block3 = (uint32x4_p)VecPermute(x2, y2, m4);
- block4 = (uint32x4_p)VecPermute(x3, y3, m3);
- block5 = (uint32x4_p)VecPermute(x3, y3, m4);
-}
-
-#endif // CRYPTOPP_ALTIVEC_AVAILABLE
-
-ANONYMOUS_NAMESPACE_END
-
-///////////////////////////////////////////////////////////////////////
-
-NAMESPACE_BEGIN(CryptoPP)
-
-// *************************** ARM NEON **************************** //
-
-#if (CRYPTOPP_ARM_NEON_AVAILABLE)
-size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-#endif
-
-// ***************************** IA-32 ***************************** //
-
-#if (CRYPTOPP_SSE41_AVAILABLE)
-size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-#endif
-
-// ***************************** Altivec ***************************** //
-
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-#endif
-
-NAMESPACE_END
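---

Note for readers following the removal: every SIMD path deleted above (SSE4.1, NEON, Altivec) vectorizes the same two scalar rounds, four cipher blocks per SIMD word. The sketch below restates those rounds in plain C++ so the deleted intrinsics are easier to audit. It is illustrative only and not part of the patch; RotL32/RotR32 are assumed helpers (the library itself has rotate utilities in misc.h), while the rotation constants and operation order are taken directly from the deleted SPECK64_Enc_Block/SPECK64_Dec_Block and SIMON64_f bodies.

    #include <cstdint>

    // Assumed helpers, not library functions.
    static inline uint32_t RotL32(uint32_t v, unsigned int r)
        { return (v << r) | (v >> (32 - r)); }
    static inline uint32_t RotR32(uint32_t v, unsigned int r)
        { return (v >> r) | (v << (32 - r)); }

    // One SPECK64 encryption round (cf. SPECK64_Enc_Block above):
    // x = RotR(x,8); x += y; x ^= rk; y = RotL(y,3); y ^= x
    static inline void SPECK64_EncRound(uint32_t& x, uint32_t& y, uint32_t rk)
    {
        x = RotR32(x, 8); x += y; x ^= rk;
        y = RotL32(y, 3); y ^= x;
    }

    // The inverse round (cf. SPECK64_Dec_Block above), undoing each step.
    static inline void SPECK64_DecRound(uint32_t& x, uint32_t& y, uint32_t rk)
    {
        y ^= x; y = RotR32(y, 3);
        x ^= rk; x -= y; x = RotL32(x, 8);
    }

    // SIMON64 round function (cf. SIMON64_f above):
    // f(x) = RotL(x,2) ^ (RotL(x,1) & RotL(x,8))
    static inline uint32_t SIMON64_F(uint32_t x)
    {
        return RotL32(x, 2) ^ (RotL32(x, 1) & RotL32(x, 8));
    }

The SIMD versions apply exactly these steps to uint32x4_t/__m128i/uint32x4_p lanes, which is why the one-block and six-block variants differ only in how many x/y pairs they carry through the loop.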
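A second note on the "Round keys are pre-splated in forward direction" comments that appear throughout the deleted code: for encryption, UncheckedSetKey expanded each 32-bit subkey into four consecutive words so the hot loops could fetch a ready-made {rk, rk, rk, rk} vector with one aligned load (VecLoadAligned or _mm_load_si128 at subkeys+i*4) instead of splatting a scalar every round. A minimal sketch of that transform, with std::vector standing in for the library's SecBlock (assumption, for self-containment):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Duplicate each round key four times so subkeys+i*4 backs a full
    // 128-bit vector for the forward (encrypt) SIMD loops.
    std::vector<uint32_t> PreSplat(const std::vector<uint32_t>& rkeys)
    {
        std::vector<uint32_t> presplat(rkeys.size() * 4);
        for (std::size_t i = 0, j = 0; i < rkeys.size(); i++, j += 4)
            presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = rkeys[i];
        return presplat;
    }

The decrypt paths walk the schedule backwards and were left un-splatted, which is why they splat per round instead (vec_splats or _mm_set1_epi32, with the VecLoad-plus-VecPermute fallback on pre-POWER7 targets).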