mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-23 01:49:41 +00:00
Remove 64-bit AdvancedProcessBlocks (GH #945)
This commit is contained in:
parent
84ab419029
commit
dd7598e638
@ -334,10 +334,8 @@ simple.cpp
|
||||
simple.h
|
||||
siphash.h
|
||||
simeck.cpp
|
||||
simeck_simd.cpp
|
||||
simeck.h
|
||||
simon.cpp
|
||||
simon64_simd.cpp
|
||||
simon128_simd.cpp
|
||||
simon.h
|
||||
skipjack.cpp
|
||||
@ -351,7 +349,6 @@ smartptr.h
|
||||
sosemanuk.cpp
|
||||
sosemanuk.h
|
||||
speck.cpp
|
||||
speck64_simd.cpp
|
||||
speck128_simd.cpp
|
||||
speck.h
|
||||
square.cpp
|
||||
|
24
GNUmakefile
24
GNUmakefile
@ -292,7 +292,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
CHAM_FLAG = $(SSSE3_FLAG)
|
||||
KECCAK_FLAG = $(SSSE3_FLAG)
|
||||
LEA_FLAG = $(SSSE3_FLAG)
|
||||
SIMECK_FLAG = $(SSSE3_FLAG)
|
||||
SIMON128_FLAG = $(SSSE3_FLAG)
|
||||
SPECK128_FLAG = $(SSSE3_FLAG)
|
||||
SUN_LDFLAGS += $(SSSE3_FLAG)
|
||||
@ -306,8 +305,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
ifeq ($(strip $(HAVE_OPT)),0)
|
||||
BLAKE2B_FLAG = $(SSE41_FLAG)
|
||||
BLAKE2S_FLAG = $(SSE41_FLAG)
|
||||
SIMON64_FLAG = $(SSE41_FLAG)
|
||||
SPECK64_FLAG = $(SSE41_FLAG)
|
||||
SUN_LDFLAGS += $(SSE41_FLAG)
|
||||
else
|
||||
SSE41_FLAG =
|
||||
@ -478,10 +475,7 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
CHAM_FLAG = -march=armv7-a -mfpu=neon
|
||||
LEA_FLAG = -march=armv7-a -mfpu=neon
|
||||
SHA_FLAG = -march=armv7-a -mfpu=neon
|
||||
SIMECK_FLAG = -march=armv7-a -mfpu=neon
|
||||
SIMON64_FLAG = -march=armv7-a -mfpu=neon
|
||||
SIMON128_FLAG = -march=armv7-a -mfpu=neon
|
||||
SPECK64_FLAG = -march=armv7-a -mfpu=neon
|
||||
SPECK128_FLAG = -march=armv7-a -mfpu=neon
|
||||
SM4_FLAG = -march=armv7-a -mfpu=neon
|
||||
else
|
||||
@ -521,10 +515,7 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
CHAM_FLAG = -march=armv8-a
|
||||
LEA_FLAG = -march=armv8-a
|
||||
NEON_FLAG = -march=armv8-a
|
||||
SIMECK_FLAG = -march=armv8-a
|
||||
SIMON64_FLAG = -march=armv8-a
|
||||
SIMON128_FLAG = -march=armv8-a
|
||||
SPECK64_FLAG = -march=armv8-a
|
||||
SPECK128_FLAG = -march=armv8-a
|
||||
SM4_FLAG = -march=armv8-a
|
||||
else
|
||||
@ -658,7 +649,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
LEA_FLAG = $(POWER8_FLAG)
|
||||
SHA_FLAG = $(POWER8_FLAG)
|
||||
SHACAL2_FLAG = $(POWER8_FLAG)
|
||||
SIMECK_FLAG = $(POWER8_FLAG)
|
||||
else
|
||||
POWER8_FLAG =
|
||||
endif
|
||||
@ -724,8 +714,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
ifneq ($(ALTIVEC_FLAG),)
|
||||
BLAKE2S_FLAG = $(ALTIVEC_FLAG)
|
||||
CHACHA_FLAG = $(ALTIVEC_FLAG)
|
||||
SIMON64_FLAG = $(ALTIVEC_FLAG)
|
||||
SPECK64_FLAG = $(ALTIVEC_FLAG)
|
||||
SPECK128_FLAG = $(ALTIVEC_FLAG)
|
||||
SIMON128_FLAG = $(ALTIVEC_FLAG)
|
||||
endif
|
||||
@ -1612,22 +1600,10 @@ sha3_simd.o : sha3_simd.cpp
|
||||
shacal2_simd.o : shacal2_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SHA_FLAG) -c) $<
|
||||
|
||||
# SSSE3 or NEON available
|
||||
simeck_simd.o : simeck_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMECK_FLAG) -c) $<
|
||||
|
||||
# SSE4.1, NEON or POWER7 available
|
||||
simon64_simd.o : simon64_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMON64_FLAG) -c) $<
|
||||
|
||||
# SSSE3, NEON or POWER8 available
|
||||
simon128_simd.o : simon128_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMON128_FLAG) -c) $<
|
||||
|
||||
# SSE4.1, NEON or POWER7 available
|
||||
speck64_simd.o : speck64_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SPECK64_FLAG) -c) $<
|
||||
|
||||
# SSSE3, NEON or POWER8 available
|
||||
speck128_simd.o : speck128_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SPECK128_FLAG) -c) $<
|
||||
|
@ -241,7 +241,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
ARIA_FLAG = $(SSSE3_FLAG)
|
||||
CHAM_FLAG = $(SSSE3_FLAG)
|
||||
LEA_FLAG = $(SSSE3_FLAG)
|
||||
SIMECK_FLAG = $(SSSE3_FLAG)
|
||||
SIMON128_FLAG = $(SSSE3_FLAG)
|
||||
SPECK128_FLAG = $(SSSE3_FLAG)
|
||||
else
|
||||
@ -254,8 +253,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
ifeq ($(strip $(HAVE_OPT)),0)
|
||||
BLAKE2B_FLAG = $(SSE41_FLAG)
|
||||
BLAKE2S_FLAG = $(SSE41_FLAG)
|
||||
SIMON64_FLAG = $(SSE41_FLAG)
|
||||
SPECK64_FLAG = $(SSE41_FLAG)
|
||||
else
|
||||
SSE41_FLAG =
|
||||
endif
|
||||
@ -400,10 +397,7 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
CHAM_FLAG = $(NEON_FLAG)
|
||||
LEA_FLAG = $(NEON_FLAG)
|
||||
SHA_FLAG = $(NEON_FLAG)
|
||||
SIMECK_FLAG = $(NEON_FLAG)
|
||||
SIMON64_FLAG = $(NEON_FLAG)
|
||||
SIMON128_FLAG = $(NEON_FLAG)
|
||||
SPECK64_FLAG = $(NEON_FLAG)
|
||||
SPECK128_FLAG = $(NEON_FLAG)
|
||||
SM4_FLAG = $(NEON_FLAG)
|
||||
else
|
||||
@ -457,10 +451,7 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
CHAM_FLAG = $(ASIMD_FLAG)
|
||||
LEA_FLAG = $(ASIMD_FLAG)
|
||||
NEON_FLAG = $(ASIMD_FLAG)
|
||||
SIMECK_FLAG = $(ASIMD_FLAG)
|
||||
SIMON64_FLAG = $(ASIMD_FLAG)
|
||||
SIMON128_FLAG = $(ASIMD_FLAG)
|
||||
SPECK64_FLAG = $(ASIMD_FLAG)
|
||||
SPECK128_FLAG = $(ASIMD_FLAG)
|
||||
SM4_FLAG = $(ASIMD_FLAG)
|
||||
else
|
||||
@ -933,22 +924,10 @@ sha512_armv4.o : sha512_armv4.S
|
||||
shacal2_simd.o : shacal2_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<
|
||||
|
||||
# SSSE3 or NEON available
|
||||
simeck_simd.o : simeck_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SIMECK_FLAG) -c) $<
|
||||
|
||||
# SSE4.1, NEON or POWER7 available
|
||||
simon64_simd.o : simon64_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SIMON64_FLAG) -c) $<
|
||||
|
||||
# SSSE3, NEON or POWER8 available
|
||||
simon128_simd.o : simon128_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SIMON128_FLAG) -c) $<
|
||||
|
||||
# SSE4.1, NEON or POWER7 available
|
||||
speck64_simd.o : speck64_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SPECK64_FLAG) -c) $<
|
||||
|
||||
# SSSE3, NEON or POWER8 available
|
||||
speck128_simd.o : speck128_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SPECK128_FLAG) -c) $<
|
||||
|
1116
adv_simd.h
1116
adv_simd.h
File diff suppressed because it is too large
Load Diff
34
cham.cpp
34
cham.cpp
@ -96,7 +96,7 @@ ANONYMOUS_NAMESPACE_END
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
extern size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
@ -110,11 +110,11 @@ extern size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, siz
|
||||
extern size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
#endif // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
std::string CHAM64::Base::AlgorithmProvider() const
|
||||
{
|
||||
#if (CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS)
|
||||
#if (CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS)
|
||||
# if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3())
|
||||
return "SSSE3";
|
||||
@ -336,31 +336,7 @@ void CHAM128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock,
|
||||
oblock(m_x[0])(m_x[1])(m_x[2])(m_x[3]);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
size_t CHAM64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3()) {
|
||||
return CHAM64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, 80,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t CHAM64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3()) {
|
||||
return CHAM64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, 80,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
size_t CHAM128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
@ -386,6 +362,6 @@ size_t CHAM128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
NAMESPACE_END
|
||||
|
19
cham.h
19
cham.h
@ -16,18 +16,15 @@
|
||||
#include "algparam.h"
|
||||
|
||||
#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
|
||||
# define CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS 1
|
||||
# define CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS 1
|
||||
#endif
|
||||
|
||||
// Yet another SunStudio/SunCC workaround. Failed self tests
|
||||
// in SSE code paths on i386 for SunStudio 12.3 and below.
|
||||
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
|
||||
# undef CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
# undef CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
#endif
|
||||
|
||||
// https://github.com/weidai11/cryptopp/issues/945
|
||||
#undef CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
/// \brief CHAM block cipher information
|
||||
@ -92,10 +89,6 @@ public:
|
||||
{
|
||||
public:
|
||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||
#endif
|
||||
};
|
||||
|
||||
/// \brief Decryption transformation
|
||||
@ -106,10 +99,6 @@ public:
|
||||
{
|
||||
public:
|
||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||
#endif
|
||||
};
|
||||
|
||||
/// \brief CHAM64 encryption
|
||||
@ -156,7 +145,7 @@ public:
|
||||
public:
|
||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||
#endif
|
||||
};
|
||||
@ -170,7 +159,7 @@ public:
|
||||
public:
|
||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||
#endif
|
||||
};
|
||||
|
608
cham_simd.cpp
608
cham_simd.cpp
@ -45,600 +45,6 @@ using CryptoPP::word32;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
NAMESPACE_BEGIN(W16) // CHAM64, 16-bit word size
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateLeft16(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateRight16(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, 16-R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i RotateLeft16<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, 8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i RotateRight16<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, 16-8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// Should not be instantiated
|
||||
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
|
||||
CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
|
||||
CRYPTOPP_UNUSED(e); CRYPTOPP_UNUSED(f);
|
||||
CRYPTOPP_UNUSED(g); CRYPTOPP_UNUSED(h);
|
||||
CRYPTOPP_ASSERT(0);
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpacklo_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpacklo_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpacklo_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpacklo_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpacklo_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpacklo_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpacklo_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpacklo_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpacklo_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpacklo_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpacklo_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpacklo_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpackhi_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpackhi_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpackhi_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpackhi_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpackhi_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpackhi_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpackhi_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpackhi_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpackhi_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpackhi_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpackhi_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpackhi_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i UnpackXMM(const __m128i& v)
|
||||
{
|
||||
// Should not be instantiated
|
||||
CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
|
||||
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<0>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<1>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<2>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<3>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<4>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<5>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<6>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<7>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b)
|
||||
{
|
||||
const __m128i& z = _mm_setzero_si128();
|
||||
return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i RepackXMM(const __m128i& v)
|
||||
{
|
||||
return UnpackXMM<IDX>(v);
|
||||
}
|
||||
|
||||
inline void CHAM64_Enc_Block(__m128i &block0,
|
||||
const word16 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// Rearrange the data for vectorization. UnpackXMM includes a
|
||||
// little-endian swap for SSE. Thanks to Peter Cordes for help
|
||||
// with packing and unpacking.
|
||||
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
|
||||
__m128i a = UnpackXMM<0>(block0);
|
||||
__m128i b = UnpackXMM<1>(block0);
|
||||
__m128i c = UnpackXMM<2>(block0);
|
||||
__m128i d = UnpackXMM<3>(block0);
|
||||
__m128i e = UnpackXMM<4>(block0);
|
||||
__m128i f = UnpackXMM<5>(block0);
|
||||
__m128i g = UnpackXMM<6>(block0);
|
||||
__m128i h = UnpackXMM<7>(block0);
|
||||
|
||||
const unsigned int rounds = 80;
|
||||
__m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
|
||||
__m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
|
||||
|
||||
const unsigned int MASK = 15;
|
||||
for (int i=0; i<static_cast<int>(rounds); i+=4)
|
||||
{
|
||||
__m128i k, kr, t1, t2, t3, t4;
|
||||
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
|
||||
|
||||
// Shuffle out key
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
|
||||
|
||||
t1 = _mm_xor_si128(a, counter);
|
||||
t3 = _mm_xor_si128(e, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
|
||||
a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
|
||||
e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
|
||||
|
||||
t1 = _mm_xor_si128(b, counter);
|
||||
t3 = _mm_xor_si128(f, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
|
||||
b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
|
||||
f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
|
||||
|
||||
t1 = _mm_xor_si128(c, counter);
|
||||
t3 = _mm_xor_si128(g, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
|
||||
c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
|
||||
g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
|
||||
|
||||
t1 = _mm_xor_si128(d, counter);
|
||||
t3 = _mm_xor_si128(h, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
|
||||
d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
|
||||
h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
}
|
||||
|
||||
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
|
||||
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
|
||||
}
|
||||
|
||||
inline void CHAM64_Dec_Block(__m128i &block0,
|
||||
const word16 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// Rearrange the data for vectorization. UnpackXMM includes a
|
||||
// little-endian swap for SSE. Thanks to Peter Cordes for help
|
||||
// with packing and unpacking.
|
||||
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
|
||||
__m128i a = UnpackXMM<0>(block0);
|
||||
__m128i b = UnpackXMM<1>(block0);
|
||||
__m128i c = UnpackXMM<2>(block0);
|
||||
__m128i d = UnpackXMM<3>(block0);
|
||||
__m128i e = UnpackXMM<4>(block0);
|
||||
__m128i f = UnpackXMM<5>(block0);
|
||||
__m128i g = UnpackXMM<6>(block0);
|
||||
__m128i h = UnpackXMM<7>(block0);
|
||||
|
||||
const unsigned int rounds = 80;
|
||||
__m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
|
||||
__m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
|
||||
|
||||
const unsigned int MASK = 15;
|
||||
for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
|
||||
{
|
||||
__m128i k, kr, t1, t2, t3, t4;
|
||||
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
|
||||
|
||||
// Shuffle out key
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
|
||||
|
||||
// Odd round
|
||||
t1 = RotateRight16<1>(d);
|
||||
t3 = RotateRight16<1>(h);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
|
||||
d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
|
||||
|
||||
// Even round
|
||||
t1 = RotateRight16<8>(c);
|
||||
t3 = RotateRight16<8>(g);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
|
||||
c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
|
||||
|
||||
// Odd round
|
||||
t1 = RotateRight16<1>(b);
|
||||
t3 = RotateRight16<1>(f);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
|
||||
b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
|
||||
|
||||
// Even round
|
||||
t1 = RotateRight16<8>(a);
|
||||
t3 = RotateRight16<8>(e);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
|
||||
a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
}
|
||||
|
||||
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
|
||||
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
|
||||
}
|
||||
|
||||
inline void CHAM64_Enc_2_Blocks(__m128i &block0,
|
||||
__m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// Rearrange the data for vectorization. UnpackXMM includes a
|
||||
// little-endian swap for SSE. Thanks to Peter Cordes for help
|
||||
// with packing and unpacking.
|
||||
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
|
||||
__m128i a = UnpackXMM<0>(block0, block1);
|
||||
__m128i b = UnpackXMM<1>(block0, block1);
|
||||
__m128i c = UnpackXMM<2>(block0, block1);
|
||||
__m128i d = UnpackXMM<3>(block0, block1);
|
||||
__m128i e = UnpackXMM<4>(block0, block1);
|
||||
__m128i f = UnpackXMM<5>(block0, block1);
|
||||
__m128i g = UnpackXMM<6>(block0, block1);
|
||||
__m128i h = UnpackXMM<7>(block0, block1);
|
||||
|
||||
const unsigned int rounds = 80;
|
||||
__m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
|
||||
__m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
|
||||
|
||||
const unsigned int MASK = 15;
|
||||
for (int i=0; i<static_cast<int>(rounds); i+=4)
|
||||
{
|
||||
__m128i k, kr, t1, t2, t3, t4;
|
||||
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
|
||||
|
||||
// Shuffle out key
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
|
||||
|
||||
t1 = _mm_xor_si128(a, counter);
|
||||
t3 = _mm_xor_si128(e, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
|
||||
a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
|
||||
e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
|
||||
|
||||
t1 = _mm_xor_si128(b, counter);
|
||||
t3 = _mm_xor_si128(f, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
|
||||
b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
|
||||
f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
|
||||
|
||||
t1 = _mm_xor_si128(c, counter);
|
||||
t3 = _mm_xor_si128(g, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
|
||||
c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
|
||||
g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
|
||||
|
||||
t1 = _mm_xor_si128(d, counter);
|
||||
t3 = _mm_xor_si128(h, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
|
||||
d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
|
||||
h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
}
|
||||
|
||||
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
|
||||
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
|
||||
block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
|
||||
}
|
||||
|
||||
inline void CHAM64_Dec_2_Blocks(__m128i &block0,
|
||||
__m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// Rearrange the data for vectorization. UnpackXMM includes a
|
||||
// little-endian swap for SSE. Thanks to Peter Cordes for help
|
||||
// with packing and unpacking.
|
||||
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
|
||||
__m128i a = UnpackXMM<0>(block0, block1);
|
||||
__m128i b = UnpackXMM<1>(block0, block1);
|
||||
__m128i c = UnpackXMM<2>(block0, block1);
|
||||
__m128i d = UnpackXMM<3>(block0, block1);
|
||||
__m128i e = UnpackXMM<4>(block0, block1);
|
||||
__m128i f = UnpackXMM<5>(block0, block1);
|
||||
__m128i g = UnpackXMM<6>(block0, block1);
|
||||
__m128i h = UnpackXMM<7>(block0, block1);
|
||||
|
||||
const unsigned int rounds = 80;
|
||||
__m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
|
||||
__m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
|
||||
|
||||
const unsigned int MASK = 15;
|
||||
for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
|
||||
{
|
||||
__m128i k, kr, t1, t2, t3, t4;
|
||||
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
|
||||
|
||||
// Shuffle out key
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
|
||||
|
||||
// Odd round
|
||||
t1 = RotateRight16<1>(d);
|
||||
t3 = RotateRight16<1>(h);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
|
||||
d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
|
||||
|
||||
// Even round
|
||||
t1 = RotateRight16<8>(c);
|
||||
t3 = RotateRight16<8>(g);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
|
||||
c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
|
||||
|
||||
// Odd round
|
||||
t1 = RotateRight16<1>(b);
|
||||
t3 = RotateRight16<1>(f);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
|
||||
b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
|
||||
|
||||
// Even round
|
||||
t1 = RotateRight16<8>(a);
|
||||
t3 = RotateRight16<8>(e);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
|
||||
a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
}
|
||||
|
||||
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
|
||||
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
|
||||
block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
|
||||
}
|
||||
|
||||
NAMESPACE_END // W16
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size
|
||||
|
||||
template <unsigned int R>
|
||||
@ -1054,20 +460,6 @@ ANONYMOUS_NAMESPACE_END
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||
size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
|
@ -78,9 +78,9 @@ LIB_SRCS = \
|
||||
rdtables.cpp rijndael.cpp rijndael_simd.cpp ripemd.cpp rng.cpp rsa.cpp \
|
||||
rw.cpp safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp \
|
||||
sha.cpp sha3.cpp sha_simd.cpp shacal2.cpp shacal2_simd.cpp shake.cpp \
|
||||
shark.cpp sharkbox.cpp simeck.cpp simeck_simd.cpp simon.cpp \
|
||||
simon128_simd.cpp simon64_simd.cpp skipjack.cpp sm3.cpp sm4.cpp \
|
||||
sm4_simd.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp speck64_simd.cpp \
|
||||
shark.cpp sharkbox.cpp simeck.cpp simon.cpp \
|
||||
simon128_simd.cpp skipjack.cpp sm3.cpp sm4.cpp \
|
||||
sm4_simd.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp \
|
||||
square.cpp squaretb.cpp sse_simd.cpp strciphr.cpp tea.cpp tftables.cpp \
|
||||
threefish.cpp tiger.cpp tigertab.cpp ttmac.cpp tweetnacl.cpp twofish.cpp \
|
||||
vmac.cpp wake.cpp whrlpool.cpp xed25519.cpp xtr.cpp xtrcrypt.cpp xts.cpp \
|
||||
@ -109,9 +109,9 @@ LIB_OBJS = \
|
||||
rdtables.obj rijndael.obj rijndael_simd.obj ripemd.obj rng.obj rsa.obj \
|
||||
rw.obj safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj \
|
||||
sha.obj sha3.obj sha_simd.obj shacal2.obj shacal2_simd.obj shake.obj \
|
||||
shark.obj sharkbox.obj simeck.obj simeck_simd.obj simon.obj \
|
||||
simon128_simd.obj simon64_simd.obj skipjack.obj sm3.obj sm4.obj \
|
||||
sm4_simd.obj sosemanuk.obj speck.obj speck128_simd.obj speck64_simd.obj \
|
||||
shark.obj sharkbox.obj simeck.obj simon.obj \
|
||||
simon128_simd.obj skipjack.obj sm3.obj sm4.obj \
|
||||
sm4_simd.obj sosemanuk.obj speck.obj speck128_simd.obj \
|
||||
square.obj squaretb.obj sse_simd.obj strciphr.obj tea.obj tftables.obj \
|
||||
threefish.obj tiger.obj tigertab.obj ttmac.obj tweetnacl.obj twofish.obj \
|
||||
vmac.obj wake.obj whrlpool.obj xed25519.obj xtr.obj xtrcrypt.obj xts.obj \
|
||||
|
@ -315,9 +315,7 @@
|
||||
<ClCompile Include="shark.cpp" />
|
||||
<ClCompile Include="sharkbox.cpp" />
|
||||
<ClCompile Include="simeck.cpp" />
|
||||
<ClCompile Include="simeck_simd.cpp" />
|
||||
<ClCompile Include="simon.cpp" />
|
||||
<ClCompile Include="simon64_simd.cpp" />
|
||||
<ClCompile Include="simon128_simd.cpp" />
|
||||
<ClCompile Include="simple.cpp" />
|
||||
<ClCompile Include="skipjack.cpp" />
|
||||
@ -326,7 +324,6 @@
|
||||
<ClCompile Include="sm4_simd.cpp" />
|
||||
<ClCompile Include="sosemanuk.cpp" />
|
||||
<ClCompile Include="speck.cpp" />
|
||||
<ClCompile Include="speck64_simd.cpp" />
|
||||
<ClCompile Include="speck128_simd.cpp" />
|
||||
<ClCompile Include="square.cpp" />
|
||||
<ClCompile Include="squaretb.cpp" />
|
||||
|
@ -425,15 +425,9 @@
|
||||
<ClCompile Include="simeck.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="simeck_simd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="simon.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="simon64_simd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="simon128_simd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
@ -455,9 +449,6 @@
|
||||
<ClCompile Include="speck.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="speck64_simd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="speck128_simd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
40
simeck.cpp
40
simeck.cpp
@ -33,16 +33,6 @@ ANONYMOUS_NAMESPACE_END
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
extern size_t SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
#endif // CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
std::string SIMECK32::Base::AlgorithmProvider() const
|
||||
{
|
||||
return "C++";
|
||||
@ -104,10 +94,6 @@ void SIMECK32::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||
|
||||
std::string SIMECK64::Base::AlgorithmProvider() const
|
||||
{
|
||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3())
|
||||
return "SSSE3";
|
||||
#endif
|
||||
return "C++";
|
||||
}
|
||||
|
||||
@ -165,30 +151,4 @@ void SIMECK64::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||
oblock(m_t[0])(m_t[1]);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
size_t SIMECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3()) {
|
||||
return SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMECK64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3()) {
|
||||
return SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
NAMESPACE_END
|
||||
|
13
simeck.h
13
simeck.h
@ -17,19 +17,6 @@
|
||||
#include "secblock.h"
|
||||
#include "algparam.h"
|
||||
|
||||
#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
|
||||
# define CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS 1
|
||||
#endif
|
||||
|
||||
// Yet another SunStudio/SunCC workaround. Failed self tests
|
||||
// in SSE code paths on i386 for SunStudio 12.3 and below.
|
||||
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
|
||||
# undef CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
#endif
|
||||
|
||||
// https://github.com/weidai11/cryptopp/issues/945
|
||||
#undef CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
/// \brief SIMECK block cipher information
|
||||
|
342
simeck_simd.cpp
342
simeck_simd.cpp
@ -1,342 +0,0 @@
|
||||
// simeck_simd.cpp - written and placed in the public domain by Gangqiang Yang and Jeffrey Walton.
|
||||
//
|
||||
// This source file uses intrinsics and built-ins to gain access to
|
||||
// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
|
||||
// source file is needed because additional CXXFLAGS are required to enable
|
||||
// the appropriate instructions sets in some build configurations.
|
||||
|
||||
#include "pch.h"
|
||||
#include "config.h"
|
||||
|
||||
#include "simeck.h"
|
||||
#include "misc.h"
|
||||
|
||||
// Uncomment for benchmarking C++ against SSE or NEON.
|
||||
// Do so in both simon.cpp and simon_simd.cpp.
|
||||
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
# include "adv_simd.h"
|
||||
# include <pmmintrin.h>
|
||||
# include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__XOP__)
|
||||
# include <ammintrin.h>
|
||||
# if defined(__GNUC__)
|
||||
# include <x86intrin.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// Squash MS LNK4221 and libtool warnings
|
||||
extern const char SIMECK_SIMD_FNAME[] = __FILE__;
|
||||
|
||||
// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
|
||||
#define M128_CAST(x) ((__m128i *)(void *)(x))
|
||||
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
|
||||
|
||||
ANONYMOUS_NAMESPACE_BEGIN
|
||||
|
||||
using CryptoPP::word16;
|
||||
using CryptoPP::word32;
|
||||
|
||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateLeft32(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateRight32(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline __m128i RotateLeft32<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline __m128i RotateRight32<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief Unpack XMM words
|
||||
/// \tparam IDX the element from each XMM word
|
||||
/// \param a the first XMM word
|
||||
/// \param b the second XMM word
|
||||
/// \param c the third XMM word
|
||||
/// \param d the fourth XMM word
|
||||
/// \details UnpackXMM selects the IDX element from a, b, c, d and returns a concatenation
|
||||
/// equivalent to <tt>a[IDX] || b[IDX] || c[IDX] || d[IDX]</tt>.
|
||||
template <unsigned int IDX>
|
||||
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
// Should not be instantiated
|
||||
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
|
||||
CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
|
||||
CRYPTOPP_ASSERT(0);
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
const __m128i r1 = _mm_unpacklo_epi32(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi32(c, d);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
|
||||
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
const __m128i r1 = _mm_unpacklo_epi32(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi32(c, d);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
|
||||
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
const __m128i r1 = _mm_unpackhi_epi32(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi32(c, d);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
|
||||
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
const __m128i r1 = _mm_unpackhi_epi32(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi32(c, d);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
|
||||
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
|
||||
}
|
||||
|
||||
/// \brief Unpack a XMM word
|
||||
/// \tparam IDX the element from each XMM word
|
||||
/// \param v the first XMM word
|
||||
/// \details UnpackXMM selects the IDX element from v and returns a concatenation
|
||||
/// equivalent to <tt>v[IDX] || v[IDX] || v[IDX] || v[IDX]</tt>.
|
||||
template <unsigned int IDX>
|
||||
inline __m128i UnpackXMM(const __m128i& v)
|
||||
{
|
||||
// Should not be instantiated
|
||||
CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<0>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<1>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<2>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<3>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
return UnpackXMM<IDX>(a, b, c, d);
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i RepackXMM(const __m128i& v)
|
||||
{
|
||||
return UnpackXMM<IDX>(v);
|
||||
}
|
||||
|
||||
inline void SIMECK64_Encrypt(__m128i &a, __m128i &b, __m128i &c, __m128i &d, const __m128i key)
|
||||
{
|
||||
// SunStudio 12.3 workaround
|
||||
__m128i s, t; s = a; t = c;
|
||||
a = _mm_xor_si128(_mm_and_si128(a, RotateLeft32<5>(a)), RotateLeft32<1>(a));
|
||||
c = _mm_xor_si128(_mm_and_si128(c, RotateLeft32<5>(c)), RotateLeft32<1>(c));
|
||||
a = _mm_xor_si128(a, _mm_xor_si128(b, key));
|
||||
c = _mm_xor_si128(c, _mm_xor_si128(d, key));
|
||||
b = s; d = t;
|
||||
}
|
||||
|
||||
inline void SIMECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
|
||||
__m128i a = UnpackXMM<0>(block0);
|
||||
__m128i b = UnpackXMM<1>(block0);
|
||||
__m128i c = UnpackXMM<2>(block0);
|
||||
__m128i d = UnpackXMM<3>(block0);
|
||||
|
||||
const unsigned int rounds = 44;
|
||||
for (int i = 0; i < static_cast<int>(rounds); i += 4)
|
||||
{
|
||||
const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
}
|
||||
|
||||
// [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
|
||||
block0 = RepackXMM<0>(a,b,c,d);
|
||||
}
|
||||
|
||||
inline void SIMECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// SIMECK requires a word swap for the decryption transform
|
||||
__m128i w = _mm_shuffle_epi32(block0, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
|
||||
__m128i a = UnpackXMM<0>(w);
|
||||
__m128i b = UnpackXMM<1>(w);
|
||||
__m128i c = UnpackXMM<2>(w);
|
||||
__m128i d = UnpackXMM<3>(w);
|
||||
|
||||
const unsigned int rounds = 44;
|
||||
for (int i = static_cast<int>(rounds)-1; i >= 0; i -= 4)
|
||||
{
|
||||
const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i - 3));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
|
||||
}
|
||||
|
||||
// [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
|
||||
w = RepackXMM<0>(a,b,c,d);
|
||||
|
||||
block0 = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
}
|
||||
|
||||
inline void SIMECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
|
||||
__m128i a = UnpackXMM<0>(block0, block1, block2, block3);
|
||||
__m128i b = UnpackXMM<1>(block0, block1, block2, block3);
|
||||
__m128i c = UnpackXMM<2>(block0, block1, block2, block3);
|
||||
__m128i d = UnpackXMM<3>(block0, block1, block2, block3);
|
||||
|
||||
const unsigned int rounds = 44;
|
||||
for (int i = 0; i < static_cast<int>(rounds); i += 4)
|
||||
{
|
||||
const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
}
|
||||
|
||||
// [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
|
||||
block0 = RepackXMM<0>(a, b, c, d);
|
||||
block1 = RepackXMM<1>(a, b, c, d);
|
||||
block2 = RepackXMM<2>(a, b, c, d);
|
||||
block3 = RepackXMM<3>(a, b, c, d);
|
||||
}

inline void SIMECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
{
    // SIMECK requires a word swap for the decryption transform
    __m128i w = _mm_shuffle_epi32(block0, _MM_SHUFFLE(2, 3, 0, 1));
    __m128i x = _mm_shuffle_epi32(block1, _MM_SHUFFLE(2, 3, 0, 1));
    __m128i y = _mm_shuffle_epi32(block2, _MM_SHUFFLE(2, 3, 0, 1));
    __m128i z = _mm_shuffle_epi32(block3, _MM_SHUFFLE(2, 3, 0, 1));

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(w, x, y, z);
    __m128i b = UnpackXMM<1>(w, x, y, z);
    __m128i c = UnpackXMM<2>(w, x, y, z);
    __m128i d = UnpackXMM<3>(w, x, y, z);

    const unsigned int rounds = 44;
    for (int i = static_cast<int>(rounds)-1; i >= 0; i -= 4)
    {
        const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i - 3));
        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    w = RepackXMM<0>(a, b, c, d);
    x = RepackXMM<1>(a, b, c, d);
    y = RepackXMM<2>(a, b, c, d);
    z = RepackXMM<3>(a, b, c, d);

    block0 = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
    block1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
    block2 = _mm_shuffle_epi32(y, _MM_SHUFFLE(2, 3, 0, 1));
    block3 = _mm_shuffle_epi32(z, _MM_SHUFFLE(2, 3, 0, 1));
}

#endif // CRYPTOPP_SSSE3_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_4x1_SSE(SIMECK64_Enc_Block, SIMECK64_Enc_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_4x1_SSE(SIMECK64_Dec_Block, SIMECK64_Dec_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

NAMESPACE_END
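
The two SSSE3 entry points above are reached through the cipher's AdvancedProcessBlocks override; the simon.cpp and speck.cpp hunks later in this commit show the same dispatch pattern for their SSE4.1/NEON/Altivec kernels. A minimal sketch of that pattern for SIMECK64 (the guard macro and the m_rkeys/m_rounds members follow the SIMON/SPECK code in this commit and are assumptions here, not a copy of simeck.cpp):

#if (CRYPTOPP_SIMECK64_ADVANCED_PROCESS_BLOCKS)  // guard macro name illustrative
size_t SIMECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
    byte *outBlocks, size_t length, word32 flags) const
{
#if (CRYPTOPP_SSSE3_AVAILABLE)
    // Hand whole groups of blocks to the SIMD kernel when the CPU supports it.
    if (HasSSSE3())
        return SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
            inBlocks, xorBlocks, outBlocks, length, flags);
#endif
    // Otherwise fall back to the generic block-at-a-time implementation.
    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif
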

simon.cpp
@ -196,14 +196,6 @@ ANONYMOUS_NAMESPACE_END
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
extern size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
extern size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
@ -212,14 +204,6 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, siz
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
extern size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
extern size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
@ -228,14 +212,6 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
extern size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
extern size_t SIMON128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
@ -246,39 +222,11 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys,
|
||||
|
||||
std::string SIMON64::Base::AlgorithmProvider() const
|
||||
{
|
||||
#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
|
||||
# if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return "SSE4.1";
|
||||
# endif
|
||||
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return "NEON";
|
||||
# endif
|
||||
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return "Altivec";
|
||||
# endif
|
||||
#endif
|
||||
return "C++";
|
||||
}
|
||||
|
||||
unsigned int SIMON64::Base::OptimalDataAlignment() const
|
||||
{
|
||||
#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
|
||||
# if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return 16; // load __m128i
|
||||
# endif
|
||||
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return 4; // load uint32x4_t
|
||||
# endif
|
||||
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return 16; // load uint32x4_p
|
||||
# endif
|
||||
#endif
|
||||
return GetAlignmentOf<word32>();
|
||||
}
|
||||
|
||||
@ -311,29 +259,6 @@ void SIMON64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
|
||||
default:
|
||||
CRYPTOPP_ASSERT(0);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
// Pre-splat the round keys for Altivec forward transformation
|
||||
#if CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
if (IsForwardTransformation() && HasAltivec())
|
||||
{
|
||||
AlignedSecBlock presplat(m_rkeys.size()*4);
|
||||
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
|
||||
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
|
||||
m_rkeys.swap(presplat);
|
||||
}
|
||||
#elif CRYPTOPP_SSE41_AVAILABLE
|
||||
if (IsForwardTransformation() && HasSSE41())
|
||||
{
|
||||
AlignedSecBlock presplat(m_rkeys.size()*4);
|
||||
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
|
||||
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
|
||||
m_rkeys.swap(presplat);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
|
||||
}
|
||||
|
||||
void SIMON64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||
@ -478,7 +403,7 @@ void SIMON128::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
|
||||
#endif // CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
|
||||
}
|
||||
|
||||
void SIMON128::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||
@ -533,50 +458,6 @@ void SIMON128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
|
||||
size_t SIMON64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return SIMON64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return SIMON64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMON64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return SIMON64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return SIMON64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
#if (CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS)
|
||||
size_t SIMON128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
|

simon.h
@ -17,14 +17,6 @@
#include "seckey.h"
#include "secblock.h"

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
    CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
    CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
# ifndef CRYPTOPP_DISABLE_SIMON_SIMD
#  define CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS 1
# endif
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
    CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
    CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
@ -36,13 +28,9 @@
// Yet another SunStudio/SunCC workaround. Failed self tests
// in SSE code paths on i386 for SunStudio 12.3 and below.
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
# undef CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
# undef CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
#endif

// https://github.com/weidai11/cryptopp/issues/945
#undef CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS

NAMESPACE_BEGIN(CryptoPP)

/// \brief SIMON block cipher information
@ -129,9 +117,6 @@ public:
    {
    public:
        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
    };

    /// \brief SIMON64 decryption transformation
@ -142,9 +127,6 @@ public:
    {
    public:
        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
    };

    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;

simon64_simd.cpp
@ -1,864 +0,0 @@
|
||||
// simon_simd.cpp - written and placed in the public domain by Jeffrey Walton
|
||||
//
|
||||
// This source file uses intrinsics and built-ins to gain access to
|
||||
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
|
||||
// source file is needed because additional CXXFLAGS are required to enable
|
||||
// the appropriate instructions sets in some build configurations.
|
||||
|
||||
#include "pch.h"
|
||||
#include "config.h"
|
||||
|
||||
#include "simon.h"
|
||||
#include "misc.h"
|
||||
|
||||
// Uncomment for benchmarking C++ against SSE or NEON.
|
||||
// Do so in both simon.cpp and simon_simd.cpp.
|
||||
// #undef CRYPTOPP_SSE41_AVAILABLE
|
||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
# include "adv_simd.h"
|
||||
# include <pmmintrin.h>
|
||||
# include <tmmintrin.h>
|
||||
# include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__XOP__)
|
||||
# include <ammintrin.h>
|
||||
# if defined(__GNUC__)
|
||||
# include <x86intrin.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_HEADER)
|
||||
# include "adv_simd.h"
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_ACLE_HEADER)
|
||||
# include <stdint.h>
|
||||
# include <arm_acle.h>
|
||||
#endif
|
||||
|
||||
#if defined(_M_ARM64)
|
||||
# include "adv_simd.h"
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
# include "adv_simd.h"
|
||||
# include "ppc_simd.h"
|
||||
#endif
|
||||
|
||||
// Squash MS LNK4221 and libtool warnings
|
||||
extern const char SIMON64_SIMD_FNAME[] = __FILE__;
|
||||
|
||||
ANONYMOUS_NAMESPACE_BEGIN
|
||||
|
||||
using CryptoPP::byte;
|
||||
using CryptoPP::word32;
|
||||
using CryptoPP::word64;
|
||||
using CryptoPP::vec_swap; // SunCC
|
||||
|
||||
// *************************** ARM NEON ************************** //
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
|
||||
template <class T>
|
||||
inline T UnpackHigh32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_high_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_high_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline T UnpackLow32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_low_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_low_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
|
||||
{
|
||||
const uint32x4_t a(vshlq_n_u32(val, R));
|
||||
const uint32x4_t b(vshrq_n_u32(val, 32 - R));
|
||||
return vorrq_u32(a, b);
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline uint32x4_t RotateRight32(const uint32x4_t& val)
|
||||
{
|
||||
const uint32x4_t a(vshlq_n_u32(val, 32 - R));
|
||||
const uint32x4_t b(vshrq_n_u32(val, R));
|
||||
return vorrq_u32(a, b);
|
||||
}
|
||||
|
||||
#if defined(__aarch32__) || defined(__aarch64__)
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
|
||||
{
|
||||
const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
|
||||
const uint8x16_t mask = vld1q_u8(maskb);
|
||||
|
||||
return vreinterpretq_u32_u8(
|
||||
vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
|
||||
{
|
||||
const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
|
||||
const uint8x16_t mask = vld1q_u8(maskb);
|
||||
|
||||
return vreinterpretq_u32_u8(
|
||||
vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
|
||||
}
|
||||
#endif
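
The two specializations above replace a shift-and-or rotate with a single byte-table lookup. A standalone scalar check of the byte mapping the masks encode, for one little-endian 32-bit lane (illustrative only, not library code):

#include <cstdint>
#include <cstdio>
#include <cstring>

static uint32_t rotl8(uint32_t x) { return (x << 8) | (x >> 24); }

int main()
{
    // One 32-bit lane stored little-endian as bytes b0..b3 (value 0x44332211).
    const uint8_t lane[4] = { 0x11, 0x22, 0x33, 0x44 };
    const uint8_t mask[4] = { 3, 0, 1, 2 };   // per-lane slice of maskb above

    // vqtbl1q_u8 picks out[i] = lane[mask[i]] for every byte.
    uint8_t out[4];
    for (int i = 0; i < 4; ++i)
        out[i] = lane[mask[i]];

    uint32_t shuffled = 0;
    std::memcpy(&shuffled, out, 4);

    // Both values are 0x33221144 on a little-endian machine.
    std::printf("%08x %08x\n", shuffled, rotl8(0x44332211u));
    return 0;
}
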
|
||||
|
||||
inline uint32x4_t SIMON64_f(const uint32x4_t& val)
|
||||
{
|
||||
return veorq_u32(RotateLeft32<2>(val),
|
||||
vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
|
||||
|
||||
const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
|
||||
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);
|
||||
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
|
||||
std::swap(x1, y1);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
std::swap(x1, y1);
|
||||
const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
|
||||
|
||||
y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
|
||||
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
|
||||
|
||||
const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
|
||||
{
|
||||
const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
|
||||
y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
|
||||
y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);
|
||||
|
||||
const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
|
||||
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
|
||||
x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
|
||||
x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
|
||||
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
|
||||
y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
|
||||
y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
block2 = UnpackLow32(y2, x2);
|
||||
block3 = UnpackHigh32(y2, x2);
|
||||
block4 = UnpackLow32(y3, x3);
|
||||
block5 = UnpackHigh32(y3, x3);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
|
||||
|
||||
y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
|
||||
y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
|
||||
y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
|
||||
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
|
||||
x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
|
||||
x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);
|
||||
|
||||
const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
|
||||
y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
|
||||
y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
block2 = UnpackLow32(y2, x2);
|
||||
block3 = UnpackHigh32(y2, x2);
|
||||
block4 = UnpackLow32(y3, x3);
|
||||
block5 = UnpackHigh32(y3, x3);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
// ***************************** IA-32 ***************************** //
|
||||
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
|
||||
// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
|
||||
#ifndef M128_CAST
|
||||
# define M128_CAST(x) ((__m128i *)(void *)(x))
|
||||
#endif
|
||||
#ifndef CONST_M128_CAST
|
||||
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
|
||||
#endif
|
||||
|
||||
inline void Swap128(__m128i& a,__m128i& b)
|
||||
{
|
||||
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
|
||||
// __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
|
||||
// SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
|
||||
vec_swap(a, b);
|
||||
#else
|
||||
std::swap(a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateLeft32(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateRight32(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
__m128i RotateLeft32<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
__m128i RotateRight32<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __m128i SIMON64_f(const __m128i& v)
|
||||
{
|
||||
return _mm_xor_si128(RotateLeft32<2>(v),
|
||||
_mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
const __m128 t0 = _mm_castsi128_ps(block0);
|
||||
const __m128 t1 = _mm_castsi128_ps(block1);
|
||||
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
|
||||
|
||||
const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*4));
|
||||
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*4));
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
|
||||
Swap128(x1, y1);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = _mm_unpacklo_epi32(y1, x1);
|
||||
block1 = _mm_unpackhi_epi32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
const __m128 t0 = _mm_castsi128_ps(block0);
|
||||
const __m128 t1 = _mm_castsi128_ps(block1);
|
||||
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
Swap128(x1, y1);
|
||||
const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
|
||||
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
|
||||
|
||||
const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = _mm_unpacklo_epi32(y1, x1);
|
||||
block1 = _mm_unpackhi_epi32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
const __m128 t0 = _mm_castsi128_ps(block0);
|
||||
const __m128 t1 = _mm_castsi128_ps(block1);
|
||||
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
const __m128 t2 = _mm_castsi128_ps(block2);
|
||||
const __m128 t3 = _mm_castsi128_ps(block3);
|
||||
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
const __m128 t4 = _mm_castsi128_ps(block4);
|
||||
const __m128 t5 = _mm_castsi128_ps(block5);
|
||||
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
|
||||
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
|
||||
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);
|
||||
|
||||
const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*4));
|
||||
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
|
||||
x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
|
||||
x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*4));
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
|
||||
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
|
||||
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
|
||||
Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = _mm_unpacklo_epi32(y1, x1);
|
||||
block1 = _mm_unpackhi_epi32(y1, x1);
|
||||
block2 = _mm_unpacklo_epi32(y2, x2);
|
||||
block3 = _mm_unpackhi_epi32(y2, x2);
|
||||
block4 = _mm_unpacklo_epi32(y3, x3);
|
||||
block5 = _mm_unpackhi_epi32(y3, x3);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
const __m128 t0 = _mm_castsi128_ps(block0);
|
||||
const __m128 t1 = _mm_castsi128_ps(block1);
|
||||
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
const __m128 t2 = _mm_castsi128_ps(block2);
|
||||
const __m128 t3 = _mm_castsi128_ps(block3);
|
||||
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
const __m128 t4 = _mm_castsi128_ps(block4);
|
||||
const __m128 t5 = _mm_castsi128_ps(block5);
|
||||
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
|
||||
const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
|
||||
y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
|
||||
y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
|
||||
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
|
||||
x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
|
||||
x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);
|
||||
|
||||
const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
|
||||
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
|
||||
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = _mm_unpacklo_epi32(y1, x1);
|
||||
block1 = _mm_unpackhi_epi32(y1, x1);
|
||||
block2 = _mm_unpacklo_epi32(y2, x2);
|
||||
block3 = _mm_unpackhi_epi32(y2, x2);
|
||||
block4 = _mm_unpacklo_epi32(y3, x3);
|
||||
block5 = _mm_unpackhi_epi32(y3, x3);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_SSE41_AVAILABLE
|
||||
|
||||
// ***************************** Altivec ***************************** //
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
|
||||
using CryptoPP::uint8x16_p;
|
||||
using CryptoPP::uint32x4_p;
|
||||
|
||||
using CryptoPP::VecAnd;
|
||||
using CryptoPP::VecXor;
|
||||
using CryptoPP::VecLoad;
|
||||
using CryptoPP::VecLoadAligned;
|
||||
using CryptoPP::VecPermute;
|
||||
|
||||
// Rotate left by bit count
|
||||
template<unsigned int C>
|
||||
inline uint32x4_p RotateLeft32(const uint32x4_p val)
|
||||
{
|
||||
const uint32x4_p m = {C, C, C, C};
|
||||
return vec_rl(val, m);
|
||||
}
|
||||
|
||||
// Rotate right by bit count
|
||||
template<unsigned int C>
|
||||
inline uint32x4_p RotateRight32(const uint32x4_p val)
|
||||
{
|
||||
const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
|
||||
return vec_rl(val, m);
|
||||
}
|
||||
|
||||
inline uint32x4_p SIMON64_f(const uint32x4_p val)
|
||||
{
|
||||
return VecXor(RotateLeft32<2>(val),
|
||||
VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
#else
|
||||
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_p x1 = VecPermute(block0, block1, m1);
|
||||
uint32x4_p y1 = VecPermute(block0, block1, m2);
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const uint32x4_p rk1 = VecLoadAligned(subkeys+i*4);
|
||||
const uint32x4_p rk2 = VecLoadAligned(subkeys+(i+1)*4);
|
||||
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
|
||||
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const uint32x4_p rk = VecLoadAligned(subkeys+(rounds-1)*4);
|
||||
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
|
||||
std::swap(x1, y1);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
|
||||
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
|
||||
#else
|
||||
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
|
||||
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
#else
|
||||
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_p x1 = VecPermute(block0, block1, m1);
|
||||
uint32x4_p y1 = VecPermute(block0, block1, m2);
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
std::swap(x1, y1);
|
||||
#if defined(_ARCH_PWR7)
|
||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk = VecLoad(subkeys+rounds-1);
|
||||
rk = VecPermute(rk, rk, m);
|
||||
#endif
|
||||
y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
#if defined(_ARCH_PWR7)
|
||||
const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
|
||||
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk1 = VecLoad(subkeys+i+1);
|
||||
uint32x4_p rk2 = VecLoad(subkeys+i);
|
||||
rk1 = VecPermute(rk1, rk1, m);
|
||||
rk2 = VecPermute(rk2, rk2, m);
|
||||
#endif
|
||||
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
|
||||
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
|
||||
#else
|
||||
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
|
||||
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
|
||||
uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
#else
|
||||
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
|
||||
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
|
||||
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
|
||||
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
|
||||
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
|
||||
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const uint32x4_p rk1 = VecLoadAligned(subkeys+i*4);
|
||||
const uint32x4_p rk2 = VecLoadAligned(subkeys+(i+1)*4);
|
||||
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
|
||||
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
|
||||
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);
|
||||
|
||||
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
|
||||
x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
|
||||
x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const uint32x4_p rk = VecLoadAligned(subkeys+(rounds-1)*4);
|
||||
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
|
||||
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
|
||||
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
|
||||
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
|
||||
#else
|
||||
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
|
||||
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
|
||||
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
|
||||
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
|
||||
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
|
||||
uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
#else
|
||||
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
|
||||
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
|
||||
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
|
||||
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
|
||||
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
|
||||
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
#if defined(_ARCH_PWR7)
|
||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk = VecLoad(subkeys+rounds-1);
|
||||
rk = VecPermute(rk, rk, m);
|
||||
#endif
|
||||
y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
|
||||
y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
|
||||
y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
#if defined(_ARCH_PWR7)
|
||||
const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
|
||||
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk1 = VecLoad(subkeys+i+1);
|
||||
uint32x4_p rk2 = VecLoad(subkeys+i);
|
||||
rk1 = VecPermute(rk1, rk1, m);
|
||||
rk2 = VecPermute(rk2, rk2, m);
|
||||
#endif
|
||||
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
|
||||
x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
|
||||
x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);
|
||||
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
|
||||
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
|
||||
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
|
||||
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
|
||||
#else
|
||||
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
|
||||
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
|
||||
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
|
||||
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
|
||||
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
|
||||
ANONYMOUS_NAMESPACE_END
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
// *************************** ARM NEON **************************** //
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
// ***************************** IA-32 ***************************** //
|
||||
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif
|
||||
|
||||
// ***************************** Altivec ***************************** //
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif
|
||||
|
||||
NAMESPACE_END
|

speck.cpp
@ -171,12 +171,6 @@ ANONYMOUS_NAMESPACE_END
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
extern size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
@ -200,14 +194,6 @@ extern size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
extern size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
extern size_t SPECK128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
@ -218,39 +204,11 @@ extern size_t SPECK128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys,
|
||||
|
||||
std::string SPECK64::Base::AlgorithmProvider() const
|
||||
{
|
||||
#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
|
||||
# if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return "SSE4.1";
|
||||
# endif
|
||||
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return "NEON";
|
||||
# endif
|
||||
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return "Altivec";
|
||||
# endif
|
||||
#endif
|
||||
return "C++";
|
||||
}
|
||||
|
||||
unsigned int SPECK64::Base::OptimalDataAlignment() const
|
||||
{
|
||||
#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
|
||||
# if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return 16; // load __m128i
|
||||
# endif
|
||||
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return 4; // load uint32x4_t
|
||||
# endif
|
||||
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return 16; // load uint32x4_p
|
||||
# endif
|
||||
#endif
|
||||
return GetAlignmentOf<word32>();
|
||||
}
|
||||
|
||||
@ -283,29 +241,6 @@ void SPECK64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
|
||||
default:
|
||||
CRYPTOPP_ASSERT(0);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
// Pre-splat the round keys for Altivec forward transformation
|
||||
#if CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
if (IsForwardTransformation() && HasAltivec())
|
||||
{
|
||||
AlignedSecBlock presplat(m_rkeys.size()*4);
|
||||
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
|
||||
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
|
||||
m_rkeys.swap(presplat);
|
||||
}
|
||||
#elif CRYPTOPP_SSE41_AVAILABLE
|
||||
if (IsForwardTransformation() && HasSSE41())
|
||||
{
|
||||
AlignedSecBlock presplat(m_rkeys.size()*4);
|
||||
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
|
||||
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
|
||||
m_rkeys.swap(presplat);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
|
||||
}
|
||||
|
||||
void SPECK64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||
@ -505,50 +440,6 @@ void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
|
||||
size_t SPECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return SPECK64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return SPECK64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SPECK64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return SPECK64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return SPECK64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
#if (CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS)
|
||||
size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
|

speck.h
@ -17,14 +17,6 @@
#include "seckey.h"
#include "secblock.h"

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
    CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
    CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
# ifndef CRYPTOPP_DISABLE_SPECK_SIMD
#  define CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS 1
# endif
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
    CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
    CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
@ -36,13 +28,9 @@
// Yet another SunStudio/SunCC workaround. Failed self tests
// in SSE code paths on i386 for SunStudio 12.3 and below.
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
# undef CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
# undef CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS
#endif

// https://github.com/weidai11/cryptopp/issues/945
#undef CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS

NAMESPACE_BEGIN(CryptoPP)

/// \brief SPECK block cipher information
@ -129,9 +117,6 @@ public:
    {
    public:
        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
    };

    /// \brief SPECK64 decryption transformation
@ -142,9 +127,6 @@ public:
    {
    public:
        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
    };

    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
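
Although this commit removes the SPECK64 SIMD path, the interface declared in speck.h is unchanged, so existing callers keep invoking AdvancedProcessBlocks exactly as before and now land in the generic BlockTransformation fallback. A minimal usage sketch (standard Crypto++ API; key and data values are arbitrary placeholders):

#include "speck.h"
#include <vector>

int main()
{
    using namespace CryptoPP;

    // SPECK64/128: 8-byte block, 16-byte key (all-zero values used only for illustration).
    const byte key[16] = {0};
    SPECK64::Encryption enc;
    enc.SetKey(key, sizeof(key));

    // Four consecutive 8-byte blocks processed in one call. With the SIMD path
    // removed, this resolves to the block-at-a-time loop in BlockTransformation.
    std::vector<byte> in(4 * SPECK64::BLOCKSIZE, 0), out(in.size());
    enc.AdvancedProcessBlocks(in.data(), NULLPTR, out.data(), in.size(), 0);
    return 0;
}
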

speck64_simd.cpp
@ -1,781 +0,0 @@
|
||||
// speck64_simd.cpp - written and placed in the public domain by Jeffrey Walton
|
||||
//
|
||||
// This source file uses intrinsics and built-ins to gain access to
|
||||
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
|
||||
// source file is needed because additional CXXFLAGS are required to enable
|
||||
// the appropriate instructions sets in some build configurations.
|
||||
|
||||
#include "pch.h"
|
||||
#include "config.h"
|
||||
|
||||
#include "speck.h"
|
||||
#include "misc.h"
|
||||
|
||||
// Uncomment for benchmarking C++ against SSE or NEON.
|
||||
// Do so in both speck.cpp and speck_simd.cpp.
|
||||
// #undef CRYPTOPP_SSE41_AVAILABLE
|
||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
# include "adv_simd.h"
|
||||
# include <pmmintrin.h>
|
||||
# include <tmmintrin.h>
|
||||
# include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__XOP__)
|
||||
# include <ammintrin.h>
|
||||
# if defined(__GNUC__)
|
||||
# include <x86intrin.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_HEADER)
|
||||
# include "adv_simd.h"
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_ACLE_HEADER)
|
||||
# include <stdint.h>
|
||||
# include <arm_acle.h>
|
||||
#endif
|
||||
|
||||
#if defined(_M_ARM64)
|
||||
# include "adv_simd.h"
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
# include "adv_simd.h"
|
||||
# include "ppc_simd.h"
|
||||
#endif
|
||||
|
||||
// Squash MS LNK4221 and libtool warnings
|
||||
extern const char SPECK64_SIMD_FNAME[] = __FILE__;
|
||||
|
||||
ANONYMOUS_NAMESPACE_BEGIN
|
||||
|
||||
using CryptoPP::byte;
|
||||
using CryptoPP::word32;
|
||||
using CryptoPP::word64;
|
||||
|
||||
// *************************** ARM NEON ************************** //
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
|
||||
template <class T>
|
||||
inline T UnpackHigh32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_high_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_high_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline T UnpackLow32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_low_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_low_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}

template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
    const uint32x4_t b(vshrq_n_u32(val, R));
    return vorrq_u32(a, b);
}

#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif // Aarch32 or Aarch64

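// Note: a rotation by 8 bits moves whole bytes, so it can be expressed as a
// byte permutation. The specializations above therefore use a single table
// lookup (vqtbl1q_u8, available on Aarch32/Aarch64) instead of the
// shift/shift/or sequence of the generic templates.
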
inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x1 = vaddq_u32(x1, y1);
        x1 = veorq_u32(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = veorq_u32(y1, x1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

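// For reference, the loop above is the usual SPECK64 encryption round on
// scalar 32-bit words, applied to several blocks at once (x holds the high
// words, y the low words):
//
//   x = (ROR32(x, 8) + y) ^ k;
//   y = ROL32(y, 3) ^ x;
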
inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = veorq_u32(x1, rk);
        x1 = vsubq_u32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

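// Decryption undoes one encryption round by applying the inverse operations
// in reverse order, consuming the subkeys from last to first:
//
//   y = ROR32(y ^ x, 3);
//   x = ROL32((x ^ k) - y, 8);
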
inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = vaddq_u32(x1, y1);
        x2 = vaddq_u32(x2, y2);
        x3 = vaddq_u32(x3, y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

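// The 6-block variants run the same round function over three independent
// (x, y) register pairs. Interleaving the three instruction streams this way
// helps hide instruction latency on wide out-of-order cores.
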
inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        x1 = vsubq_u32(x1, y1);
        x2 = vsubq_u32(x2, y2);
        x3 = vsubq_u32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if (CRYPTOPP_SSE41_AVAILABLE)

#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

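// Note: M128_CAST/CONST_M128_CAST funnel pointers through void* before the
// __m128i* conversion, so byte and word pointers can be handed to the SSE
// load/store intrinsics with the pointer conversion kept explicit and quiet
// across compilers.
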
template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}

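// Note: as in the NEON code, an 8-bit rotation is a pure byte permutation, so
// the specializations use a single SSSE3 _mm_shuffle_epi8. When AMD XOP is
// available, _mm_roti_epi32 rotates each 32-bit lane directly and is used for
// every rotation amount.
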
inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));

        x1 = RotateRight32<8>(x1);
        x1 = _mm_add_epi32(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

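// Note: _mm_shuffle_ps with _MM_SHUFFLE(3,1,3,1) and (2,0,2,0) gathers the odd
// and even 32-bit words of the two input blocks, mirroring the NEON vuzp
// above. Per the comment in the loop, the forward key schedule pre-splats each
// subkey across a full 128-bit word (hence the subkeys+i*4 stride and the
// aligned load), while decryption splats on the fly with _mm_set1_epi32.
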
inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = _mm_add_epi32(x1, y1);
        x2 = _mm_add_epi32(x2, y2);
        x3 = _mm_add_epi32(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x2 = _mm_sub_epi32(x2, y2);
        x3 = _mm_sub_epi32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

#endif // CRYPTOPP_SSE41_AVAILABLE

// ***************************** Altivec ***************************** //

#if (CRYPTOPP_ALTIVEC_AVAILABLE)
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;

using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecLoadAligned;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(val, m);
}

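// Note: Altivec only provides a rotate-left (vec_rl), so a right rotation by
// C bits is issued as a left rotation by 32-C bits.
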
void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const uint32x4_p rk = VecLoadAligned(subkeys+i*4);

        x1 = RotateRight32<8>(x1);
        x1 = VecAdd(x1, y1);
        x1 = VecXor(x1, rk);

        y1 = RotateLeft32<3>(y1);
        y1 = VecXor(y1, x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

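// Note: on Altivec the deinterleave and reinterleave are done with explicit
// byte permutes. The m1/m2 masks gather the odd and even 32-bit words of the
// two input vectors, and m3/m4 restore the original word order; separate mask
// values are needed for big- and little-endian targets.
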
void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
#if defined(_ARCH_PWR7)
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(y1, x1);
        y1 = RotateRight32<3>(y1);

        x1 = VecXor(x1, rk);
        x1 = VecSub(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const uint32x4_p rk = VecLoadAligned(subkeys+i*4);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);

        x1 = VecAdd(x1, y1);
        x2 = VecAdd(x2, y2);
        x3 = VecAdd(x3, y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
#if defined(_ARCH_PWR7)
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);

        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        x1 = VecSub(x1, y1);
        x2 = VecSub(x2, y2);
        x3 = VecSub(x3, y3);

        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** IA-32 ***************************** //

#if (CRYPTOPP_SSE41_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** Altivec ***************************** //

#if (CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

NAMESPACE_END
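
All of the SIMD paths above implement the same two-word SPECK64 round; only the vector plumbing differs. The following scalar sketch shows that round and its inverse for a single block. It is illustrative only and not part of Crypto++: the key schedule is omitted, the round keys are arbitrary stand-ins supplied by the caller, and the names speck64_encrypt/speck64_decrypt are invented for this example.

// speck64_scalar_sketch.cpp - illustrative only, not part of the library.
#include <cstdint>
#include <cstdio>

// Rotations on 32-bit words.
static inline uint32_t rol32(uint32_t v, unsigned r) { return (v << r) | (v >> (32 - r)); }
static inline uint32_t ror32(uint32_t v, unsigned r) { return (v >> r) | (v << (32 - r)); }

// One SPECK64 block is the word pair (x, y). Encrypt/decrypt with
// caller-supplied round keys rk[0..rounds-1] (key schedule not shown).
static void speck64_encrypt(uint32_t& x, uint32_t& y, const uint32_t* rk, unsigned rounds)
{
    for (unsigned i = 0; i < rounds; ++i) {
        x = (ror32(x, 8) + y) ^ rk[i];   // x = (x >>> 8) + y, then key mix
        y = rol32(y, 3) ^ x;             // y = (y <<< 3) ^ x
    }
}

static void speck64_decrypt(uint32_t& x, uint32_t& y, const uint32_t* rk, unsigned rounds)
{
    for (unsigned i = rounds; i-- > 0; ) {
        y = ror32(y ^ x, 3);             // undo y = (y <<< 3) ^ x
        x = rol32((x ^ rk[i]) - y, 8);   // undo x = ((x >>> 8) + y) ^ k
    }
}

int main()
{
    // Arbitrary stand-in round keys; a real key schedule would derive them
    // from the cipher key. 27 rounds matches SPECK64 with a 128-bit key.
    uint32_t rk[27];
    for (unsigned i = 0; i < 27; ++i)
        rk[i] = 0x9e3779b9u * (i + 1);

    uint32_t x = 0x74614620, y = 0x736e6165;
    speck64_encrypt(x, y, rk, 27);
    std::printf("ciphertext: %08x %08x\n", (unsigned)x, (unsigned)y);

    speck64_decrypt(x, y, rk, 27);
    std::printf("round trip: %08x %08x\n", (unsigned)x, (unsigned)y);  // original words again
    return 0;
}

Encrypting and then decrypting with the same stand-in keys round-trips the original words, which makes the sketch a convenient smoke test when porting or verifying one of the vector paths.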