Remove 64-bit AdvancedProcessBlocks (GH #945)

Jeffrey Walton 2020-07-07 15:22:09 -04:00
parent 84ab419029
commit dd7598e638
GPG Key ID: B36AB348921B1838
19 changed files with 25 additions and 4130 deletions
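
For context, each 64-bit cipher touched below (CHAM64, SIMECK64, SIMON64, SPECK64) carried a pair of AdvancedProcessBlocks overrides that handed whole buffers to a SIMD kernel. The sketch below restates that pattern using the SIMECK64 encryption override as it appears in the hunks removed further down; only the comments are added, every name comes from this diff. After this commit the 64-bit classes always fall through to the generic base-class path, while the 128-bit variants keep their overrides.

size_t SIMECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
    byte *outBlocks, size_t length, word32 flags) const
{
# if (CRYPTOPP_SSSE3_AVAILABLE)
    // Hand the whole buffer to the SSSE3 kernel that lived in simeck_simd.cpp.
    if (HasSSSE3()) {
        return SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
            inBlocks, xorBlocks, outBlocks, length, flags);
    }
# endif // CRYPTOPP_SSSE3_AVAILABLE
    // Otherwise process one block at a time through the generic path.
    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}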


@@ -334,10 +334,8 @@ simple.cpp
simple.h
siphash.h
simeck.cpp
simeck_simd.cpp
simeck.h
simon.cpp
simon64_simd.cpp
simon128_simd.cpp
simon.h
skipjack.cpp
@@ -351,7 +349,6 @@ smartptr.h
sosemanuk.cpp
sosemanuk.h
speck.cpp
speck64_simd.cpp
speck128_simd.cpp
speck.h
square.cpp


@@ -292,7 +292,6 @@ ifeq ($(DETECT_FEATURES),1)
CHAM_FLAG = $(SSSE3_FLAG)
KECCAK_FLAG = $(SSSE3_FLAG)
LEA_FLAG = $(SSSE3_FLAG)
SIMECK_FLAG = $(SSSE3_FLAG)
SIMON128_FLAG = $(SSSE3_FLAG)
SPECK128_FLAG = $(SSSE3_FLAG)
SUN_LDFLAGS += $(SSSE3_FLAG)
@@ -306,8 +305,6 @@ ifeq ($(DETECT_FEATURES),1)
ifeq ($(strip $(HAVE_OPT)),0)
BLAKE2B_FLAG = $(SSE41_FLAG)
BLAKE2S_FLAG = $(SSE41_FLAG)
SIMON64_FLAG = $(SSE41_FLAG)
SPECK64_FLAG = $(SSE41_FLAG)
SUN_LDFLAGS += $(SSE41_FLAG)
else
SSE41_FLAG =
@@ -478,10 +475,7 @@ ifeq ($(DETECT_FEATURES),1)
CHAM_FLAG = -march=armv7-a -mfpu=neon
LEA_FLAG = -march=armv7-a -mfpu=neon
SHA_FLAG = -march=armv7-a -mfpu=neon
SIMECK_FLAG = -march=armv7-a -mfpu=neon
SIMON64_FLAG = -march=armv7-a -mfpu=neon
SIMON128_FLAG = -march=armv7-a -mfpu=neon
SPECK64_FLAG = -march=armv7-a -mfpu=neon
SPECK128_FLAG = -march=armv7-a -mfpu=neon
SM4_FLAG = -march=armv7-a -mfpu=neon
else
@@ -521,10 +515,7 @@ ifeq ($(DETECT_FEATURES),1)
CHAM_FLAG = -march=armv8-a
LEA_FLAG = -march=armv8-a
NEON_FLAG = -march=armv8-a
SIMECK_FLAG = -march=armv8-a
SIMON64_FLAG = -march=armv8-a
SIMON128_FLAG = -march=armv8-a
SPECK64_FLAG = -march=armv8-a
SPECK128_FLAG = -march=armv8-a
SM4_FLAG = -march=armv8-a
else
@@ -658,7 +649,6 @@ ifeq ($(DETECT_FEATURES),1)
LEA_FLAG = $(POWER8_FLAG)
SHA_FLAG = $(POWER8_FLAG)
SHACAL2_FLAG = $(POWER8_FLAG)
SIMECK_FLAG = $(POWER8_FLAG)
else
POWER8_FLAG =
endif
@@ -724,8 +714,6 @@ ifeq ($(DETECT_FEATURES),1)
ifneq ($(ALTIVEC_FLAG),)
BLAKE2S_FLAG = $(ALTIVEC_FLAG)
CHACHA_FLAG = $(ALTIVEC_FLAG)
SIMON64_FLAG = $(ALTIVEC_FLAG)
SPECK64_FLAG = $(ALTIVEC_FLAG)
SPECK128_FLAG = $(ALTIVEC_FLAG)
SIMON128_FLAG = $(ALTIVEC_FLAG)
endif
@@ -1612,22 +1600,10 @@ sha3_simd.o : sha3_simd.cpp
shacal2_simd.o : shacal2_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SHA_FLAG) -c) $<
# SSSE3 or NEON available
simeck_simd.o : simeck_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMECK_FLAG) -c) $<
# SSE4.1, NEON or POWER7 available
simon64_simd.o : simon64_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMON64_FLAG) -c) $<
# SSSE3, NEON or POWER8 available
simon128_simd.o : simon128_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMON128_FLAG) -c) $<
# SSE4.1, NEON or POWER7 available
speck64_simd.o : speck64_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SPECK64_FLAG) -c) $<
# SSSE3, NEON or POWER8 available
speck128_simd.o : speck128_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SPECK128_FLAG) -c) $<


@@ -241,7 +241,6 @@ ifeq ($(DETECT_FEATURES),1)
ARIA_FLAG = $(SSSE3_FLAG)
CHAM_FLAG = $(SSSE3_FLAG)
LEA_FLAG = $(SSSE3_FLAG)
SIMECK_FLAG = $(SSSE3_FLAG)
SIMON128_FLAG = $(SSSE3_FLAG)
SPECK128_FLAG = $(SSSE3_FLAG)
else
@@ -254,8 +253,6 @@ ifeq ($(DETECT_FEATURES),1)
ifeq ($(strip $(HAVE_OPT)),0)
BLAKE2B_FLAG = $(SSE41_FLAG)
BLAKE2S_FLAG = $(SSE41_FLAG)
SIMON64_FLAG = $(SSE41_FLAG)
SPECK64_FLAG = $(SSE41_FLAG)
else
SSE41_FLAG =
endif
@@ -400,10 +397,7 @@ ifeq ($(DETECT_FEATURES),1)
CHAM_FLAG = $(NEON_FLAG)
LEA_FLAG = $(NEON_FLAG)
SHA_FLAG = $(NEON_FLAG)
SIMECK_FLAG = $(NEON_FLAG)
SIMON64_FLAG = $(NEON_FLAG)
SIMON128_FLAG = $(NEON_FLAG)
SPECK64_FLAG = $(NEON_FLAG)
SPECK128_FLAG = $(NEON_FLAG)
SM4_FLAG = $(NEON_FLAG)
else
@@ -457,10 +451,7 @@ ifeq ($(DETECT_FEATURES),1)
CHAM_FLAG = $(ASIMD_FLAG)
LEA_FLAG = $(ASIMD_FLAG)
NEON_FLAG = $(ASIMD_FLAG)
SIMECK_FLAG = $(ASIMD_FLAG)
SIMON64_FLAG = $(ASIMD_FLAG)
SIMON128_FLAG = $(ASIMD_FLAG)
SPECK64_FLAG = $(ASIMD_FLAG)
SPECK128_FLAG = $(ASIMD_FLAG)
SM4_FLAG = $(ASIMD_FLAG)
else
@@ -933,22 +924,10 @@ sha512_armv4.o : sha512_armv4.S
shacal2_simd.o : shacal2_simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<
# SSSE3 or NEON available
simeck_simd.o : simeck_simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SIMECK_FLAG) -c) $<
# SSE4.1, NEON or POWER7 available
simon64_simd.o : simon64_simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SIMON64_FLAG) -c) $<
# SSSE3, NEON or POWER8 available
simon128_simd.o : simon128_simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SIMON128_FLAG) -c) $<
# SSE4.1, NEON or POWER7 available
speck64_simd.o : speck64_simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SPECK64_FLAG) -c) $<
# SSSE3, NEON or POWER8 available
speck128_simd.o : speck128_simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SPECK128_FLAG) -c) $<

adv_simd.h: 1116 changed lines (file diff suppressed because it is too large)


@@ -96,7 +96,7 @@ ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
# if (CRYPTOPP_SSSE3_AVAILABLE)
extern size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -110,11 +110,11 @@ extern size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, siz
extern size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
# endif // CRYPTOPP_SSSE3_AVAILABLE
#endif // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
std::string CHAM64::Base::AlgorithmProvider() const
{
#if (CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS)
#if (CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS)
# if defined(CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3())
return "SSSE3";
@@ -336,31 +336,7 @@ void CHAM128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock,
oblock(m_x[0])(m_x[1])(m_x[2])(m_x[3]);
}
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
size_t CHAM64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
# if (CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3()) {
return CHAM64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, 80,
inBlocks, xorBlocks, outBlocks, length, flags);
}
# endif // CRYPTOPP_SSSE3_AVAILABLE
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t CHAM64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
# if (CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3()) {
return CHAM64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, 80,
inBlocks, xorBlocks, outBlocks, length, flags);
}
# endif // CRYPTOPP_SSSE3_AVAILABLE
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
size_t CHAM128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
@@ -386,6 +362,6 @@ size_t CHAM128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor
# endif // CRYPTOPP_SSSE3_AVAILABLE
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
NAMESPACE_END

cham.h: 19 changed lines

@@ -16,18 +16,15 @@
#include "algparam.h"
#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
# define CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS 1
# define CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS 1
#endif
// Yet another SunStudio/SunCC workaround. Failed self tests
// in SSE code paths on i386 for SunStudio 12.3 and below.
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
# undef CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
# undef CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
#endif
// https://github.com/weidai11/cryptopp/issues/945
#undef CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
NAMESPACE_BEGIN(CryptoPP)
/// \brief CHAM block cipher information
@@ -92,10 +89,6 @@ public:
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};
/// \brief Decryption transformation
@@ -106,10 +99,6 @@ public:
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};
/// \brief CHAM64 encryption
@@ -156,7 +145,7 @@ public:
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};
@@ -170,7 +159,7 @@ public:
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};


@@ -45,600 +45,6 @@ using CryptoPP::word32;
//////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(W16) // CHAM64, 16-bit word size
template <unsigned int R>
inline __m128i RotateLeft16(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi16(val, R);
#else
return _mm_or_si128(
_mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
#endif
}
template <unsigned int R>
inline __m128i RotateRight16(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi16(val, 16-R);
#else
return _mm_or_si128(
_mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
#endif
}
template <>
inline __m128i RotateLeft16<8>(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi16(val, 8);
#else
const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
return _mm_shuffle_epi8(val, mask);
#endif
}
template <>
inline __m128i RotateRight16<8>(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi16(val, 16-8);
#else
const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
return _mm_shuffle_epi8(val, mask);
#endif
}
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
// Should not be instantiated
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
CRYPTOPP_UNUSED(e); CRYPTOPP_UNUSED(f);
CRYPTOPP_UNUSED(g); CRYPTOPP_UNUSED(h);
CRYPTOPP_ASSERT(0);
return _mm_setzero_si128();
}
template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
// The shuffle converts to and from little-endian for SSE. A specialized
// CHAM implementation can avoid the shuffle by framing the data for
// encryption, decryption and benchmarks. The library cannot take the
// speed-up because of the byte oriented API.
const __m128i r1 = _mm_unpacklo_epi16(a, b);
const __m128i r2 = _mm_unpacklo_epi16(c, d);
const __m128i r3 = _mm_unpacklo_epi16(e, f);
const __m128i r4 = _mm_unpacklo_epi16(g, h);
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
// The shuffle converts to and from little-endian for SSE. A specialized
// CHAM implementation can avoid the shuffle by framing the data for
// encryption, decryption and benchmarks. The library cannot take the
// speed-up because of the byte oriented API.
const __m128i r1 = _mm_unpacklo_epi16(a, b);
const __m128i r2 = _mm_unpacklo_epi16(c, d);
const __m128i r3 = _mm_unpacklo_epi16(e, f);
const __m128i r4 = _mm_unpacklo_epi16(g, h);
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
// The shuffle converts to and from little-endian for SSE. A specialized
// CHAM implementation can avoid the shuffle by framing the data for
// encryption, decryption and benchmarks. The library cannot take the
// speed-up because of the byte oriented API.
const __m128i r1 = _mm_unpacklo_epi16(a, b);
const __m128i r2 = _mm_unpacklo_epi16(c, d);
const __m128i r3 = _mm_unpacklo_epi16(e, f);
const __m128i r4 = _mm_unpacklo_epi16(g, h);
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
// The shuffle converts to and from little-endian for SSE. A specialized
// CHAM implementation can avoid the shuffle by framing the data for
// encryption, decryption and benchmarks. The library cannot take the
// speed-up because of the byte oriented API.
const __m128i r1 = _mm_unpacklo_epi16(a, b);
const __m128i r2 = _mm_unpacklo_epi16(c, d);
const __m128i r3 = _mm_unpacklo_epi16(e, f);
const __m128i r4 = _mm_unpacklo_epi16(g, h);
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
// The shuffle converts to and from little-endian for SSE. A specialized
// CHAM implementation can avoid the shuffle by framing the data for
// encryption, decryption and benchmarks. The library cannot take the
// speed-up because of the byte oriented API.
const __m128i r1 = _mm_unpackhi_epi16(a, b);
const __m128i r2 = _mm_unpackhi_epi16(c, d);
const __m128i r3 = _mm_unpackhi_epi16(e, f);
const __m128i r4 = _mm_unpackhi_epi16(g, h);
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
// The shuffle converts to and from little-endian for SSE. A specialized
// CHAM implementation can avoid the shuffle by framing the data for
// encryption, decryption and benchmarks. The library cannot take the
// speed-up because of the byte oriented API.
const __m128i r1 = _mm_unpackhi_epi16(a, b);
const __m128i r2 = _mm_unpackhi_epi16(c, d);
const __m128i r3 = _mm_unpackhi_epi16(e, f);
const __m128i r4 = _mm_unpackhi_epi16(g, h);
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
// The shuffle converts to and from little-endian for SSE. A specialized
// CHAM implementation can avoid the shuffle by framing the data for
// encryption, decryption and benchmarks. The library cannot take the
// speed-up because of the byte oriented API.
const __m128i r1 = _mm_unpackhi_epi16(a, b);
const __m128i r2 = _mm_unpackhi_epi16(c, d);
const __m128i r3 = _mm_unpackhi_epi16(e, f);
const __m128i r4 = _mm_unpackhi_epi16(g, h);
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <>
inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
// The shuffle converts to and from little-endian for SSE. A specialized
// CHAM implementation can avoid the shuffle by framing the data for
// encryption, decryption and benchmarks. The library cannot take the
// speed-up because of the byte oriented API.
const __m128i r1 = _mm_unpackhi_epi16(a, b);
const __m128i r2 = _mm_unpackhi_epi16(c, d);
const __m128i r3 = _mm_unpackhi_epi16(e, f);
const __m128i r4 = _mm_unpackhi_epi16(g, h);
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
}
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
// Should not be instantiated
CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
return _mm_setzero_si128();
}
template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
}
template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
}
template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
}
template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
}
template <>
inline __m128i UnpackXMM<4>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
}
template <>
inline __m128i UnpackXMM<5>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
}
template <>
inline __m128i UnpackXMM<6>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
}
template <>
inline __m128i UnpackXMM<7>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
}
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b)
{
const __m128i& z = _mm_setzero_si128();
return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
}
template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
{
return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
}
template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
return UnpackXMM<IDX>(v);
}
inline void CHAM64_Enc_Block(__m128i &block0,
const word16 *subkeys, unsigned int /*rounds*/)
{
// Rearrange the data for vectorization. UnpackXMM includes a
// little-endian swap for SSE. Thanks to Peter Cordes for help
// with packing and unpacking.
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
__m128i a = UnpackXMM<0>(block0);
__m128i b = UnpackXMM<1>(block0);
__m128i c = UnpackXMM<2>(block0);
__m128i d = UnpackXMM<3>(block0);
__m128i e = UnpackXMM<4>(block0);
__m128i f = UnpackXMM<5>(block0);
__m128i g = UnpackXMM<6>(block0);
__m128i h = UnpackXMM<7>(block0);
const unsigned int rounds = 80;
__m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
__m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
const unsigned int MASK = 15;
for (int i=0; i<static_cast<int>(rounds); i+=4)
{
__m128i k, kr, t1, t2, t3, t4;
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
// Shuffle out key
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
t1 = _mm_xor_si128(a, counter);
t3 = _mm_xor_si128(e, counter);
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
counter = _mm_add_epi16(counter, increment);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
t1 = _mm_xor_si128(b, counter);
t3 = _mm_xor_si128(f, counter);
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
counter = _mm_add_epi16(counter, increment);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
t1 = _mm_xor_si128(c, counter);
t3 = _mm_xor_si128(g, counter);
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
counter = _mm_add_epi16(counter, increment);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
t1 = _mm_xor_si128(d, counter);
t3 = _mm_xor_si128(h, counter);
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
counter = _mm_add_epi16(counter, increment);
}
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
}
inline void CHAM64_Dec_Block(__m128i &block0,
const word16 *subkeys, unsigned int /*rounds*/)
{
// Rearrange the data for vectorization. UnpackXMM includes a
// little-endian swap for SSE. Thanks to Peter Cordes for help
// with packing and unpacking.
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
__m128i a = UnpackXMM<0>(block0);
__m128i b = UnpackXMM<1>(block0);
__m128i c = UnpackXMM<2>(block0);
__m128i d = UnpackXMM<3>(block0);
__m128i e = UnpackXMM<4>(block0);
__m128i f = UnpackXMM<5>(block0);
__m128i g = UnpackXMM<6>(block0);
__m128i h = UnpackXMM<7>(block0);
const unsigned int rounds = 80;
__m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
__m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
const unsigned int MASK = 15;
for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
{
__m128i k, kr, t1, t2, t3, t4;
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
// Shuffle out key
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
// Odd round
t1 = RotateRight16<1>(d);
t3 = RotateRight16<1>(h);
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
counter = _mm_sub_epi16(counter, decrement);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
// Even round
t1 = RotateRight16<8>(c);
t3 = RotateRight16<8>(g);
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
counter = _mm_sub_epi16(counter, decrement);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
// Odd round
t1 = RotateRight16<1>(b);
t3 = RotateRight16<1>(f);
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
counter = _mm_sub_epi16(counter, decrement);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
// Even round
t1 = RotateRight16<8>(a);
t3 = RotateRight16<8>(e);
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
counter = _mm_sub_epi16(counter, decrement);
}
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
}
inline void CHAM64_Enc_2_Blocks(__m128i &block0,
__m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
{
// Rearrange the data for vectorization. UnpackXMM includes a
// little-endian swap for SSE. Thanks to Peter Cordes for help
// with packing and unpacking.
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
__m128i a = UnpackXMM<0>(block0, block1);
__m128i b = UnpackXMM<1>(block0, block1);
__m128i c = UnpackXMM<2>(block0, block1);
__m128i d = UnpackXMM<3>(block0, block1);
__m128i e = UnpackXMM<4>(block0, block1);
__m128i f = UnpackXMM<5>(block0, block1);
__m128i g = UnpackXMM<6>(block0, block1);
__m128i h = UnpackXMM<7>(block0, block1);
const unsigned int rounds = 80;
__m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
__m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
const unsigned int MASK = 15;
for (int i=0; i<static_cast<int>(rounds); i+=4)
{
__m128i k, kr, t1, t2, t3, t4;
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
// Shuffle out key
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
t1 = _mm_xor_si128(a, counter);
t3 = _mm_xor_si128(e, counter);
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
counter = _mm_add_epi16(counter, increment);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
t1 = _mm_xor_si128(b, counter);
t3 = _mm_xor_si128(f, counter);
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
counter = _mm_add_epi16(counter, increment);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
t1 = _mm_xor_si128(c, counter);
t3 = _mm_xor_si128(g, counter);
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
counter = _mm_add_epi16(counter, increment);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
t1 = _mm_xor_si128(d, counter);
t3 = _mm_xor_si128(h, counter);
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
counter = _mm_add_epi16(counter, increment);
}
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
}
inline void CHAM64_Dec_2_Blocks(__m128i &block0,
__m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
{
// Rearrange the data for vectorization. UnpackXMM includes a
// little-endian swap for SSE. Thanks to Peter Cordes for help
// with packing and unpacking.
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
__m128i a = UnpackXMM<0>(block0, block1);
__m128i b = UnpackXMM<1>(block0, block1);
__m128i c = UnpackXMM<2>(block0, block1);
__m128i d = UnpackXMM<3>(block0, block1);
__m128i e = UnpackXMM<4>(block0, block1);
__m128i f = UnpackXMM<5>(block0, block1);
__m128i g = UnpackXMM<6>(block0, block1);
__m128i h = UnpackXMM<7>(block0, block1);
const unsigned int rounds = 80;
__m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
__m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
const unsigned int MASK = 15;
for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
{
__m128i k, kr, t1, t2, t3, t4;
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
// Shuffle out key
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
// Odd round
t1 = RotateRight16<1>(d);
t3 = RotateRight16<1>(h);
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
counter = _mm_sub_epi16(counter, decrement);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
// Even round
t1 = RotateRight16<8>(c);
t3 = RotateRight16<8>(g);
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
counter = _mm_sub_epi16(counter, decrement);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
// Odd round
t1 = RotateRight16<1>(b);
t3 = RotateRight16<1>(f);
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
counter = _mm_sub_epi16(counter, decrement);
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
// Even round
t1 = RotateRight16<8>(a);
t3 = RotateRight16<8>(e);
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
counter = _mm_sub_epi16(counter, decrement);
}
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
}
NAMESPACE_END // W16
//////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size
template <unsigned int R>
@@ -1054,20 +460,6 @@ ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{


@@ -78,9 +78,9 @@ LIB_SRCS = \
rdtables.cpp rijndael.cpp rijndael_simd.cpp ripemd.cpp rng.cpp rsa.cpp \
rw.cpp safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp \
sha.cpp sha3.cpp sha_simd.cpp shacal2.cpp shacal2_simd.cpp shake.cpp \
shark.cpp sharkbox.cpp simeck.cpp simeck_simd.cpp simon.cpp \
simon128_simd.cpp simon64_simd.cpp skipjack.cpp sm3.cpp sm4.cpp \
sm4_simd.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp speck64_simd.cpp \
shark.cpp sharkbox.cpp simeck.cpp simon.cpp \
simon128_simd.cpp skipjack.cpp sm3.cpp sm4.cpp \
sm4_simd.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp \
square.cpp squaretb.cpp sse_simd.cpp strciphr.cpp tea.cpp tftables.cpp \
threefish.cpp tiger.cpp tigertab.cpp ttmac.cpp tweetnacl.cpp twofish.cpp \
vmac.cpp wake.cpp whrlpool.cpp xed25519.cpp xtr.cpp xtrcrypt.cpp xts.cpp \
@@ -109,9 +109,9 @@ LIB_OBJS = \
rdtables.obj rijndael.obj rijndael_simd.obj ripemd.obj rng.obj rsa.obj \
rw.obj safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj \
sha.obj sha3.obj sha_simd.obj shacal2.obj shacal2_simd.obj shake.obj \
shark.obj sharkbox.obj simeck.obj simeck_simd.obj simon.obj \
simon128_simd.obj simon64_simd.obj skipjack.obj sm3.obj sm4.obj \
sm4_simd.obj sosemanuk.obj speck.obj speck128_simd.obj speck64_simd.obj \
shark.obj sharkbox.obj simeck.obj simon.obj \
simon128_simd.obj skipjack.obj sm3.obj sm4.obj \
sm4_simd.obj sosemanuk.obj speck.obj speck128_simd.obj \
square.obj squaretb.obj sse_simd.obj strciphr.obj tea.obj tftables.obj \
threefish.obj tiger.obj tigertab.obj ttmac.obj tweetnacl.obj twofish.obj \
vmac.obj wake.obj whrlpool.obj xed25519.obj xtr.obj xtrcrypt.obj xts.obj \


@@ -315,9 +315,7 @@
<ClCompile Include="shark.cpp" />
<ClCompile Include="sharkbox.cpp" />
<ClCompile Include="simeck.cpp" />
<ClCompile Include="simeck_simd.cpp" />
<ClCompile Include="simon.cpp" />
<ClCompile Include="simon64_simd.cpp" />
<ClCompile Include="simon128_simd.cpp" />
<ClCompile Include="simple.cpp" />
<ClCompile Include="skipjack.cpp" />
@@ -326,7 +324,6 @@
<ClCompile Include="sm4_simd.cpp" />
<ClCompile Include="sosemanuk.cpp" />
<ClCompile Include="speck.cpp" />
<ClCompile Include="speck64_simd.cpp" />
<ClCompile Include="speck128_simd.cpp" />
<ClCompile Include="square.cpp" />
<ClCompile Include="squaretb.cpp" />


@@ -425,15 +425,9 @@
<ClCompile Include="simeck.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="simeck_simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="simon.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="simon64_simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="simon128_simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@@ -455,9 +449,6 @@
<ClCompile Include="speck.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="speck64_simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="speck128_simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>


@@ -33,16 +33,6 @@ ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
#if CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
# if (CRYPTOPP_SSSE3_AVAILABLE)
extern size_t SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
# endif // CRYPTOPP_SSSE3_AVAILABLE
#endif // CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
std::string SIMECK32::Base::AlgorithmProvider() const
{
return "C++";
@@ -104,10 +94,6 @@ void SIMECK32::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
std::string SIMECK64::Base::AlgorithmProvider() const
{
#if (CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3())
return "SSSE3";
#endif
return "C++";
}
@@ -165,30 +151,4 @@ void SIMECK64::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
oblock(m_t[0])(m_t[1]);
}
#if CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
size_t SIMECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
# if (CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3()) {
return SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
inBlocks, xorBlocks, outBlocks, length, flags);
}
# endif // CRYPTOPP_SSSE3_AVAILABLE
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SIMECK64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
# if (CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3()) {
return SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
inBlocks, xorBlocks, outBlocks, length, flags);
}
# endif // CRYPTOPP_SSSE3_AVAILABLE
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
NAMESPACE_END


@@ -17,19 +17,6 @@
#include "secblock.h"
#include "algparam.h"
#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
# define CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS 1
#endif
// Yet another SunStudio/SunCC workaround. Failed self tests
// in SSE code paths on i386 for SunStudio 12.3 and below.
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
# undef CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
#endif
// https://github.com/weidai11/cryptopp/issues/945
#undef CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
NAMESPACE_BEGIN(CryptoPP)
/// \brief SIMECK block cipher information


@@ -1,342 +0,0 @@
// simeck_simd.cpp - written and placed in the public domain by Gangqiang Yang and Jeffrey Walton.
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instructions sets in some build configurations.
#include "pch.h"
#include "config.h"
#include "simeck.h"
#include "misc.h"
// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both simon.cpp and simon_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif
#if defined(__XOP__)
# include <ammintrin.h>
# if defined(__GNUC__)
# include <x86intrin.h>
# endif
#endif
// Squash MS LNK4221 and libtool warnings
extern const char SIMECK_SIMD_FNAME[] = __FILE__;
// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::word16;
using CryptoPP::word32;
#if (CRYPTOPP_SSSE3_AVAILABLE)
//////////////////////////////////////////////////////////////////////////
template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, R);
#else
return _mm_or_si128(
_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}
template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, 32-R);
#else
return _mm_or_si128(
_mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, 8);
#else
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
return _mm_shuffle_epi8(val, mask);
#endif
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, 32-8);
#else
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
return _mm_shuffle_epi8(val, mask);
#endif
}
/// \brief Unpack XMM words
/// \tparam IDX the element from each XMM word
/// \param a the first XMM word
/// \param b the second XMM word
/// \param c the third XMM word
/// \param d the fourth XMM word
/// \details UnpackXMM selects the IDX element from a, b, c, d and returns a concatenation
/// equivalent to <tt>a[IDX] || b[IDX] || c[IDX] || d[IDX]</tt>.
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
// Should not be instantiated
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
CRYPTOPP_ASSERT(0);
return _mm_setzero_si128();
}
template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
const __m128i r1 = _mm_unpacklo_epi32(a, b);
const __m128i r2 = _mm_unpacklo_epi32(c, d);
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}
template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
const __m128i r1 = _mm_unpacklo_epi32(a, b);
const __m128i r2 = _mm_unpacklo_epi32(c, d);
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}
template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
const __m128i r1 = _mm_unpackhi_epi32(a, b);
const __m128i r2 = _mm_unpackhi_epi32(c, d);
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}
template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
const __m128i r1 = _mm_unpackhi_epi32(a, b);
const __m128i r2 = _mm_unpackhi_epi32(c, d);
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}
/// \brief Unpack a XMM word
/// \tparam IDX the element from each XMM word
/// \param v the first XMM word
/// \details UnpackXMM selects the IDX element from v and returns a concatenation
/// equivalent to <tt>v[IDX] || v[IDX] || v[IDX] || v[IDX]</tt>.
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
// Should not be instantiated
CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
return _mm_setzero_si128();
}
template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
}
template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
}
template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
}
template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
}
template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
return UnpackXMM<IDX>(a, b, c, d);
}
template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
return UnpackXMM<IDX>(v);
}
inline void SIMECK64_Encrypt(__m128i &a, __m128i &b, __m128i &c, __m128i &d, const __m128i key)
{
// SunStudio 12.3 workaround
__m128i s, t; s = a; t = c;
a = _mm_xor_si128(_mm_and_si128(a, RotateLeft32<5>(a)), RotateLeft32<1>(a));
c = _mm_xor_si128(_mm_and_si128(c, RotateLeft32<5>(c)), RotateLeft32<1>(c));
a = _mm_xor_si128(a, _mm_xor_si128(b, key));
c = _mm_xor_si128(c, _mm_xor_si128(d, key));
b = s; d = t;
}
inline void SIMECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned int /*rounds*/)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
__m128i a = UnpackXMM<0>(block0);
__m128i b = UnpackXMM<1>(block0);
__m128i c = UnpackXMM<2>(block0);
__m128i d = UnpackXMM<3>(block0);
const unsigned int rounds = 44;
for (int i = 0; i < static_cast<int>(rounds); i += 4)
{
const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
}
// [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
block0 = RepackXMM<0>(a,b,c,d);
}
inline void SIMECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned int /*rounds*/)
{
// SIMECK requires a word swap for the decryption transform
__m128i w = _mm_shuffle_epi32(block0, _MM_SHUFFLE(2, 3, 0, 1));
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
__m128i a = UnpackXMM<0>(w);
__m128i b = UnpackXMM<1>(w);
__m128i c = UnpackXMM<2>(w);
__m128i d = UnpackXMM<3>(w);
const unsigned int rounds = 44;
for (int i = static_cast<int>(rounds)-1; i >= 0; i -= 4)
{
const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i - 3));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
}
// [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
w = RepackXMM<0>(a,b,c,d);
block0 = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
}
inline void SIMECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
__m128i a = UnpackXMM<0>(block0, block1, block2, block3);
__m128i b = UnpackXMM<1>(block0, block1, block2, block3);
__m128i c = UnpackXMM<2>(block0, block1, block2, block3);
__m128i d = UnpackXMM<3>(block0, block1, block2, block3);
const unsigned int rounds = 44;
for (int i = 0; i < static_cast<int>(rounds); i += 4)
{
const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
}
// [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
block0 = RepackXMM<0>(a, b, c, d);
block1 = RepackXMM<1>(a, b, c, d);
block2 = RepackXMM<2>(a, b, c, d);
block3 = RepackXMM<3>(a, b, c, d);
}
inline void SIMECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
{
// SIMECK requires a word swap for the decryption transform
__m128i w = _mm_shuffle_epi32(block0, _MM_SHUFFLE(2, 3, 0, 1));
__m128i x = _mm_shuffle_epi32(block1, _MM_SHUFFLE(2, 3, 0, 1));
__m128i y = _mm_shuffle_epi32(block2, _MM_SHUFFLE(2, 3, 0, 1));
__m128i z = _mm_shuffle_epi32(block3, _MM_SHUFFLE(2, 3, 0, 1));
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
__m128i a = UnpackXMM<0>(w, x, y, z);
__m128i b = UnpackXMM<1>(w, x, y, z);
__m128i c = UnpackXMM<2>(w, x, y, z);
__m128i d = UnpackXMM<3>(w, x, y, z);
const unsigned int rounds = 44;
for (int i = static_cast<int>(rounds)-1; i >= 0; i -= 4)
{
const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i - 3));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
}
// [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
w = RepackXMM<0>(a, b, c, d);
x = RepackXMM<1>(a, b, c, d);
y = RepackXMM<2>(a, b, c, d);
z = RepackXMM<3>(a, b, c, d);
block0 = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
block1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
block2 = _mm_shuffle_epi32(y, _MM_SHUFFLE(2, 3, 0, 1));
block3 = _mm_shuffle_epi32(z, _MM_SHUFFLE(2, 3, 0, 1));
}
#endif // CRYPTOPP_SSSE3_AVAILABLE
ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_4x1_SSE(SIMECK64_Enc_Block, SIMECK64_Enc_4_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_4x1_SSE(SIMECK64_Dec_Block, SIMECK64_Dec_4_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE
NAMESPACE_END

simon.cpp: 121 changed lines

@@ -196,14 +196,6 @@ ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
extern size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
extern size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -212,14 +204,6 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, siz
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
#if (CRYPTOPP_SSE41_AVAILABLE)
extern size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
#if (CRYPTOPP_SSSE3_AVAILABLE)
extern size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -228,14 +212,6 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
extern size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
extern size_t SIMON128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -246,39 +222,11 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys,
std::string SIMON64::Base::AlgorithmProvider() const
{
#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
# if (CRYPTOPP_SSE41_AVAILABLE)
if (HasSSE41())
return "SSE4.1";
# endif
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return "NEON";
# endif
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return "Altivec";
# endif
#endif
return "C++";
}
unsigned int SIMON64::Base::OptimalDataAlignment() const
{
#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
# if (CRYPTOPP_SSE41_AVAILABLE)
if (HasSSE41())
return 16; // load __m128i
# endif
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return 4; // load uint32x4_t
# endif
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return 16; // load uint32x4_p
# endif
#endif
return GetAlignmentOf<word32>();
}
@@ -311,29 +259,6 @@ void SIMON64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
default:
CRYPTOPP_ASSERT(0);
}
#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
// Pre-splat the round keys for Altivec forward transformation
#if CRYPTOPP_ALTIVEC_AVAILABLE
if (IsForwardTransformation() && HasAltivec())
{
AlignedSecBlock presplat(m_rkeys.size()*4);
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
m_rkeys.swap(presplat);
}
#elif CRYPTOPP_SSE41_AVAILABLE
if (IsForwardTransformation() && HasSSE41())
{
AlignedSecBlock presplat(m_rkeys.size()*4);
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
m_rkeys.swap(presplat);
}
#endif
#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
}
void SIMON64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@@ -478,7 +403,7 @@ void SIMON128::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength
}
#endif
#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
#endif // CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
}
void SIMON128::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@@ -533,50 +458,6 @@ void SIMON128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
}
#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
size_t SIMON64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
#if (CRYPTOPP_SSE41_AVAILABLE)
if (HasSSE41())
return SIMON64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return SIMON64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SIMON64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
#if (CRYPTOPP_SSE41_AVAILABLE)
if (HasSSE41())
return SIMON64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return SIMON64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
#if (CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS)
size_t SIMON128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const

simon.h: 18 changed lines

@@ -17,14 +17,6 @@
#include "seckey.h"
#include "secblock.h"
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
# ifndef CRYPTOPP_DISABLE_SIMON_SIMD
# define CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS 1
# endif
#endif
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
@@ -36,13 +28,9 @@
// Yet another SunStudio/SunCC workaround. Failed self tests
// in SSE code paths on i386 for SunStudio 12.3 and below.
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
# undef CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
# undef CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
#endif
// https://github.com/weidai11/cryptopp/issues/945
#undef CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
NAMESPACE_BEGIN(CryptoPP)
/// \brief SIMON block cipher information
@@ -129,9 +117,6 @@ public:
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};
/// \brief SIMON64 decryption transformation
@@ -142,9 +127,6 @@ public:
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};
typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;


@@ -1,864 +0,0 @@
// simon_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSE4.1, ARM NEON and ARMv8a, and Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
#include "pch.h"
#include "config.h"
#include "simon.h"
#include "misc.h"
// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both simon.cpp and simon64_simd.cpp.
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_SSE41_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
# include <smmintrin.h>
#endif
#if defined(__XOP__)
# include <ammintrin.h>
# if defined(__GNUC__)
# include <x86intrin.h>
# endif
#endif
#if (CRYPTOPP_ARM_NEON_HEADER)
# include "adv_simd.h"
# include <arm_neon.h>
#endif
#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif
#if defined(_M_ARM64)
# include "adv_simd.h"
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif
// Squash MS LNK4221 and libtool warnings
extern const char SIMON64_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::vec_swap; // SunCC
// *************************** ARM NEON ************************** //
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
template <class T>
inline T UnpackHigh32(const T& a, const T& b)
{
const uint32x2_t x(vget_high_u32((uint32x4_t)a));
const uint32x2_t y(vget_high_u32((uint32x4_t)b));
const uint32x2x2_t r = vzip_u32(x, y);
return (T)vcombine_u32(r.val[0], r.val[1]);
}
template <class T>
inline T UnpackLow32(const T& a, const T& b)
{
const uint32x2_t x(vget_low_u32((uint32x4_t)a));
const uint32x2_t y(vget_low_u32((uint32x4_t)b));
const uint32x2x2_t r = vzip_u32(x, y);
return (T)vcombine_u32(r.val[0], r.val[1]);
}
template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
const uint32x4_t a(vshlq_n_u32(val, R));
const uint32x4_t b(vshrq_n_u32(val, 32 - R));
return vorrq_u32(a, b);
}
template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
const uint32x4_t a(vshlq_n_u32(val, 32 - R));
const uint32x4_t b(vshrq_n_u32(val, R));
return vorrq_u32(a, b);
}
#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
const uint8x16_t mask = vld1q_u8(maskb);
return vreinterpretq_u32_u8(
vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
const uint8x16_t mask = vld1q_u8(maskb);
return vreinterpretq_u32_u8(
vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif
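// Editor's sketch (not part of the original file): the vqtbl1q_u8
// specializations above work because rotating a 32-bit lane by 8 bits is a
// pure byte permutation. A scalar model of the rotate-left-by-8 case, assuming
// the little-endian byte order used by the NEON loads:
inline word32 RotL8_ByteShuffle(word32 v)
{
    const byte b0 = byte(v), b1 = byte(v >> 8), b2 = byte(v >> 16), b3 = byte(v >> 24);
    // Same index pattern as maskb {3,0,1,2} applied to each 4-byte group:
    // output bytes are (b3, b0, b1, b2) from low to high address.
    return word32(b3) | (word32(b0) << 8) | (word32(b1) << 16) | (word32(b2) << 24);
    // equals (v << 8) | (v >> 24)
}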
inline uint32x4_t SIMON64_f(const uint32x4_t& val)
{
return veorq_u32(RotateLeft32<2>(val),
vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
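// Editor's sketch (not part of the original file): scalar form of the same
// round function for reference, f(x) = ROL2(x) ^ (ROL1(x) & ROL8(x)),
// matching the vector expression above one 32-bit lane at a time.
inline word32 SIMON64_f_Scalar(word32 x)
{
    const word32 rol1 = (x << 1) | (x >> 31);
    const word32 rol2 = (x << 2) | (x >> 30);
    const word32 rol8 = (x << 8) | (x >> 24);
    return rol2 ^ (rol1 & rol8);
}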
inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
}
if (rounds & 1)
{
const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
std::swap(x1, y1);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = UnpackLow32(y1, x1);
block1 = UnpackHigh32(y1, x1);
}
inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
if (rounds & 1)
{
std::swap(x1, y1);
const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
rounds--;
}
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
{
const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = UnpackLow32(y1, x1);
block1 = UnpackHigh32(y1, x1);
}
inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
{
const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);
const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
}
if (rounds & 1)
{
const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = UnpackLow32(y1, x1);
block1 = UnpackHigh32(y1, x1);
block2 = UnpackLow32(y2, x2);
block3 = UnpackHigh32(y2, x2);
block4 = UnpackLow32(y3, x3);
block5 = UnpackHigh32(y3, x3);
}
inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
if (rounds & 1)
{
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
rounds--;
}
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
{
const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);
const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = UnpackLow32(y1, x1);
block1 = UnpackHigh32(y1, x1);
block2 = UnpackLow32(y2, x2);
block3 = UnpackHigh32(y2, x2);
block4 = UnpackLow32(y3, x3);
block5 = UnpackHigh32(y3, x3);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
// ***************************** IA-32 ***************************** //
#if (CRYPTOPP_SSE41_AVAILABLE)
// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif
inline void Swap128(__m128i& a,__m128i& b)
{
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
// __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
// SunCC 12.1 - 12.3 fail to consume the swap, while SunCC 12.4 consumes it without -std=c++11.
vec_swap(a, b);
#else
std::swap(a, b);
#endif
}
template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, R);
#else
return _mm_or_si128(
_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}
template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, 32-R);
#else
return _mm_or_si128(
_mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, 8);
#else
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
return _mm_shuffle_epi8(val, mask);
#endif
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, 32-8);
#else
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
return _mm_shuffle_epi8(val, mask);
#endif
}
inline __m128i SIMON64_f(const __m128i& v)
{
return _mm_xor_si128(RotateLeft32<2>(v),
_mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
}
inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(block1);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
// Round keys are pre-splated in forward direction
const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*4));
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
}
if (rounds & 1)
{
// Round keys are pre-splated in forward direction
const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*4));
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
Swap128(x1, y1);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(y1, x1);
block1 = _mm_unpackhi_epi32(y1, x1);
}
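// Editor's sketch (not part of the original file): the shuffle/unpack pair in
// the SSE functions is an even/odd de-interleave of 32-bit lanes followed by
// a re-interleave. A scalar model of the same index mapping, with lane 0 the
// lowest lane:
inline void Deinterleave4(const word32 a[4], const word32 b[4], word32 y[4], word32 x[4])
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] => y = [A1 A3 B1 B3], x = [A2 A4 B2 B4]
    y[0] = a[0]; y[1] = a[2]; y[2] = b[0]; y[3] = b[2];
    x[0] = a[1]; x[1] = a[3]; x[2] = b[1]; x[3] = b[3];
}
inline void Interleave4(const word32 y[4], const word32 x[4], word32 a[4], word32 b[4])
{
    // the _mm_unpacklo_epi32 / _mm_unpackhi_epi32 step: back to the original order
    a[0] = y[0]; a[1] = x[0]; a[2] = y[1]; a[3] = x[1];
    b[0] = y[2]; b[1] = x[2]; b[2] = y[3]; b[3] = x[3];
}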
inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(block1);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
if (rounds & 1)
{
Swap128(x1, y1);
const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
rounds--;
}
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
{
const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(y1, x1);
block1 = _mm_unpackhi_epi32(y1, x1);
}
inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(block1);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
const __m128 t2 = _mm_castsi128_ps(block2);
const __m128 t3 = _mm_castsi128_ps(block3);
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
const __m128 t4 = _mm_castsi128_ps(block4);
const __m128 t5 = _mm_castsi128_ps(block5);
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
// Round keys are pre-splated in forward direction
const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);
const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*4));
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
}
if (rounds & 1)
{
// Round keys are pre-splated in forward direction
const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*4));
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(y1, x1);
block1 = _mm_unpackhi_epi32(y1, x1);
block2 = _mm_unpacklo_epi32(y2, x2);
block3 = _mm_unpackhi_epi32(y2, x2);
block4 = _mm_unpacklo_epi32(y3, x3);
block5 = _mm_unpackhi_epi32(y3, x3);
}
inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(block1);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
const __m128 t2 = _mm_castsi128_ps(block2);
const __m128 t3 = _mm_castsi128_ps(block3);
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
const __m128 t4 = _mm_castsi128_ps(block4);
const __m128 t5 = _mm_castsi128_ps(block5);
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
if (rounds & 1)
{
Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
rounds--;
}
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
{
const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);
const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(y1, x1);
block1 = _mm_unpackhi_epi32(y1, x1);
block2 = _mm_unpacklo_epi32(y2, x2);
block3 = _mm_unpackhi_epi32(y2, x2);
block4 = _mm_unpacklo_epi32(y3, x3);
block5 = _mm_unpackhi_epi32(y3, x3);
}
#endif // CRYPTOPP_SSE41_AVAILABLE
// ***************************** Altivec ***************************** //
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::VecAnd;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecLoadAligned;
using CryptoPP::VecPermute;
// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
const uint32x4_p m = {C, C, C, C};
return vec_rl(val, m);
}
// Rotate right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
return vec_rl(val, m);
}
inline uint32x4_p SIMON64_f(const uint32x4_p val)
{
return VecXor(RotateLeft32<2>(val),
VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = VecPermute(block0, block1, m1);
uint32x4_p y1 = VecPermute(block0, block1, m2);
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
// Round keys are pre-splated in forward direction
const uint32x4_p rk1 = VecLoadAligned(subkeys+i*4);
const uint32x4_p rk2 = VecLoadAligned(subkeys+(i+1)*4);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
}
if (rounds & 1)
{
// Round keys are pre-splated in forward direction
const uint32x4_p rk = VecLoadAligned(subkeys+(rounds-1)*4);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
std::swap(x1, y1);
}
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}
inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = VecPermute(block0, block1, m1);
uint32x4_p y1 = VecPermute(block0, block1, m2);
if (rounds & 1)
{
std::swap(x1, y1);
#if defined(_ARCH_PWR7)
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VecLoad(subkeys+rounds-1);
rk = VecPermute(rk, rk, m);
#endif
y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
rounds--;
}
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
{
#if defined(_ARCH_PWR7)
const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk1 = VecLoad(subkeys+i+1);
uint32x4_p rk2 = VecLoad(subkeys+i);
rk1 = VecPermute(rk1, rk1, m);
rk2 = VecPermute(rk2, rk2, m);
#endif
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
}
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}
inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
// Round keys are pre-splated in forward direction
const uint32x4_p rk1 = VecLoadAligned(subkeys+i*4);
const uint32x4_p rk2 = VecLoadAligned(subkeys+(i+1)*4);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
}
if (rounds & 1)
{
// Round keys are pre-splated in forward direction
const uint32x4_p rk = VecLoadAligned(subkeys+(rounds-1)*4);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
}
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
if (rounds & 1)
{
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
#if defined(_ARCH_PWR7)
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VecLoad(subkeys+rounds-1);
rk = VecPermute(rk, rk, m);
#endif
y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
rounds--;
}
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
{
#if defined(_ARCH_PWR7)
const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk1 = VecLoad(subkeys+i+1);
uint32x4_p rk2 = VecLoad(subkeys+i);
rk1 = VecPermute(rk1, rk1, m);
rk2 = VecPermute(rk2, rk2, m);
#endif
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
}
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
ANONYMOUS_NAMESPACE_END
///////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(CryptoPP)
// *************************** ARM NEON **************************** //
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
// ***************************** IA-32 ***************************** //
#if (CRYPTOPP_SSE41_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif
// ***************************** Altivec ***************************** //
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif
NAMESPACE_END

109
speck.cpp
View File

@ -171,12 +171,6 @@ ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
extern size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@ -200,14 +194,6 @@ extern size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
extern size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
extern size_t SPECK128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@ -218,39 +204,11 @@ extern size_t SPECK128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys,
std::string SPECK64::Base::AlgorithmProvider() const
{
#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
# if (CRYPTOPP_SSE41_AVAILABLE)
if (HasSSE41())
return "SSE4.1";
# endif
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return "NEON";
# endif
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return "Altivec";
# endif
#endif
return "C++";
}
unsigned int SPECK64::Base::OptimalDataAlignment() const
{
#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
# if (CRYPTOPP_SSE41_AVAILABLE)
if (HasSSE41())
return 16; // load __m128i
# endif
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return 4; // load uint32x4_t
# endif
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return 16; // load uint32x4_p
# endif
#endif
return GetAlignmentOf<word32>();
}
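// Editor's sketch (not part of the original diff): with the 64-bit SIMD paths
// removed, SPECK64 always reports the generic provider and word alignment. A
// caller can confirm the active implementation at run time, e.g.
//
//   CryptoPP::SPECK64::Encryption enc;
//   std::cout << enc.AlgorithmProvider() << std::endl;   // prints "C++" after this change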
@ -283,29 +241,6 @@ void SPECK64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
default:
CRYPTOPP_ASSERT(0);
}
#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
// Pre-splat the round keys for Altivec forward transformation
#if CRYPTOPP_ALTIVEC_AVAILABLE
if (IsForwardTransformation() && HasAltivec())
{
AlignedSecBlock presplat(m_rkeys.size()*4);
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
m_rkeys.swap(presplat);
}
#elif CRYPTOPP_SSE41_AVAILABLE
if (IsForwardTransformation() && HasSSE41())
{
AlignedSecBlock presplat(m_rkeys.size()*4);
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
m_rkeys.swap(presplat);
}
#endif
#endif // CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
}
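// Editor's sketch (not part of the original diff): the pre-splat removed above
// turned a key schedule {k0, k1, ...} into {k0,k0,k0,k0, k1,k1,k1,k1, ...} so
// the forward SIMD kernels could fetch a broadcast round key with one aligned
// 16-byte load from subkeys+i*4 rather than splatting a scalar every round.
// A scalar model of that transformation (hypothetical helper):
//
//   void Presplat(const word32* rk, size_t n, word32* out /* 4*n elements */)
//   {
//       for (size_t i = 0, j = 0; i < n; ++i, j += 4)
//           out[j+0] = out[j+1] = out[j+2] = out[j+3] = rk[i];
//   }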
void SPECK64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@ -505,50 +440,6 @@ void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
}
#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
size_t SPECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
#if (CRYPTOPP_SSE41_AVAILABLE)
if (HasSSE41())
return SPECK64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return SPECK64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SPECK64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
#if (CRYPTOPP_SSE41_AVAILABLE)
if (HasSSE41())
return SPECK64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return SPECK64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
#if (CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS)
size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const

18
speck.h
View File

@ -17,14 +17,6 @@
#include "seckey.h"
#include "secblock.h"
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
# ifndef CRYPTOPP_DISABLE_SPECK_SIMD
# define CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS 1
# endif
#endif
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
@ -36,13 +28,9 @@
// Yet another SunStudio/SunCC workaround. Failed self tests
// in SSE code paths on i386 for SunStudio 12.3 and below.
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
# undef CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
# undef CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS
#endif
// https://github.com/weidai11/cryptopp/issues/945
#undef CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
NAMESPACE_BEGIN(CryptoPP)
/// \brief SPECK block cipher information
@ -129,9 +117,6 @@ public:
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};
/// \brief SPECK64 decryption transformation
@ -142,9 +127,6 @@ public:
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};
typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;

View File

@ -1,781 +0,0 @@
// speck64_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSE4.1, ARM NEON and ARMv8a, and Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
#include "pch.h"
#include "config.h"
#include "speck.h"
#include "misc.h"
// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both speck.cpp and speck64_simd.cpp.
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_SSE41_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
# include <smmintrin.h>
#endif
#if defined(__XOP__)
# include <ammintrin.h>
# if defined(__GNUC__)
# include <x86intrin.h>
# endif
#endif
#if (CRYPTOPP_ARM_NEON_HEADER)
# include "adv_simd.h"
# include <arm_neon.h>
#endif
#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif
#if defined(_M_ARM64)
# include "adv_simd.h"
#endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif
// Squash MS LNK4221 and libtool warnings
extern const char SPECK64_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;
// *************************** ARM NEON ************************** //
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
template <class T>
inline T UnpackHigh32(const T& a, const T& b)
{
const uint32x2_t x(vget_high_u32((uint32x4_t)a));
const uint32x2_t y(vget_high_u32((uint32x4_t)b));
const uint32x2x2_t r = vzip_u32(x, y);
return (T)vcombine_u32(r.val[0], r.val[1]);
}
template <class T>
inline T UnpackLow32(const T& a, const T& b)
{
const uint32x2_t x(vget_low_u32((uint32x4_t)a));
const uint32x2_t y(vget_low_u32((uint32x4_t)b));
const uint32x2x2_t r = vzip_u32(x, y);
return (T)vcombine_u32(r.val[0], r.val[1]);
}
template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
const uint32x4_t a(vshlq_n_u32(val, R));
const uint32x4_t b(vshrq_n_u32(val, 32 - R));
return vorrq_u32(a, b);
}
template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
const uint32x4_t a(vshlq_n_u32(val, 32 - R));
const uint32x4_t b(vshrq_n_u32(val, R));
return vorrq_u32(a, b);
}
#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
const uint8x16_t mask = vld1q_u8(maskb);
return vreinterpretq_u32_u8(
vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
const uint8x16_t mask = vld1q_u8(maskb);
return vreinterpretq_u32_u8(
vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif // Aarch32 or Aarch64
inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
{
const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
x1 = RotateRight32<8>(x1);
x1 = vaddq_u32(x1, y1);
x1 = veorq_u32(x1, rk);
y1 = RotateLeft32<3>(y1);
y1 = veorq_u32(y1, x1);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = UnpackLow32(y1, x1);
block1 = UnpackHigh32(y1, x1);
}
inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
{
const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
y1 = veorq_u32(y1, x1);
y1 = RotateRight32<3>(y1);
x1 = veorq_u32(x1, rk);
x1 = vsubq_u32(x1, y1);
x1 = RotateLeft32<8>(x1);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = UnpackLow32(y1, x1);
block1 = UnpackHigh32(y1, x1);
}
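// Editor's sketch (not part of the original file): scalar form of one SPECK64
// round and its inverse, matching the vector sequences above lane by lane.
inline void SPECK64_EncRound_Scalar(word32& x, word32& y, word32 k)
{
    x = (x >> 8) | (x << 24);   // x = ROR8(x)
    x += y;                     // x += y (mod 2^32)
    x ^= k;                     // x ^= round key
    y = (y << 3) | (y >> 29);   // y = ROL3(y)
    y ^= x;                     // y ^= x
}
inline void SPECK64_DecRound_Scalar(word32& x, word32& y, word32 k)
{
    y ^= x;
    y = (y >> 3) | (y << 29);   // undo ROL3
    x ^= k;
    x -= y;
    x = (x << 8) | (x >> 24);   // undo ROR8
}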
inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
{
const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
x1 = RotateRight32<8>(x1);
x2 = RotateRight32<8>(x2);
x3 = RotateRight32<8>(x3);
x1 = vaddq_u32(x1, y1);
x2 = vaddq_u32(x2, y2);
x3 = vaddq_u32(x3, y3);
x1 = veorq_u32(x1, rk);
x2 = veorq_u32(x2, rk);
x3 = veorq_u32(x3, rk);
y1 = RotateLeft32<3>(y1);
y2 = RotateLeft32<3>(y2);
y3 = RotateLeft32<3>(y3);
y1 = veorq_u32(y1, x1);
y2 = veorq_u32(y2, x2);
y3 = veorq_u32(y3, x3);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = UnpackLow32(y1, x1);
block1 = UnpackHigh32(y1, x1);
block2 = UnpackLow32(y2, x2);
block3 = UnpackHigh32(y2, x2);
block4 = UnpackLow32(y3, x3);
block5 = UnpackHigh32(y3, x3);
}
inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
{
const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
y1 = veorq_u32(y1, x1);
y2 = veorq_u32(y2, x2);
y3 = veorq_u32(y3, x3);
y1 = RotateRight32<3>(y1);
y2 = RotateRight32<3>(y2);
y3 = RotateRight32<3>(y3);
x1 = veorq_u32(x1, rk);
x2 = veorq_u32(x2, rk);
x3 = veorq_u32(x3, rk);
x1 = vsubq_u32(x1, y1);
x2 = vsubq_u32(x2, y2);
x3 = vsubq_u32(x3, y3);
x1 = RotateLeft32<8>(x1);
x2 = RotateLeft32<8>(x2);
x3 = RotateLeft32<8>(x3);
}
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = UnpackLow32(y1, x1);
block1 = UnpackHigh32(y1, x1);
block2 = UnpackLow32(y2, x2);
block3 = UnpackHigh32(y2, x2);
block4 = UnpackLow32(y3, x3);
block5 = UnpackHigh32(y3, x3);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
// ***************************** IA-32 ***************************** //
#if (CRYPTOPP_SSE41_AVAILABLE)
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif
template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, R);
#else
return _mm_or_si128(
_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}
template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, 32-R);
#else
return _mm_or_si128(
_mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, 8);
#else
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
return _mm_shuffle_epi8(val, mask);
#endif
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
return _mm_roti_epi32(val, 32-8);
#else
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
return _mm_shuffle_epi8(val, mask);
#endif
}
inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(block1);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
{
// Round keys are pre-splated in forward direction
const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
x1 = RotateRight32<8>(x1);
x1 = _mm_add_epi32(x1, y1);
x1 = _mm_xor_si128(x1, rk);
y1 = RotateLeft32<3>(y1);
y1 = _mm_xor_si128(y1, x1);
}
// This is roughly the SSE equivalent of ARM vzip.32
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(y1, x1);
block1 = _mm_unpackhi_epi32(y1, x1);
}
inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(block1);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
{
const __m128i rk = _mm_set1_epi32(subkeys[i]);
y1 = _mm_xor_si128(y1, x1);
y1 = RotateRight32<3>(y1);
x1 = _mm_xor_si128(x1, rk);
x1 = _mm_sub_epi32(x1, y1);
x1 = RotateLeft32<8>(x1);
}
// This is roughly the SSE equivalent of ARM vzip.32
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(y1, x1);
block1 = _mm_unpackhi_epi32(y1, x1);
}
inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(block1);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
const __m128 t2 = _mm_castsi128_ps(block2);
const __m128 t3 = _mm_castsi128_ps(block3);
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
const __m128 t4 = _mm_castsi128_ps(block4);
const __m128 t5 = _mm_castsi128_ps(block5);
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
{
// Round keys are pre-splated in forward direction
const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
x1 = RotateRight32<8>(x1);
x2 = RotateRight32<8>(x2);
x3 = RotateRight32<8>(x3);
x1 = _mm_add_epi32(x1, y1);
x2 = _mm_add_epi32(x2, y2);
x3 = _mm_add_epi32(x3, y3);
x1 = _mm_xor_si128(x1, rk);
x2 = _mm_xor_si128(x2, rk);
x3 = _mm_xor_si128(x3, rk);
y1 = RotateLeft32<3>(y1);
y2 = RotateLeft32<3>(y2);
y3 = RotateLeft32<3>(y3);
y1 = _mm_xor_si128(y1, x1);
y2 = _mm_xor_si128(y2, x2);
y3 = _mm_xor_si128(y3, x3);
}
// This is roughly the SSE equivalent of ARM vzip.32
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(y1, x1);
block1 = _mm_unpackhi_epi32(y1, x1);
block2 = _mm_unpacklo_epi32(y2, x2);
block3 = _mm_unpackhi_epi32(y2, x2);
block4 = _mm_unpacklo_epi32(y3, x3);
block5 = _mm_unpackhi_epi32(y3, x3);
}
inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word32 *subkeys, unsigned int rounds)
{
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(block1);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
const __m128 t2 = _mm_castsi128_ps(block2);
const __m128 t3 = _mm_castsi128_ps(block3);
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
const __m128 t4 = _mm_castsi128_ps(block4);
const __m128 t5 = _mm_castsi128_ps(block5);
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
{
const __m128i rk = _mm_set1_epi32(subkeys[i]);
y1 = _mm_xor_si128(y1, x1);
y2 = _mm_xor_si128(y2, x2);
y3 = _mm_xor_si128(y3, x3);
y1 = RotateRight32<3>(y1);
y2 = RotateRight32<3>(y2);
y3 = RotateRight32<3>(y3);
x1 = _mm_xor_si128(x1, rk);
x2 = _mm_xor_si128(x2, rk);
x3 = _mm_xor_si128(x3, rk);
x1 = _mm_sub_epi32(x1, y1);
x2 = _mm_sub_epi32(x2, y2);
x3 = _mm_sub_epi32(x3, y3);
x1 = RotateLeft32<8>(x1);
x2 = RotateLeft32<8>(x2);
x3 = RotateLeft32<8>(x3);
}
// This is roughly the SSE equivalent of ARM vzip.32
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(y1, x1);
block1 = _mm_unpackhi_epi32(y1, x1);
block2 = _mm_unpacklo_epi32(y2, x2);
block3 = _mm_unpackhi_epi32(y2, x2);
block4 = _mm_unpacklo_epi32(y3, x3);
block5 = _mm_unpackhi_epi32(y3, x3);
}
#endif // CRYPTOPP_SSE41_AVAILABLE
// ***************************** Altivec ***************************** //
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecLoadAligned;
using CryptoPP::VecPermute;
// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
const uint32x4_p m = {C, C, C, C};
return vec_rl(val, m);
}
// Rotate right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
return vec_rl(val, m);
}
void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = VecPermute(block0, block1, m1);
uint32x4_p y1 = VecPermute(block0, block1, m2);
for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
{
// Round keys are pre-splated in forward direction
const uint32x4_p rk = VecLoadAligned(subkeys+i*4);
x1 = RotateRight32<8>(x1);
x1 = VecAdd(x1, y1);
x1 = VecXor(x1, rk);
y1 = RotateLeft32<3>(y1);
y1 = VecXor(y1, x1);
}
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}
void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = VecPermute(block0, block1, m1);
uint32x4_p y1 = VecPermute(block0, block1, m2);
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
{
#if defined(_ARCH_PWR7)
const uint32x4_p rk = vec_splats(subkeys[i]);
#else
// subkeys has extra trailing elements so the 16-byte load of the last subkey stays within allocated memory
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VecLoad(subkeys+i);
rk = VecPermute(rk, rk, m);
#endif
y1 = VecXor(y1, x1);
y1 = RotateRight32<3>(y1);
x1 = VecXor(x1, rk);
x1 = VecSub(x1, y1);
x1 = RotateLeft32<8>(x1);
}
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}
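// Editor's sketch (not part of the original file): on targets without
// vec_splats (pre-POWER7), the byte mask {0,1,2,3, 0,1,2,3, ...} used above
// copies the first 32-bit element of the loaded vector into every lane, i.e.
// a manual broadcast. In scalar terms the fallback produces:
inline void Broadcast32_Scalar(word32 out[4], word32 k)
{
    // what vec_splats / the VecPermute fallback yield, one lane per element
    out[0] = out[1] = out[2] = out[3] = k;
}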
void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
{
// Round keys are pre-splated in forward direction
const uint32x4_p rk = VecLoadAligned(subkeys+i*4);
x1 = RotateRight32<8>(x1);
x2 = RotateRight32<8>(x2);
x3 = RotateRight32<8>(x3);
x1 = VecAdd(x1, y1);
x2 = VecAdd(x2, y2);
x3 = VecAdd(x3, y3);
x1 = VecXor(x1, rk);
x2 = VecXor(x2, rk);
x3 = VecXor(x3, rk);
y1 = RotateLeft32<3>(y1);
y2 = RotateLeft32<3>(y2);
y3 = RotateLeft32<3>(y3);
y1 = VecXor(y1, x1);
y2 = VecXor(y2, x2);
y3 = VecXor(y3, x3);
}
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
{
#if defined(_ARCH_PWR7)
const uint32x4_p rk = vec_splats(subkeys[i]);
#else
// subkeys has extra trailing elements so the 16-byte load of the last subkey stays within allocated memory
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VecLoad(subkeys+i);
rk = VecPermute(rk, rk, m);
#endif
y1 = VecXor(y1, x1);
y2 = VecXor(y2, x2);
y3 = VecXor(y3, x3);
y1 = RotateRight32<3>(y1);
y2 = RotateRight32<3>(y2);
y3 = RotateRight32<3>(y3);
x1 = VecXor(x1, rk);
x2 = VecXor(x2, rk);
x3 = VecXor(x3, rk);
x1 = VecSub(x1, y1);
x2 = VecSub(x2, y2);
x3 = VecSub(x3, y3);
x1 = RotateLeft32<8>(x1);
x2 = RotateLeft32<8>(x2);
x3 = RotateLeft32<8>(x3);
}
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
ANONYMOUS_NAMESPACE_END
///////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(CryptoPP)
// *************************** ARM NEON **************************** //
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif
// ***************************** IA-32 ***************************** //
#if (CRYPTOPP_SSE41_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif
// ***************************** Altivec ***************************** //
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif
NAMESPACE_END