Remove CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS

GCC 8 was producing bad decryption results for CBC mode on x86. NEON and Aarch64 was fine. We lose 0.6 cpb so LEA runs around 3.5 cpb instead of 2.9 cpb. It would be nice to pinpoint the GCC issue but it is kind of difficult at the moment.
This commit is contained in:
Jeffrey Walton 2018-06-23 20:55:17 -04:00
parent 3af3b8019b
commit 404e6cfae3
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 3 additions and 57 deletions

View File

@ -140,11 +140,7 @@ uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
template <unsigned int IDX>
inline uint32x4_t LoadKey(const word32 rkey[])
{
#if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
return vld1q_u32(&rkey[IDX*4]);
#else
return vdupq_n_u32(rkey[IDX]);
#endif
}
template <unsigned int IDX>
@ -599,7 +595,8 @@ inline __m128i RotateRight<8>(const __m128i& val)
template <unsigned int IDX>
inline __m128i LoadKey(const word32 rkey[])
{
return _mm_loadu_si128((const __m128i*) &rkey[IDX*4]);
float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk));
return _mm_castps_si128(_mm_load_ps1(&rk));
}
template <unsigned int IDX>
@ -989,17 +986,6 @@ ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
void LEA_SplatKeys_SSSE3(SecBlock<word32>& rkeys)
{
SecBlock<word32> temp(rkeys.size() * 4);
for (size_t i=0, j=0; i<rkeys.size(); i++, j+=4)
{
_mm_storeu_si128((__m128i*) &temp[j],
_mm_castps_si128(_mm_load_ps1((const float*) &rkeys[i])));
}
std::swap(rkeys, temp);
}
size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
@ -1016,18 +1002,6 @@ size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
#endif // CRYPTOPP_SSSE3_AVAILABLE
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
# if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
void LEA_SplatKeys_NEON(SecBlock<word32>& rkeys)
{
SecBlock<word32> temp(rkeys.size() * 4);
for (size_t i=0, j=0; i<rkeys.size(); i++, j+=4)
{
vst1q_u32(&temp[j], vdupq_n_u32(rkeys[i]));
}
std::swap(rkeys, temp);
}
# endif // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS
size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{

20
lea.cpp
View File

@ -557,8 +557,6 @@ NAMESPACE_BEGIN(CryptoPP)
#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
# if defined(CRYPTOPP_SSSE3_AVAILABLE)
extern void LEA_SplatKeys_SSSE3(SecBlock<word32>& rkeys);
extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@ -567,10 +565,6 @@ extern size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t
# endif
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
# if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
extern void LEA_SplatKeys_NEON(SecBlock<word32>& rkeys);
# endif // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS
extern size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@ -609,20 +603,6 @@ void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, con
default:
CRYPTOPP_ASSERT(0);;
}
// If we pre-splat the round keys at setup then we avoid a shuffle
// at runtime for each subkey used during encryption and decryption.
// Pre-splatting saves about 0.7 to 1.0 cpb at the cost of 4x storage.
#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3())
LEA_SplatKeys_SSSE3(m_rkey);
#endif
#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_ARM_NEON_AVAILABLE)
# if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
if (HasNEON())
LEA_SplatKeys_NEON(m_rkey);
# endif // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS
#endif
}
void LEA::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const

10
lea.h
View File

@ -19,19 +19,11 @@
# define CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS 1
#endif
// Define this if you want to pre-splat the round key table
// for NEON and Aarch64. Pre-splatting the round key increases
// performance by about 0.7 cpb on ARM server boards like an
// AMD Opteron A1100. However, it crushes performance on ARM
// dev-boards like LeMaker HiKey and Pine64. HiKey and Pine64
// run about 8 cpb slower when pre-splatting the round keys.
// # define CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS 1
NAMESPACE_BEGIN(CryptoPP)
/// \brief LEA block cipher information
/// \since Crypto++ 7.1
struct LEA_Info : public FixedBlockSize<16>, VariableKeyLength<16,16,32,8>
struct LEA_Info : public FixedBlockSize<16>, public VariableKeyLength<16,16,32,8>
{
static const std::string StaticAlgorithmName()
{