mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-23 09:59:42 +00:00
Remove CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS
GCC 8 was producing bad decryption results for CBC mode on x86. NEON and Aarch64 was fine. We lose 0.6 cpb so LEA runs around 3.5 cpb instead of 2.9 cpb. It would be nice to pinpoint the GCC issue but it is kind of difficult at the moment.
This commit is contained in:
parent
3af3b8019b
commit
404e6cfae3
30
lea-simd.cpp
30
lea-simd.cpp
@ -140,11 +140,7 @@ uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
|
|||||||
template <unsigned int IDX>
|
template <unsigned int IDX>
|
||||||
inline uint32x4_t LoadKey(const word32 rkey[])
|
inline uint32x4_t LoadKey(const word32 rkey[])
|
||||||
{
|
{
|
||||||
#if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
|
|
||||||
return vld1q_u32(&rkey[IDX*4]);
|
|
||||||
#else
|
|
||||||
return vdupq_n_u32(rkey[IDX]);
|
return vdupq_n_u32(rkey[IDX]);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <unsigned int IDX>
|
template <unsigned int IDX>
|
||||||
@ -599,7 +595,8 @@ inline __m128i RotateRight<8>(const __m128i& val)
|
|||||||
template <unsigned int IDX>
|
template <unsigned int IDX>
|
||||||
inline __m128i LoadKey(const word32 rkey[])
|
inline __m128i LoadKey(const word32 rkey[])
|
||||||
{
|
{
|
||||||
return _mm_loadu_si128((const __m128i*) &rkey[IDX*4]);
|
float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk));
|
||||||
|
return _mm_castps_si128(_mm_load_ps1(&rk));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <unsigned int IDX>
|
template <unsigned int IDX>
|
||||||
@ -989,17 +986,6 @@ ANONYMOUS_NAMESPACE_END
|
|||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
void LEA_SplatKeys_SSSE3(SecBlock<word32>& rkeys)
|
|
||||||
{
|
|
||||||
SecBlock<word32> temp(rkeys.size() * 4);
|
|
||||||
for (size_t i=0, j=0; i<rkeys.size(); i++, j+=4)
|
|
||||||
{
|
|
||||||
_mm_storeu_si128((__m128i*) &temp[j],
|
|
||||||
_mm_castps_si128(_mm_load_ps1((const float*) &rkeys[i])));
|
|
||||||
}
|
|
||||||
std::swap(rkeys, temp);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||||
{
|
{
|
||||||
@ -1016,18 +1002,6 @@ size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
|||||||
#endif // CRYPTOPP_SSSE3_AVAILABLE
|
#endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||||
|
|
||||||
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
# if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
|
|
||||||
void LEA_SplatKeys_NEON(SecBlock<word32>& rkeys)
|
|
||||||
{
|
|
||||||
SecBlock<word32> temp(rkeys.size() * 4);
|
|
||||||
for (size_t i=0, j=0; i<rkeys.size(); i++, j+=4)
|
|
||||||
{
|
|
||||||
vst1q_u32(&temp[j], vdupq_n_u32(rkeys[i]));
|
|
||||||
}
|
|
||||||
std::swap(rkeys, temp);
|
|
||||||
}
|
|
||||||
# endif // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS
|
|
||||||
|
|
||||||
size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||||
{
|
{
|
||||||
|
20
lea.cpp
20
lea.cpp
@ -557,8 +557,6 @@ NAMESPACE_BEGIN(CryptoPP)
|
|||||||
|
|
||||||
#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
|
#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
|
||||||
# if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
# if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
extern void LEA_SplatKeys_SSSE3(SecBlock<word32>& rkeys);
|
|
||||||
|
|
||||||
extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||||
|
|
||||||
@ -567,10 +565,6 @@ extern size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t
|
|||||||
# endif
|
# endif
|
||||||
|
|
||||||
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
# if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
|
|
||||||
extern void LEA_SplatKeys_NEON(SecBlock<word32>& rkeys);
|
|
||||||
# endif // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS
|
|
||||||
|
|
||||||
extern size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
extern size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||||
|
|
||||||
@ -609,20 +603,6 @@ void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, con
|
|||||||
default:
|
default:
|
||||||
CRYPTOPP_ASSERT(0);;
|
CRYPTOPP_ASSERT(0);;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we pre-splat the round keys at setup then we avoid a shuffle
|
|
||||||
// at runtime for each subkey used during encryption and decryption.
|
|
||||||
// Pre-splatting saves about 0.7 to 1.0 cpb at the cost of 4x storage.
|
|
||||||
#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_SSSE3_AVAILABLE)
|
|
||||||
if (HasSSSE3())
|
|
||||||
LEA_SplatKeys_SSSE3(m_rkey);
|
|
||||||
#endif
|
|
||||||
#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_ARM_NEON_AVAILABLE)
|
|
||||||
# if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
|
|
||||||
if (HasNEON())
|
|
||||||
LEA_SplatKeys_NEON(m_rkey);
|
|
||||||
# endif // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void LEA::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
void LEA::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||||
|
10
lea.h
10
lea.h
@ -19,19 +19,11 @@
|
|||||||
# define CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS 1
|
# define CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Define this if you want to pre-splat the round key table
|
|
||||||
// for NEON and Aarch64. Pre-splatting the round key increases
|
|
||||||
// performance by about 0.7 cpb on ARM server boards like an
|
|
||||||
// AMD Opteron A1100. However, it crushes performance on ARM
|
|
||||||
// dev-boards like LeMaker HiKey and Pine64. HiKey and Pine64
|
|
||||||
// run about 8 cpb slower when pre-splatting the round keys.
|
|
||||||
// # define CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS 1
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
/// \brief LEA block cipher information
|
/// \brief LEA block cipher information
|
||||||
/// \since Crypto++ 7.1
|
/// \since Crypto++ 7.1
|
||||||
struct LEA_Info : public FixedBlockSize<16>, VariableKeyLength<16,16,32,8>
|
struct LEA_Info : public FixedBlockSize<16>, public VariableKeyLength<16,16,32,8>
|
||||||
{
|
{
|
||||||
static const std::string StaticAlgorithmName()
|
static const std::string StaticAlgorithmName()
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user