Add Power8 AES decryption

This commit is contained in:
Jeffrey Walton 2017-09-12 05:53:17 -04:00
parent cfb63decec
commit b090e5f69f
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 70 additions and 49 deletions

View File

@ -154,10 +154,11 @@ const byte Rijndael::Base::Sd[256] = {
0x55, 0x21, 0x0c, 0x7d,
};
/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
const word32 Rijndael::Base::rcon[] = {
0x01000000, 0x02000000, 0x04000000, 0x08000000,
0x10000000, 0x20000000, 0x40000000, 0x80000000,
0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
0x1B000000, 0x36000000
};
NAMESPACE_END

View File

@ -45,8 +45,9 @@
// Don't include <arm_acle.h> when using Apple Clang. Early Apple compilers
// fail to compile with <arm_acle.h> included. Later Apple compilers compile
// intrinsics without <arm_acle.h> included.
#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
// intrinsics without <arm_acle.h> included. Also avoid it with GCC 4.8.
#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION) && \
(!defined(CRYPTOPP_GCC_VERSION) || (CRYPTOPP_GCC_VERSION >= 40900))
# include <arm_acle.h>
#endif
@ -158,6 +159,24 @@ bool CPU_ProbeAES()
}
#endif // ARM32 or ARM64
ANONYMOUS_NAMESPACE_BEGIN
CRYPTOPP_ALIGN_DATA(16)
const word32 s_one[] = {0, 0, 0, 1<<24};
/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
CRYPTOPP_ALIGN_DATA(16)
const word32 s_rconLE[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
};
CRYPTOPP_ALIGN_DATA(16)
const word32 s_rconBE[] = {
0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000,
0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000
};
ANONYMOUS_NAMESPACE_END
// ***************************** ARMv8 ***************************** //
#if (CRYPTOPP_ARM_AES_AVAILABLE)
@ -323,15 +342,6 @@ inline void ARMV8_Dec_4_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_
block3 = veorq_u8(block3, vld1q_u8(keys+(i+1)*16));
}
const word32 s_one[] = {0, 0, 0, 1<<24};
/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
const word32 rcon[] = {
0x01, 0x02, 0x04, 0x08,
0x10, 0x20, 0x40, 0x80,
0x1B, 0x36
};
template <typename F1, typename F4>
size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
@ -537,9 +547,6 @@ inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2
block3 = _mm_aesdeclast_si128(block3, rk);
}
CRYPTOPP_ALIGN_DATA(16)
static const word32 s_one[] = {0, 0, 0, 1<<24};
template <typename F1, typename F4>
inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
@ -680,16 +687,9 @@ size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subKeys, size_t ro
sk, rounds, ib, xb, outBlocks, length, flags);
}
void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk)
void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk, unsigned int rounds)
{
const unsigned rounds = static_cast<unsigned int>(keyLen/4 + 6);
static const word32 rcLE[] = {
0x01, 0x02, 0x04, 0x08,
0x10, 0x20, 0x40, 0x80,
0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
const word32 *ro = rcLE, *rc = rcLE;
const word32 *ro = s_rconLE, *rc = s_rconLE;
CRYPTOPP_UNUSED(ro);
__m128i temp = _mm_loadu_si128(M128_CAST(userKey+keyLen-16));
@ -700,7 +700,7 @@ void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, wor
const word32* end = rk + keySize;
while (true)
{
CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE));
CRYPTOPP_ASSERT(rc < ro + COUNTOF(s_rconLE));
rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
@ -1011,17 +1011,21 @@ void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType s = VectorLoad(inBlock);
VectorType k = VectorLoadAligned(keys);
VectorType k = VectorLoadAligned(rounds*16, keys);
s = VectorXor(s, k);
for (size_t i=1; i<rounds-1; i+=2)
for (size_t i=rounds-1; i>1; i-=2)
{
s = VectorDecrypt(s, VectorLoadAligned( i*16, keys));
s = VectorDecrypt(s, VectorLoadAligned((i+1)*16, keys));
s = VectorDecrypt(s, VectorLoadAligned((i-1)*16, keys));
}
s = VectorDecrypt(s, VectorLoadAligned((rounds-1)*16, keys));
s = VectorDecryptLast(s, VectorLoadAligned(rounds*16, keys));
s = VectorDecrypt(s, VectorLoadAligned(16, keys));
s = VectorDecryptLast(s, VectorLoadAligned(0, keys));
// According to benchmarks this is a tad bit slower
// if (xorBlock)
// s = VectorXor(s, VectorLoad(xorBlock));
VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
s = VectorXor(s, x);

View File

@ -113,6 +113,19 @@ CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
static volatile bool s_TeFilled = false, s_TdFilled = false;
ANONYMOUS_NAMESPACE_BEGIN
CRYPTOPP_ALIGN_DATA(16)
const word32 s_one[] = {0, 0, 0, 1<<24};
/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
CRYPTOPP_ALIGN_DATA(16)
const word32 s_rconLE[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
};
ANONYMOUS_NAMESPACE_END
// ************************* Portable Code ************************************
#define QUARTER_ROUND(L, T, t, a, b, c, d) \
@ -221,7 +234,7 @@ void Rijndael::Base::FillDecTable()
}
#if (CRYPTOPP_AESNI_AVAILABLE)
extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk, unsigned int rounds);
extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
@ -240,8 +253,6 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
extern void ByteReverseArrayLE(byte src[16]);
extern void Rijndael_UncheckedSetKey_POWER8(const byte *userKey, size_t keyLen, word32 *rk, CipherDir dir);
extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock);
extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
@ -263,7 +274,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
{
// TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
// Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk, m_rounds);
if (!IsForwardTransformation())
Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
@ -306,6 +317,25 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
rk = m_key;
#if CRYPTOPP_POWER8_AES_AVAILABLE
if (HasAES())
{
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#if defined(IS_LITTLE_ENDIAN)
// VSX registers are big-endian. The entire subkey table must be byte
// reversed on little-endian systems to ensure it loads properly.
byte * ptr = reinterpret_cast<byte*>(rk);
for (unsigned int i=0; i<=m_rounds; i++)
ByteReverseArrayLE(ptr+i*16);
#endif // IS_LITTLE_ENDIAN
return;
}
#endif // CRYPTOPP_POWER8_AES_AVAILABLE
if (IsForwardTransformation())
{
if (!s_TeFilled)
@ -351,20 +381,6 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
if (HasAES())
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
#if CRYPTOPP_POWER8_AES_AVAILABLE
if (IsForwardTransformation() && HasAES())
{
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#if defined(IS_LITTLE_ENDIAN)
// VSX registers are big-endian. The entire subkey table must be byte
// reversed on little-endian systems to ensure it loads properly.
byte * ptr = reinterpret_cast<byte*>(rk);
for (unsigned int i=0; i<=m_rounds; i++)
ByteReverseArrayLE(ptr+i*16);
#endif // IS_LITTLE_ENDIAN
}
#endif // CRYPTOPP_POWER8_AES_AVAILABLE
}
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@ -483,7 +499,7 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
}
#endif
#if (CRYPTOPP_POWER8_AES_AVAILABLE) && 0
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
if (HasAES())
{
(void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);