Add Power8 AES decryption

parent cfb63decec
commit b090e5f69f
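
This change wires up the POWER8 in-core AES decryption path: the key schedule is byte-reversed for VSX at setup time, Rijndael_Dec_ProcessAndXorBlock_POWER8 now walks the subkeys from the last round key back to the first, and the previously disabled (`&& 0`) POWER8 branch in Rijndael::Dec::ProcessAndXorBlock is switched on. Along the way, duplicated rcon tables are consolidated into shared aligned s_one/s_rconLE/s_rconBE constants, and Rijndael_UncheckedSetKey_SSE4_AESNI now receives the round count from its caller.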
@@ -154,10 +154,11 @@ const byte Rijndael::Base::Sd[256] = {
     0x55, 0x21, 0x0c, 0x7d,
 };
 
+/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
 const word32 Rijndael::Base::rcon[] = {
     0x01000000, 0x02000000, 0x04000000, 0x08000000,
     0x10000000, 0x20000000, 0x40000000, 0x80000000,
-    0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+    0x1B000000, 0x36000000
 };
 
 NAMESPACE_END

@@ -45,8 +45,9 @@
 
 // Don't include <arm_acle.h> when using Apple Clang. Early Apple compilers
 // fail to compile with <arm_acle.h> included. Later Apple compilers compile
-// intrinsics without <arm_acle.h> included.
-#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
+// intrinsics without <arm_acle.h> included. Also avoid it with GCC 4.8.
+#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION) && \
+    (!defined(CRYPTOPP_GCC_VERSION) || (CRYPTOPP_GCC_VERSION >= 40900))
 # include <arm_acle.h>
 #endif
 
@@ -158,6 +159,24 @@ bool CPU_ProbeAES()
 }
 #endif  // ARM32 or ARM64
 
+ANONYMOUS_NAMESPACE_BEGIN
+
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_one[] = {0, 0, 0, 1<<24};
+
+/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_rconLE[] = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_rconBE[] = {
+    0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000,
+    0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000
+};
+
+ANONYMOUS_NAMESPACE_END
+
 // ***************************** ARMv8 ***************************** //
 
 #if (CRYPTOPP_ARM_AES_AVAILABLE)
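
The hunk above replaces per-function copies of the rcon constants with two shared, 16-byte-aligned tables: s_rconLE for the AES-NI key schedule (round constant in the low byte) and s_rconBE for code that works on big-endian words. Both tables encode the same ten constants; a standalone check, with the values copied from the hunk, makes the relationship explicit:

```cpp
// Sanity check (values copied from the hunk above): each big-endian
// constant is the little-endian rcon byte shifted into the most
// significant byte of the 32-bit word.
#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t rconLE[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
                                 0x20, 0x40, 0x80, 0x1B, 0x36};
    const uint32_t rconBE[10] = {0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000,
                                 0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000};
    for (int i = 0; i < 10; ++i)
        assert(rconBE[i] == rconLE[i] << 24);
    return 0;
}
```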
@@ -323,15 +342,6 @@ inline void ARMV8_Dec_4_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_
     block3 = veorq_u8(block3, vld1q_u8(keys+(i+1)*16));
 }
 
-const word32 s_one[] = {0, 0, 0, 1<<24};
-
-/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
-const word32 rcon[] = {
-    0x01, 0x02, 0x04, 0x08,
-    0x10, 0x20, 0x40, 0x80,
-    0x1B, 0x36
-};
-
 template <typename F1, typename F4>
 size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
@@ -537,9 +547,6 @@ inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2
     block3 = _mm_aesdeclast_si128(block3, rk);
 }
 
-CRYPTOPP_ALIGN_DATA(16)
-static const word32 s_one[] = {0, 0, 0, 1<<24};
-
 template <typename F1, typename F4>
 inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
     MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
@@ -680,16 +687,9 @@ size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subKeys, size_t ro
     sk, rounds, ib, xb, outBlocks, length, flags);
 }
 
-void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk)
+void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk, unsigned int rounds)
 {
-    const unsigned rounds = static_cast<unsigned int>(keyLen/4 + 6);
-    static const word32 rcLE[] = {
-        0x01, 0x02, 0x04, 0x08,
-        0x10, 0x20, 0x40, 0x80,
-        0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
-    };
-
-    const word32 *ro = rcLE, *rc = rcLE;
+    const word32 *ro = s_rconLE, *rc = s_rconLE;
     CRYPTOPP_UNUSED(ro);
 
     __m128i temp = _mm_loadu_si128(M128_CAST(userKey+keyLen-16));
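
Note the dropped line: the function used to derive the round count itself as keyLen/4 + 6, and the new parameter lets the caller pass Rijndael's m_rounds instead. A minimal check of that formula for the three AES key sizes (the helper name is hypothetical, for illustration only):

```cpp
#include <cassert>
#include <cstddef>

// Illustrative only: reproduces the removed keyLen/4 + 6 computation.
static unsigned int RoundsForKeyLength(std::size_t keyLen)
{
    return static_cast<unsigned int>(keyLen/4 + 6);
}

int main()
{
    assert(RoundsForKeyLength(16) == 10);  // AES-128
    assert(RoundsForKeyLength(24) == 12);  // AES-192
    assert(RoundsForKeyLength(32) == 14);  // AES-256
    return 0;
}
```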
@@ -700,7 +700,7 @@ void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, wor
     const word32* end = rk + keySize;
     while (true)
     {
-        CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE));
+        CRYPTOPP_ASSERT(rc < ro + COUNTOF(s_rconLE));
         rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
         rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
         rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
@@ -1011,17 +1011,21 @@ void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
 
     VectorType s = VectorLoad(inBlock);
-    VectorType k = VectorLoadAligned(keys);
+    VectorType k = VectorLoadAligned(rounds*16, keys);
 
     s = VectorXor(s, k);
-    for (size_t i=1; i<rounds-1; i+=2)
+    for (size_t i=rounds-1; i>1; i-=2)
     {
         s = VectorDecrypt(s, VectorLoadAligned( i*16, keys));
-        s = VectorDecrypt(s, VectorLoadAligned((i+1)*16, keys));
+        s = VectorDecrypt(s, VectorLoadAligned((i-1)*16, keys));
     }
 
-    s = VectorDecrypt(s, VectorLoadAligned((rounds-1)*16, keys));
-    s = VectorDecryptLast(s, VectorLoadAligned(rounds*16, keys));
+    s = VectorDecrypt(s, VectorLoadAligned(16, keys));
+    s = VectorDecryptLast(s, VectorLoadAligned(0, keys));
+
+    // According to benchmarks this is a tad bit slower
+    // if (xorBlock)
+    //     s = VectorXor(s, VectorLoad(xorBlock));
 
     VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
     s = VectorXor(s, x);
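
The rewritten loop is the heart of the decryption fix: instead of walking the schedule forward as the encryption path does, it starts the whitening XOR at the last round key (offset rounds*16) and applies the remaining keys in descending order, finishing VectorDecryptLast with key 0. A standalone trace of the byte offsets visited, assuming rounds = 10 (AES-128):

```cpp
#include <cstdio>

int main()
{
    const unsigned rounds = 10;  // AES-128
    std::printf("xor  key at offset %u\n", rounds*16);     // 160
    for (unsigned i = rounds-1; i > 1; i -= 2)
    {
        std::printf("dec  key at offset %u\n", i*16);      // 144, 112, 80, 48
        std::printf("dec  key at offset %u\n", (i-1)*16);  // 128, 96, 64, 32
    }
    std::printf("dec  key at offset %u\n", 16u);           // 16
    std::printf("last key at offset %u\n", 0u);            // 0
    return 0;
}
```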
rijndael.cpp (54 lines changed)
@@ -113,6 +113,19 @@ CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
 
 static volatile bool s_TeFilled = false, s_TdFilled = false;
 
+ANONYMOUS_NAMESPACE_BEGIN
+
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_one[] = {0, 0, 0, 1<<24};
+
+/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_rconLE[] = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+
+ANONYMOUS_NAMESPACE_END
+
 // ************************* Portable Code ************************************
 
 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
@@ -221,7 +234,7 @@ void Rijndael::Base::FillDecTable()
 }
 
 #if (CRYPTOPP_AESNI_AVAILABLE)
-extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
+extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk, unsigned int rounds);
 extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
 
 extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
@@ -240,8 +253,6 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
 extern void ByteReverseArrayLE(byte src[16]);
 
-extern void Rijndael_UncheckedSetKey_POWER8(const byte *userKey, size_t keyLen, word32 *rk, CipherDir dir);
-
 extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
     const byte *inBlock, const byte *xorBlock, byte *outBlock);
 extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
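
The dedicated Rijndael_UncheckedSetKey_POWER8 declaration goes away because POWER8 no longer needs a separate key-setup routine: as the later hunk in UncheckedSetKey shows, the portable schedule is simply byte-reversed in place for VSX.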
@@ -263,7 +274,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
     {
         // TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
         // Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
-        Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
+        Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk, m_rounds);
         if (!IsForwardTransformation())
             Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
 
@@ -306,6 +317,25 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
 
     rk = m_key;
 
+#if CRYPTOPP_POWER8_AES_AVAILABLE
+    if (HasAES())
+    {
+        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
+        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
+        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
+
+#if defined(IS_LITTLE_ENDIAN)
+        // VSX registers are big-endian. The entire subkey table must be byte
+        // reversed on little-endian systems to ensure it loads properly.
+        byte * ptr = reinterpret_cast<byte*>(rk);
+        for (unsigned int i=0; i<=m_rounds; i++)
+            ByteReverseArrayLE(ptr+i*16);
+#endif  // IS_LITTLE_ENDIAN
+
+        return;
+    }
+#endif  // CRYPTOPP_POWER8_AES_AVAILABLE
+
 if (IsForwardTransformation())
 {
     if (!s_TeFilled)
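
The new block byte-swaps the whole schedule once at key-setup time: ConditionalByteReverse puts the subkey words in big-endian order, and on little-endian builds each 16-byte round key is additionally reversed so a VSX register load presents the lanes in the order the in-core AES instructions expect. A portable sketch of the per-key reversal ByteReverseArrayLE needs to achieve (the shipped routine may use a vector permute rather than a scalar loop):

```cpp
#include <algorithm>

// Reverse one 16-byte round key in place. Illustrative stand-in for
// ByteReverseArrayLE, which can achieve the same effect with VSX permutes.
static void ByteReverse16(unsigned char key[16])
{
    std::reverse(key, key + 16);
}
```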
@@ -351,20 +381,6 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
         if (HasAES())
             ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
 #endif
-#if CRYPTOPP_POWER8_AES_AVAILABLE
-    if (IsForwardTransformation() && HasAES())
-    {
-        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
-
-#if defined(IS_LITTLE_ENDIAN)
-        // VSX registers are big-endian. The entire subkey table must be byte
-        // reversed on little-endian systems to ensure it loads properly.
-        byte * ptr = reinterpret_cast<byte*>(rk);
-        for (unsigned int i=0; i<=m_rounds; i++)
-            ByteReverseArrayLE(ptr+i*16);
-#endif // IS_LITTLE_ENDIAN
-    }
-#endif // CRYPTOPP_POWER8_AES_AVAILABLE
 }
 
 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@@ -483,7 +499,7 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
     }
 #endif
 
-#if (CRYPTOPP_POWER8_AES_AVAILABLE) && 0
+#if (CRYPTOPP_POWER8_AES_AVAILABLE)
     if (HasAES())
     {
         (void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
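
With the `&& 0` disable-guard removed, Rijndael::Dec::ProcessAndXorBlock now dispatches to the POWER8 routine at runtime whenever HasAES() reports the in-core crypto instructions, mirroring the existing encryption path.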