Cleanup PPC vector functions

The Crypto++ functions follow IBM's lead and provide VectorLoad, VectorLoadBE, VectorStore, and VectorStoreBE. Additionally, VectorLoadKey was removed in favor of vanilla VectorLoad.
Jeffrey Walton 2018-08-06 05:15:12 -04:00
parent 9c27143522
commit 6cd7f83346
GPG Key ID: B36AB348921B1838
3 changed files with 122 additions and 161 deletions
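
In new-code terms the split is: VectorLoadBE/VectorStoreBE pin the data layout to big endian (byte-swapping on little endian hosts), while VectorLoad/VectorStore move bytes verbatim and absorb the old VectorLoadKey. A minimal sketch of the resulting call pattern, assuming a POWER8 build with the Crypto++ PPC vector header on the include path (demo() and its buffers are hypothetical):

    #include "ppc_simd.h"   // assumption: header name; it has varied across releases

    void demo(const CryptoPP::byte msg[16], const CryptoPP::byte key[16],
              CryptoPP::byte out[16])
    {
        using namespace CryptoPP;
        uint32x4_p m = VectorLoadBE(msg);  // message bytes: fixed BE layout
        uint32x4_p k = VectorLoad(key);    // key material: was VectorLoadKey(key)
        m = VectorXor(m, k);               // whiten the block with the key
        VectorStoreBE(m, out);             // write the block back in BE layout
    }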

@@ -1849,44 +1849,44 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
     if (flags & BT_InBlockIsCounter)
     {
-        block0 = VectorLoad(inBlocks);
+        block0 = VectorLoadBE(inBlocks);
         block1 = VectorAdd(block0, s_one);
         block2 = VectorAdd(block1, s_one);
         block3 = VectorAdd(block2, s_one);
         block4 = VectorAdd(block3, s_one);
         block5 = VectorAdd(block4, s_one);
         temp = VectorAdd(block5, s_one);
-        VectorStore(temp, const_cast<byte*>(inBlocks));
+        VectorStoreBE(temp, const_cast<byte*>(inBlocks));
     }
     else
     {
-        block0 = VectorLoad(inBlocks);
+        block0 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
-        block1 = VectorLoad(inBlocks);
+        block1 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
-        block2 = VectorLoad(inBlocks);
+        block2 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
-        block3 = VectorLoad(inBlocks);
+        block3 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
-        block4 = VectorLoad(inBlocks);
+        block4 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
-        block5 = VectorLoad(inBlocks);
+        block5 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
     }

     if (xorInput)
     {
-        block0 = VectorXor(block0, VectorLoad(xorBlocks));
+        block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block1 = VectorXor(block1, VectorLoad(xorBlocks));
+        block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block2 = VectorXor(block2, VectorLoad(xorBlocks));
+        block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block3 = VectorXor(block3, VectorLoad(xorBlocks));
+        block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block4 = VectorXor(block4, VectorLoad(xorBlocks));
+        block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block5 = VectorXor(block5, VectorLoad(xorBlocks));
+        block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     }
@@ -1894,31 +1894,31 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
     if (xorOutput)
    {
-        block0 = VectorXor(block0, VectorLoad(xorBlocks));
+        block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block1 = VectorXor(block1, VectorLoad(xorBlocks));
+        block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block2 = VectorXor(block2, VectorLoad(xorBlocks));
+        block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block3 = VectorXor(block3, VectorLoad(xorBlocks));
+        block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block4 = VectorXor(block4, VectorLoad(xorBlocks));
+        block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block5 = VectorXor(block5, VectorLoad(xorBlocks));
+        block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     }

-    VectorStore(block0, outBlocks);
+    VectorStoreBE(block0, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);
-    VectorStore(block1, outBlocks);
+    VectorStoreBE(block1, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);
-    VectorStore(block2, outBlocks);
+    VectorStoreBE(block2, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);
-    VectorStore(block3, outBlocks);
+    VectorStoreBE(block3, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);
-    VectorStore(block4, outBlocks);
+    VectorStoreBE(block4, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);
-    VectorStore(block5, outBlocks);
+    VectorStoreBE(block5, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);

     length -= 6*blockSize;
@@ -1927,10 +1927,10 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
     while (length >= blockSize)
     {
-        uint32x4_p block = VectorLoad(inBlocks);
+        uint32x4_p block = VectorLoadBE(inBlocks);

         if (xorInput)
-            block = VectorXor(block, VectorLoad(xorBlocks));
+            block = VectorXor(block, VectorLoadBE(xorBlocks));

         if (flags & BT_InBlockIsCounter)
             const_cast<byte *>(inBlocks)[15]++;
@@ -1938,9 +1938,9 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
         func1(block, subKeys, rounds);

         if (xorOutput)
-            block = VectorXor(block, VectorLoad(xorBlocks));
+            block = VectorXor(block, VectorLoadBE(xorBlocks));

-        VectorStore(block, outBlocks);
+        VectorStoreBE(block, outBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
         outBlocks = PtrAdd(outBlocks, outIncrement);
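
The BT_InBlockIsCounter path above works because the counter block is treated as a big endian integer on every host: VectorLoadBE gives the lanes one fixed interpretation, vector adds of s_one step the counter, and VectorStoreBE persists the final value for the next call. A condensed sketch of that walk (hedged: the s_one literals mirror what adv_simd.h uses for 128-bit blocks, and carries do not propagate past the low 32-bit word, exactly as in the bulk path):

    #include "ppc_simd.h"   // assumption: header name

    // Advance a big endian 128-bit counter block by three, the same way
    // the ALTIVEC bulk path advances it by six.
    void BumpCounterBy3(CryptoPP::byte counter[16])
    {
        using namespace CryptoPP;
    #if defined(CRYPTOPP_LITTLE_ENDIAN)
        const uint32x4_p s_one = {1, 0, 0, 0};  // assumption: adv_simd.h's LE literal
    #else
        const uint32x4_p s_one = {0, 0, 0, 1};
    #endif
        uint32x4_p c = VectorLoadBE(counter);
        c = VectorAdd(c, s_one);                // counter + 1
        c = VectorAdd(c, s_one);                // counter + 2
        c = VectorAdd(c, s_one);                // counter + 3
        VectorStoreBE(c, counter);              // persist for the next call
    }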

@@ -32,11 +32,11 @@
 # undef CRYPTOPP_POWER7_AVAILABLE
 #endif

-#if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(_CRYPTO))
+#if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(__CRYPTO) || defined(__CRYPTO__))
 # undef CRYPTOPP_POWER8_AVAILABLE
 # undef CRYPTOPP_POWER8_AES_AVAILABLE
-# undef CRYPTOPP_POWER8_SHA_AVAILABLE
 # undef CRYPTOPP_POWER8_PMULL_AVAILABLE
+# undef CRYPTOPP_POWER8_SHA_AVAILABLE
 #endif

 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
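
The repaired guard accepts both underscore spellings of the compiler's crypto feature macro; the old `_CRYPTO` spelling matches neither, so the POWER8 paths could be disabled even when crypto support was present. A quick hypothetical probe (not part of Crypto++) to see which of these macros a given compiler defines:

    #include <cstdio>

    int main()
    {
    #if defined(__CRYPTO__)
        std::puts("__CRYPTO__ is defined");
    #endif
    #if defined(__CRYPTO)
        std::puts("__CRYPTO is defined");
    #endif
    #if defined(_ARCH_PWR8)
        std::puts("_ARCH_PWR8 is defined");
    #endif
    #if defined(_ARCH_PWR9)
        std::puts("_ARCH_PWR9 is defined");
    #endif
        return 0;
    }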
@@ -65,7 +65,7 @@ typedef __vector unsigned long long uint64x2_p;
 /// \tparam T vector type
 /// \param src the vector
 /// \details Reverse() endian swaps the bytes in a vector
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 template <class T>
 inline T Reverse(const T& src)
@@ -151,33 +151,45 @@ inline T1 VectorShiftLeft(const T1& vec1, const T2& vec2)
 #endif
 }

 /// \brief Shift two vectors right
 /// \tparam C shift byte count
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \details VectorShiftRight() concatenates vec1 and vec2 and returns a
 ///   new vector after shifting the concatenation by the specified number
 ///   of bytes. Both vec1 and vec2 are cast to uint8x16_p. The return
 ///   vector is the same type as vec1.
 /// \details On big endian machines VectorShiftRight() is <tt>vec_sld(a, b,
 ///   c)</tt>. On little endian machines VectorShiftRight() is translated to
 ///   <tt>vec_sld(b, a, 16-c)</tt>. You should always call the function as
 ///   if on a big endian machine as shown below.
 /// <pre>
 ///   uint8x16_p r0 = {0};
 ///   uint8x16_p r1 = VectorLoad(ptr);
 ///   uint8x16_p r5 = VectorShiftRight<12>(r0, r1);
 /// </pre>
 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
 ///   endian sensitive?</A> on Stack Overflow
 /// \since Crypto++ 6.0
 template <unsigned int C, class T1, class T2>
 inline T1 VectorShiftRight(const T1& vec1, const T2& vec2)
 {
     return VectorShiftLeft<16-C>(vec1, vec2);
 }

 #endif  // POWER4 and above

 // POWER7/POWER4 load and store
 #if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

 /// \brief Reverse a 16-byte array
 /// \param src the byte array
 /// \details ReverseByteArrayLE reverses a 16-byte array on a little endian
 ///   system. It does nothing on a big endian system.
 /// \since Crypto++ 6.0
 inline void ReverseByteArrayLE(byte src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION) && defined(CRYPTOPP_LITTLE_ENDIAN)
     vec_st(vec_reve(vec_ld(0, src)), 0, src);
 #elif defined(CRYPTOPP_LITTLE_ENDIAN)
     const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
     const uint8x16_p zero = {0};
     vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, src);
 #endif
 }
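
ReverseByteArrayLE is endian-conditional: a no-op on big endian systems, and a full 16-byte in-place reversal on little endian ones (vec_reve under XL C, a vec_perm with a reversing mask elsewhere). A scalar model for illustration only (hypothetical helper name, same observable behavior):

    #include <algorithm>

    // Scalar equivalent of ReverseByteArrayLE: reverse all 16 bytes in
    // place on little endian, leave them untouched on big endian.
    void ReverseByteArrayLE_Model(unsigned char src[16])
    {
    #if defined(CRYPTOPP_LITTLE_ENDIAN)  // assumption: Crypto++'s endian macro
        std::reverse(src, src + 16);
    #endif
    }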
 /// \brief Loads a vector from a byte array
 /// \param src the byte array
 /// \details Loads a vector in big endian format from a byte array.
 ///   VectorLoadBE will swap endianess on little endian systems.
 /// \note VectorLoadBE() does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 inline uint32x4_p VectorLoadBE(const uint8_t src[16])
 {
@@ -198,7 +210,7 @@ inline uint32x4_p VectorLoadBE(const uint8_t src[16])
 /// \details Loads a vector in big endian format from a byte array.
 ///   VectorLoadBE will swap endianess on little endian systems.
 /// \note VectorLoadBE does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 inline uint32x4_p VectorLoadBE(int off, const uint8_t src[16])
 {
@@ -215,70 +227,27 @@ inline uint32x4_p VectorLoadBE(int off, const uint8_t src[16])
 /// \brief Loads a vector from a byte array
 /// \param src the byte array
-/// \details Loads a vector in big endian format from a byte array.
-///   VectorLoad will swap endianess on little endian systems.
+/// \details Loads a vector in native endian format from a byte array.
 /// \note VectorLoad does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 inline uint32x4_p VectorLoad(const byte src[16])
 {
-    return (uint32x4_p)VectorLoadBE(src);
+#if defined(CRYPTOPP_XLC_VERSION)
+    return (uint32x4_p)vec_xl(0, (byte*)src);
+#else
+    return (uint32x4_p)vec_vsx_ld(0, src);
+#endif
 }

 /// \brief Loads a vector from a byte array
 /// \param src the byte array
 /// \param off offset into the src byte array
-/// \details Loads a vector in big endian format from a byte array.
-///   VectorLoad will swap endianess on little endian systems.
+/// \details Loads a vector in native endian format from a byte array.
 /// \note VectorLoad does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 inline uint32x4_p VectorLoad(int off, const byte src[16])
 {
-    return (uint32x4_p)VectorLoadBE(off, src);
-}
-
-/// \brief Loads a vector from a byte array
-/// \param src the byte array
-/// \details Loads a vector from a byte array.
-///   VectorLoadKey does not swap endianess on little endian systems.
-/// \note VectorLoadKey does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
-/// \since Crypto++ 6.0
-inline uint32x4_p VectorLoadKey(const byte src[16])
-{
-#if defined(CRYPTOPP_XLC_VERSION)
-    return (uint32x4_p)vec_xl(0, (byte*)src);
-#else
-    return (uint32x4_p)vec_vsx_ld(0, src);
-#endif
-}
-
-/// \brief Loads a vector from a 32-bit word array
-/// \param src the 32-bit word array
-/// \details Loads a vector from a 32-bit word array.
-///   VectorLoadKey does not swap endianess on little endian systems.
-/// \note VectorLoadKey does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
-/// \since Crypto++ 6.0
-inline uint32x4_p VectorLoadKey(const word32 src[4])
-{
-#if defined(CRYPTOPP_XLC_VERSION)
-    return (uint32x4_p)vec_xl(0, (byte*)src);
-#else
-    return (uint32x4_p)vec_vsx_ld(0, src);
-#endif
-}
-
-/// \brief Loads a vector from a byte array
-/// \param src the byte array
-/// \param off offset into the src byte array
-/// \details Loads a vector from a byte array.
-///   VectorLoadKey does not swap endianess on little endian systems.
-/// \note VectorLoadKey does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
-/// \since Crypto++ 6.0
-inline uint32x4_p VectorLoadKey(int off, const byte src[16])
-{
 #if defined(CRYPTOPP_XLC_VERSION)
     return (uint32x4_p)vec_xl(off, (byte*)src);
@@ -294,7 +263,7 @@ inline uint32x4_p VectorLoadKey(int off, const byte src[16])
 /// \details Stores a vector in big endian format to a byte array.
 ///   VectorStoreBE will swap endianess on little endian systems.
 /// \note VectorStoreBE does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 template <class T>
 inline void VectorStoreBE(const T& src, uint8_t dest[16])
@@ -318,7 +287,7 @@ inline void VectorStoreBE(const T& src, uint8_t dest[16])
 /// \details Stores a vector in big endian format to a byte array.
 ///   VectorStoreBE will swap endianess on little endian systems.
 /// \note VectorStoreBE does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 template <class T>
 inline void VectorStoreBE(const T& src, int off, uint8_t dest[16])
@@ -338,8 +307,7 @@ inline void VectorStoreBE(const T& src, int off, uint8_t dest[16])
 /// \tparam T vector type
 /// \param src the vector
 /// \param dest the byte array
-/// \details Stores a vector in big endian format to a byte array.
-///   VectorStore will swap endianess on little endian systems.
+/// \details Stores a vector in native endian format to a byte array.
 /// \note VectorStore does not require an aligned array.
 /// \since Crypto++ 6.0
 template<class T>
@@ -347,13 +315,9 @@ inline void VectorStore(const T& src, byte dest[16])
 {
     // Do not call VectorStoreBE. It slows us down by about 0.5 cpb on LE.
 #if defined(CRYPTOPP_XLC_VERSION)
-    vec_xst_be((uint8x16_p)src, 0, dest);
+    vec_xst((uint8x16_p)src, 0, dest);
 #else
-# if defined(CRYPTOPP_LITTLE_ENDIAN)
-    vec_vsx_st(Reverse((uint8x16_p)src), 0, dest);
-# else
     vec_vsx_st((uint8x16_p)src, 0, dest);
-# endif
 #endif
 }
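
With the swap removed, VectorStore is now the exact inverse of VectorLoad, just as VectorStoreBE is of VectorLoadBE, so either pair round-trips a 16-byte buffer unchanged on both endiannesses. A hedged sketch of that property (hypothetical function; assumes a POWER8 build):

    #include "ppc_simd.h"   // assumption: header name
    #include <cstring>

    // What VectorLoadBE reads, VectorStoreBE writes back byte-for-byte.
    bool RoundTripsBE(const CryptoPP::byte in[16])
    {
        using namespace CryptoPP;
        byte out[16];
        VectorStoreBE(VectorLoadBE(in), out);
        return std::memcmp(in, out, 16) == 0;  // expected: always true
    }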
@@ -362,8 +326,7 @@ inline void VectorStore(const T& src, byte dest[16])
 /// \param src the vector
 /// \param off offset into the dest byte array
 /// \param dest the byte array
-/// \details Stores a vector in big endian format to a byte array.
-///   VectorStore will swap endianess on little endian systems.
+/// \details Stores a vector in native endian format to a byte array.
 /// \note VectorStore does not require an aligned array.
 /// \since Crypto++ 6.0
 template<class T>
@@ -371,13 +334,9 @@ inline void VectorStore(const T& src, int off, byte dest[16])
 {
     // Do not call VectorStoreBE. It slows us down by about 0.5 cpb on LE.
 #if defined(CRYPTOPP_XLC_VERSION)
-    vec_xst_be((uint8x16_p)src, off, dest);
+    vec_xst((uint8x16_p)src, off, dest);
 #else
-# if defined(CRYPTOPP_LITTLE_ENDIAN)
-    vec_vsx_st(Reverse((uint8x16_p)src), off, dest);
-# else
     vec_vsx_st((uint8x16_p)src, off, dest);
-# endif
 #endif
 }
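
Taken together, the header now has one convention: data blocks travel through the BE functions so their layout is fixed, while round keys, already permuted into the machine's preferred layout at key setup (see the Rijndael change below), travel through the plain native functions. A sketch of one AES step under that convention (hypothetical function and buffers; VectorEncrypt is the POWER8 wrapper used in the next file and needs a POWER8 crypto build):

    #include "ppc_simd.h"   // assumption: header name

    // Whiten with round key 0, then run one AES round with round key 1:
    // BE load for the data block, native loads for pre-permuted keys.
    CryptoPP::uint32x4_p OneAesStep(const CryptoPP::byte block[16],
                                    const CryptoPP::byte keys[32])
    {
        using namespace CryptoPP;
        uint32x4_p b = VectorLoadBE(block);
        b = VectorXor(b, VectorLoad(keys));
        return VectorEncrypt(b, VectorLoad(16, keys));
    }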

@@ -697,17 +697,17 @@ static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, un
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    uint32x4_p k = VectorLoadKey(keys);
+    uint32x4_p k = VectorLoad(keys);
     block = VectorXor(block, k);

     for (size_t i=1; i<rounds-1; i+=2)
     {
-        block = VectorEncrypt(block, VectorLoadKey( i*16, keys));
-        block = VectorEncrypt(block, VectorLoadKey((i+1)*16, keys));
+        block = VectorEncrypt(block, VectorLoad( i*16, keys));
+        block = VectorEncrypt(block, VectorLoad((i+1)*16, keys));
     }

-    block = VectorEncrypt(block, VectorLoadKey((rounds-1)*16, keys));
-    block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
+    block = VectorEncrypt(block, VectorLoad((rounds-1)*16, keys));
+    block = VectorEncryptLast(block, VectorLoad(rounds*16, keys));
 }

 static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@@ -717,7 +717,7 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    uint32x4_p k = VectorLoadKey(keys);
+    uint32x4_p k = VectorLoad(keys);
     block0 = VectorXor(block0, k);
     block1 = VectorXor(block1, k);
     block2 = VectorXor(block2, k);
@@ -727,7 +727,7 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     for (size_t i=1; i<rounds; ++i)
     {
-        k = VectorLoadKey(i*16, keys);
+        k = VectorLoad(i*16, keys);
         block0 = VectorEncrypt(block0, k);
         block1 = VectorEncrypt(block1, k);
         block2 = VectorEncrypt(block2, k);
@@ -736,7 +736,7 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
         block5 = VectorEncrypt(block5, k);
     }

-    k = VectorLoadKey(rounds*16, keys);
+    k = VectorLoad(rounds*16, keys);
     block0 = VectorEncryptLast(block0, k);
     block1 = VectorEncryptLast(block1, k);
     block2 = VectorEncryptLast(block2, k);
@@ -750,17 +750,17 @@ static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, un
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    uint32x4_p k = VectorLoadKey(rounds*16, keys);
+    uint32x4_p k = VectorLoad(rounds*16, keys);
     block = VectorXor(block, k);

     for (size_t i=rounds-1; i>1; i-=2)
     {
-        block = VectorDecrypt(block, VectorLoadKey( i*16, keys));
-        block = VectorDecrypt(block, VectorLoadKey((i-1)*16, keys));
+        block = VectorDecrypt(block, VectorLoad( i*16, keys));
+        block = VectorDecrypt(block, VectorLoad((i-1)*16, keys));
     }

-    block = VectorDecrypt(block, VectorLoadKey(16, keys));
-    block = VectorDecryptLast(block, VectorLoadKey(0, keys));
+    block = VectorDecrypt(block, VectorLoad(16, keys));
+    block = VectorDecryptLast(block, VectorLoad(0, keys));
 }

 static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@@ -770,7 +770,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    uint32x4_p k = VectorLoadKey(rounds*16, keys);
+    uint32x4_p k = VectorLoad(rounds*16, keys);
     block0 = VectorXor(block0, k);
     block1 = VectorXor(block1, k);
     block2 = VectorXor(block2, k);
@@ -780,7 +780,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     for (size_t i=rounds-1; i>0; --i)
     {
-        k = VectorLoadKey(i*16, keys);
+        k = VectorLoad(i*16, keys);
         block0 = VectorDecrypt(block0, k);
         block1 = VectorDecrypt(block1, k);
         block2 = VectorDecrypt(block2, k);
@@ -789,7 +789,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
         block5 = VectorDecrypt(block5, k);
     }

-    k = VectorLoadKey(0, keys);
+    k = VectorLoad(0, keys);
     block0 = VectorDecryptLast(block0, k);
     block1 = VectorDecryptLast(block1, k);
     block2 = VectorDecryptLast(block2, k);
@@ -804,60 +804,62 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32*
 {
     const size_t rounds = keyLen / 4 + 6;
     const word32 *rc = s_rconBE;
-    word32 *rk_saved = rk, temp; // unused in big-endian
-    CRYPTOPP_UNUSED(rk_saved);
-    GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
+    word32 *rkey = rk, temp;
+    GetUserKey(BIG_ENDIAN_ORDER, rkey, keyLen/4, userKey, keyLen);

     // keySize: m_key allocates 4*(rounds+1) word32's.
     const size_t keySize = 4*(rounds+1);
-    const word32* end = rk + keySize;
+    const word32* end = rkey + keySize;

     while (true)
     {
-        temp = rk[keyLen/4-1];
+        temp = rkey[keyLen/4-1];
         word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
                    (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
-        rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
-        rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
-        rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
-        rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
+        rkey[keyLen/4] = rkey[0] ^ x ^ *(rc++);
+        rkey[keyLen/4+1] = rkey[1] ^ rkey[keyLen/4];
+        rkey[keyLen/4+2] = rkey[2] ^ rkey[keyLen/4+1];
+        rkey[keyLen/4+3] = rkey[3] ^ rkey[keyLen/4+2];

-        if (rk + keyLen/4 + 4 == end)
+        if (rkey + keyLen/4 + 4 == end)
             break;

         if (keyLen == 24)
         {
-            rk[10] = rk[ 4] ^ rk[ 9];
-            rk[11] = rk[ 5] ^ rk[10];
+            rkey[10] = rkey[ 4] ^ rkey[ 9];
+            rkey[11] = rkey[ 5] ^ rkey[10];
         }
         else if (keyLen == 32)
         {
-            temp = rk[11];
-            rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
-            rk[13] = rk[ 5] ^ rk[12];
-            rk[14] = rk[ 6] ^ rk[13];
-            rk[15] = rk[ 7] ^ rk[14];
+            temp = rkey[11];
+            rkey[12] = rkey[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
+            rkey[13] = rkey[ 5] ^ rkey[12];
+            rkey[14] = rkey[ 6] ^ rkey[13];
+            rkey[15] = rkey[ 7] ^ rkey[14];
         }

-        rk += keyLen/4;
+        rkey += keyLen/4;
     }

 #if defined(CRYPTOPP_LITTLE_ENDIAN)
-    rk = rk_saved;
+    rkey = rk;
     const uint8x16_p mask = ((uint8x16_p){12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3});
     const uint8x16_p zero = {0};

     unsigned int i=0;
-    for (i=0; i<rounds; i+=2, rk+=8)
+    for (i=0; i<rounds; i+=2, rkey+=8)
     {
-        const uint8x16_p d1 = vec_vsx_ld( 0, (uint8_t*)rk);
-        const uint8x16_p d2 = vec_vsx_ld(16, (uint8_t*)rk);
-        vec_vsx_st(vec_perm(d1, zero, mask),  0, (uint8_t*)rk);
-        vec_vsx_st(vec_perm(d2, zero, mask), 16, (uint8_t*)rk);
+        const uint8x16_p d1 = vec_vsx_ld( 0, (uint8_t*)rkey);
+        const uint8x16_p d2 = vec_vsx_ld(16, (uint8_t*)rkey);
+        vec_vsx_st(vec_perm(d1, zero, mask),  0, (uint8_t*)rkey);
+        vec_vsx_st(vec_perm(d2, zero, mask), 16, (uint8_t*)rkey);
     }

-    for ( ; i<rounds+1; i++, rk+=4)
-        vec_vsx_st(vec_perm(vec_vsx_ld(0, (uint8_t*)rk), zero, mask), 0, (uint8_t*)rk);
+    for ( ; i<rounds+1; i++, rkey+=4)
+    {
+        const uint8x16_p d = vec_vsx_ld( 0, (uint8_t*)rkey);
+        vec_vsx_st(vec_perm(d, zero, mask), 0, (uint8_t*)rkey);
+    }
 #endif
 }
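
The little endian fix-up above permutes each 16-byte row of the expanded key with the mask {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}: the order of the four 32-bit words is reversed while the bytes inside each word stay put. That pre-permutation is what lets the encrypt and decrypt paths use plain VectorLoad on round keys. A scalar model of the same permutation (illustration only; hypothetical helper name):

    #include <cstdint>
    #include <cstring>

    // dest[i] = src[mask[i]], the effect of the vec_perm above: reverse
    // the word order of a 16-byte round-key row, keep bytes within words.
    void PermuteKeyRow(uint8_t row[16])
    {
        static const uint8_t mask[16] =
            {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
        uint8_t tmp[16];
        for (int i = 0; i < 16; ++i)
            tmp[i] = row[mask[i]];
        std::memcpy(row, tmp, 16);
    }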