mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-26 19:30:21 +00:00
Cleanup PPC vector functions
The Crypto++ functions follow IBM's lead and provide VectorLoad, VectorLoadBE, VectorStore, and VectorStoreBE. Additionally, VectorLoadKey was removed in favor of vanilla VectorLoad.
This commit is contained in:
parent
9c27143522
commit
6cd7f83346
60
adv-simd.h
60
adv-simd.h
@@ -1849,44 +1849,44 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
|
||||
|
||||
if (flags & BT_InBlockIsCounter)
|
||||
{
|
||||
block0 = VectorLoad(inBlocks);
|
||||
block0 = VectorLoadBE(inBlocks);
|
||||
block1 = VectorAdd(block0, s_one);
|
||||
block2 = VectorAdd(block1, s_one);
|
||||
block3 = VectorAdd(block2, s_one);
|
||||
block4 = VectorAdd(block3, s_one);
|
||||
block5 = VectorAdd(block4, s_one);
|
||||
temp = VectorAdd(block5, s_one);
|
||||
VectorStore(temp, const_cast<byte*>(inBlocks));
|
||||
VectorStoreBE(temp, const_cast<byte*>(inBlocks));
|
||||
}
|
||||
else
|
||||
{
|
||||
block0 = VectorLoad(inBlocks);
|
||||
block0 = VectorLoadBE(inBlocks);
|
||||
inBlocks = PtrAdd(inBlocks, inIncrement);
|
||||
block1 = VectorLoad(inBlocks);
|
||||
block1 = VectorLoadBE(inBlocks);
|
||||
inBlocks = PtrAdd(inBlocks, inIncrement);
|
||||
block2 = VectorLoad(inBlocks);
|
||||
block2 = VectorLoadBE(inBlocks);
|
||||
inBlocks = PtrAdd(inBlocks, inIncrement);
|
||||
block3 = VectorLoad(inBlocks);
|
||||
block3 = VectorLoadBE(inBlocks);
|
||||
inBlocks = PtrAdd(inBlocks, inIncrement);
|
||||
block4 = VectorLoad(inBlocks);
|
||||
block4 = VectorLoadBE(inBlocks);
|
||||
inBlocks = PtrAdd(inBlocks, inIncrement);
|
||||
block5 = VectorLoad(inBlocks);
|
||||
block5 = VectorLoadBE(inBlocks);
|
||||
inBlocks = PtrAdd(inBlocks, inIncrement);
|
||||
}
|
||||
|
||||
if (xorInput)
|
||||
{
|
||||
block0 = VectorXor(block0, VectorLoad(xorBlocks));
|
||||
block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
block1 = VectorXor(block1, VectorLoad(xorBlocks));
|
||||
block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
block2 = VectorXor(block2, VectorLoad(xorBlocks));
|
||||
block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
block3 = VectorXor(block3, VectorLoad(xorBlocks));
|
||||
block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
block4 = VectorXor(block4, VectorLoad(xorBlocks));
|
||||
block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
block5 = VectorXor(block5, VectorLoad(xorBlocks));
|
||||
block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
}
|
||||
|
||||
@@ -1894,31 +1894,31 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
|
||||
|
||||
if (xorOutput)
|
||||
{
|
||||
block0 = VectorXor(block0, VectorLoad(xorBlocks));
|
||||
block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
block1 = VectorXor(block1, VectorLoad(xorBlocks));
|
||||
block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
block2 = VectorXor(block2, VectorLoad(xorBlocks));
|
||||
block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
block3 = VectorXor(block3, VectorLoad(xorBlocks));
|
||||
block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
block4 = VectorXor(block4, VectorLoad(xorBlocks));
|
||||
block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
block5 = VectorXor(block5, VectorLoad(xorBlocks));
|
||||
block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
|
||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||
}
|
||||
|
||||
VectorStore(block0, outBlocks);
|
||||
VectorStoreBE(block0, outBlocks);
|
||||
outBlocks = PtrAdd(outBlocks, outIncrement);
|
||||
VectorStore(block1, outBlocks);
|
||||
VectorStoreBE(block1, outBlocks);
|
||||
outBlocks = PtrAdd(outBlocks, outIncrement);
|
||||
VectorStore(block2, outBlocks);
|
||||
VectorStoreBE(block2, outBlocks);
|
||||
outBlocks = PtrAdd(outBlocks, outIncrement);
|
||||
VectorStore(block3, outBlocks);
|
||||
VectorStoreBE(block3, outBlocks);
|
||||
outBlocks = PtrAdd(outBlocks, outIncrement);
|
||||
VectorStore(block4, outBlocks);
|
||||
VectorStoreBE(block4, outBlocks);
|
||||
outBlocks = PtrAdd(outBlocks, outIncrement);
|
||||
VectorStore(block5, outBlocks);
|
||||
VectorStoreBE(block5, outBlocks);
|
||||
outBlocks = PtrAdd(outBlocks, outIncrement);
|
||||
|
||||
length -= 6*blockSize;
|
||||
@@ -1927,10 +1927,10 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
|
||||
|
||||
while (length >= blockSize)
|
||||
{
|
||||
uint32x4_p block = VectorLoad(inBlocks);
|
||||
uint32x4_p block = VectorLoadBE(inBlocks);
|
||||
|
||||
if (xorInput)
|
||||
block = VectorXor(block, VectorLoad(xorBlocks));
|
||||
block = VectorXor(block, VectorLoadBE(xorBlocks));
|
||||
|
||||
if (flags & BT_InBlockIsCounter)
|
||||
const_cast<byte *>(inBlocks)[15]++;
|
||||
@@ -1938,9 +1938,9 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
|
||||
func1(block, subKeys, rounds);
|
||||
|
||||
if (xorOutput)
|
||||
block = VectorXor(block, VectorLoad(xorBlocks));
|
||||
block = VectorXor(block, VectorLoadBE(xorBlocks));
|
||||
|
||||
VectorStore(block, outBlocks);
|
||||
VectorStoreBE(block, outBlocks);
|
||||
|
||||
inBlocks = PtrAdd(inBlocks, inIncrement);
|
||||
outBlocks = PtrAdd(outBlocks, outIncrement);
|
||||
|
137
ppc-simd.h
137
ppc-simd.h
@@ -32,11 +32,11 @@
|
||||
# undef CRYPTOPP_POWER7_AVAILABLE
|
||||
#endif
|
||||
|
||||
#if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(_CRYPTO))
|
||||
#if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(__CRYPTO) || defined(__CRYPTO__))
|
||||
# undef CRYPTOPP_POWER8_AVAILABLE
|
||||
# undef CRYPTOPP_POWER8_AES_AVAILABLE
|
||||
# undef CRYPTOPP_POWER8_SHA_AVAILABLE
|
||||
# undef CRYPTOPP_POWER8_PMULL_AVAILABLE
|
||||
# undef CRYPTOPP_POWER8_SHA_AVAILABLE
|
||||
#endif
|
||||
|
||||
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
|
||||
@@ -65,7 +65,7 @@ typedef __vector unsigned long long uint64x2_p;
|
||||
/// \tparam T vector type
|
||||
/// \param src the vector
|
||||
/// \details Reverse() endian swaps the bytes in a vector
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad()
|
||||
/// \since Crypto++ 6.0
|
||||
template <class T>
|
||||
inline T Reverse(const T& src)
|
||||
@@ -151,33 +151,45 @@ inline T1 VectorShiftLeft(const T1& vec1, const T2& vec2)
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief Shift two vectors right
|
||||
/// \tparam C shift byte count
|
||||
/// \tparam T1 vector type
|
||||
/// \tparam T2 vector type
|
||||
/// \param vec1 the first vector
|
||||
/// \param vec2 the second vector
|
||||
/// \details VectorShiftRight() concatenates vec1 and vec2 and returns a
|
||||
/// new vector after shifting the concatenation by the specified number
|
||||
/// of bytes. Both vec1 and vec2 are cast to uint8x16_p. The return
|
||||
/// vector is the same type as vec1.
|
||||
/// \details On big endian machines VectorShiftRight() is <tt>vec_sld(a, b,
|
||||
/// c)</tt>. On little endian machines VectorShiftRight() is translated to
|
||||
/// <tt>vec_sld(b, a, 16-c)</tt>. You should always call the function as
|
||||
/// if on a big endian machine as shown below.
|
||||
/// <pre>
|
||||
/// uint8x16_p r0 = {0};
|
||||
/// uint8x16_p r1 = VectorLoad(ptr);
|
||||
/// uint8x16_p r5 = VectorShiftRight<12>(r0, r1);
|
||||
/// </pre>
|
||||
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
||||
/// endian sensitive?</A> on Stack Overflow
|
||||
/// \since Crypto++ 6.0
|
||||
template <unsigned int C, class T1, class T2>
|
||||
inline T1 VectorShiftRight(const T1& vec1, const T2& vec2)
|
||||
{
|
||||
return VectorShiftLeft<16-C>(vec1, vec2);
|
||||
}
|
||||
|
||||
#endif // POWER4 and above
|
||||
|
||||
// POWER7/POWER4 load and store
|
||||
#if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
|
||||
|
||||
/// \brief Reverse a 16-byte array
|
||||
/// \param src the byte array
|
||||
/// \details ReverseByteArrayLE reverses a 16-byte array on a little endian
|
||||
/// system. It does nothing on a big endian system.
|
||||
/// \since Crypto++ 6.0
|
||||
inline void ReverseByteArrayLE(byte src[16])
|
||||
{
|
||||
#if defined(CRYPTOPP_XLC_VERSION) && defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||
vec_st(vec_reve(vec_ld(0, src)), 0, src);
|
||||
#elif defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||
const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
|
||||
const uint8x16_p zero = {0};
|
||||
vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, src);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief Loads a vector from a byte array
|
||||
/// \param src the byte array
|
||||
/// \details Loads a vector in big endian format from a byte array.
|
||||
/// VectorLoadBE will swap endianess on little endian systems.
|
||||
/// \note VectorLoadBE() does not require an aligned array.
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad()
|
||||
/// \since Crypto++ 6.0
|
||||
inline uint32x4_p VectorLoadBE(const uint8_t src[16])
|
||||
{
|
||||
@@ -198,7 +210,7 @@ inline uint32x4_p VectorLoadBE(const uint8_t src[16])
|
||||
/// \details Loads a vector in big endian format from a byte array.
|
||||
/// VectorLoadBE will swap endianess on little endian systems.
|
||||
/// \note VectorLoadBE does not require an aligned array.
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad()
|
||||
/// \since Crypto++ 6.0
|
||||
inline uint32x4_p VectorLoadBE(int off, const uint8_t src[16])
|
||||
{
|
||||
@@ -215,70 +227,27 @@ inline uint32x4_p VectorLoadBE(int off, const uint8_t src[16])
|
||||
|
||||
/// \brief Loads a vector from a byte array
|
||||
/// \param src the byte array
|
||||
/// \details Loads a vector in big endian format from a byte array.
|
||||
/// VectorLoad will swap endianess on little endian systems.
|
||||
/// \details Loads a vector in native endian format from a byte array.
|
||||
/// \note VectorLoad does not require an aligned array.
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad()
|
||||
/// \since Crypto++ 6.0
|
||||
inline uint32x4_p VectorLoad(const byte src[16])
|
||||
{
|
||||
return (uint32x4_p)VectorLoadBE(src);
|
||||
#if defined(CRYPTOPP_XLC_VERSION)
|
||||
return (uint32x4_p)vec_xl(0, (byte*)src);
|
||||
#else
|
||||
return (uint32x4_p)vec_vsx_ld(0, src);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief Loads a vector from a byte array
|
||||
/// \param src the byte array
|
||||
/// \param off offset into the src byte array
|
||||
/// \details Loads a vector in big endian format from a byte array.
|
||||
/// VectorLoad will swap endianess on little endian systems.
|
||||
/// \details Loads a vector in native endian format from a byte array.
|
||||
/// \note VectorLoad does not require an aligned array.
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad()
|
||||
/// \since Crypto++ 6.0
|
||||
inline uint32x4_p VectorLoad(int off, const byte src[16])
|
||||
{
|
||||
return (uint32x4_p)VectorLoadBE(off, src);
|
||||
}
|
||||
|
||||
/// \brief Loads a vector from a byte array
|
||||
/// \param src the byte array
|
||||
/// \details Loads a vector from a byte array.
|
||||
/// VectorLoadKey does not swap endianess on little endian systems.
|
||||
/// \note VectorLoadKey does not require an aligned array.
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
|
||||
/// \since Crypto++ 6.0
|
||||
inline uint32x4_p VectorLoadKey(const byte src[16])
|
||||
{
|
||||
#if defined(CRYPTOPP_XLC_VERSION)
|
||||
return (uint32x4_p)vec_xl(0, (byte*)src);
|
||||
#else
|
||||
return (uint32x4_p)vec_vsx_ld(0, src);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief Loads a vector from a 32-bit word array
|
||||
/// \param src the 32-bit word array
|
||||
/// \details Loads a vector from a 32-bit word array.
|
||||
/// VectorLoadKey does not swap endianess on little endian systems.
|
||||
/// \note VectorLoadKey does not require an aligned array.
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
|
||||
/// \since Crypto++ 6.0
|
||||
inline uint32x4_p VectorLoadKey(const word32 src[4])
|
||||
{
|
||||
#if defined(CRYPTOPP_XLC_VERSION)
|
||||
return (uint32x4_p)vec_xl(0, (byte*)src);
|
||||
#else
|
||||
return (uint32x4_p)vec_vsx_ld(0, src);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief Loads a vector from a byte array
|
||||
/// \param src the byte array
|
||||
/// \param off offset into the src byte array
|
||||
/// \details Loads a vector from a byte array.
|
||||
/// VectorLoadKey does not swap endianess on little endian systems.
|
||||
/// \note VectorLoadKey does not require an aligned array.
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
|
||||
/// \since Crypto++ 6.0
|
||||
inline uint32x4_p VectorLoadKey(int off, const byte src[16])
|
||||
{
|
||||
#if defined(CRYPTOPP_XLC_VERSION)
|
||||
return (uint32x4_p)vec_xl(off, (byte*)src);
|
||||
@@ -294,7 +263,7 @@ inline uint32x4_p VectorLoadKey(int off, const byte src[16])
|
||||
/// \details Stores a vector in big endian format to a byte array.
|
||||
/// VectorStoreBE will swap endianess on little endian systems.
|
||||
/// \note VectorStoreBE does not require an aligned array.
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad()
|
||||
/// \since Crypto++ 6.0
|
||||
template <class T>
|
||||
inline void VectorStoreBE(const T& src, uint8_t dest[16])
|
||||
@@ -318,7 +287,7 @@ inline void VectorStoreBE(const T& src, uint8_t dest[16])
|
||||
/// \details Stores a vector in big endian format to a byte array.
|
||||
/// VectorStoreBE will swap endianess on little endian systems.
|
||||
/// \note VectorStoreBE does not require an aligned array.
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
|
||||
/// \sa Reverse(), VectorLoadBE(), VectorLoad()
|
||||
/// \since Crypto++ 6.0
|
||||
template <class T>
|
||||
inline void VectorStoreBE(const T& src, int off, uint8_t dest[16])
|
||||
@@ -338,8 +307,7 @@ inline void VectorStoreBE(const T& src, int off, uint8_t dest[16])
|
||||
/// \tparam T vector type
|
||||
/// \param src the vector
|
||||
/// \param dest the byte array
|
||||
/// \details Stores a vector in big endian format to a byte array.
|
||||
/// VectorStore will swap endianess on little endian systems.
|
||||
/// \details Stores a vector in native endian format to a byte array.
|
||||
/// \note VectorStore does not require an aligned array.
|
||||
/// \since Crypto++ 6.0
|
||||
template<class T>
|
||||
@@ -347,13 +315,9 @@ inline void VectorStore(const T& src, byte dest[16])
|
||||
{
|
||||
// Do not call VectorStoreBE. It slows us down by about 0.5 cpb on LE.
|
||||
#if defined(CRYPTOPP_XLC_VERSION)
|
||||
vec_xst_be((uint8x16_p)src, 0, dest);
|
||||
vec_xst((uint8x16_p)src, 0, dest);
|
||||
#else
|
||||
# if defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||
vec_vsx_st(Reverse((uint8x16_p)src), 0, dest);
|
||||
# else
|
||||
vec_vsx_st((uint8x16_p)src, 0, dest);
|
||||
# endif
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -362,8 +326,7 @@ inline void VectorStore(const T& src, byte dest[16])
|
||||
/// \param src the vector
|
||||
/// \param off offset into the dest byte array
|
||||
/// \param dest the byte array
|
||||
/// \details Stores a vector in big endian format to a byte array.
|
||||
/// VectorStore will swap endianess on little endian systems.
|
||||
/// \details Stores a vector in native endian format to a byte array.
|
||||
/// \note VectorStore does not require an aligned array.
|
||||
/// \since Crypto++ 6.0
|
||||
template<class T>
|
||||
@@ -371,13 +334,9 @@ inline void VectorStore(const T& src, int off, byte dest[16])
|
||||
{
|
||||
// Do not call VectorStoreBE. It slows us down by about 0.5 cpb on LE.
|
||||
#if defined(CRYPTOPP_XLC_VERSION)
|
||||
vec_xst_be((uint8x16_p)src, off, dest);
|
||||
vec_xst((uint8x16_p)src, off, dest);
|
||||
#else
|
||||
# if defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||
vec_vsx_st(Reverse((uint8x16_p)src), off, dest);
|
||||
# else
|
||||
vec_vsx_st((uint8x16_p)src, off, dest);
|
||||
# endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@@ -697,17 +697,17 @@ static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, un
|
||||
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
||||
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
||||
|
||||
uint32x4_p k = VectorLoadKey(keys);
|
||||
uint32x4_p k = VectorLoad(keys);
|
||||
block = VectorXor(block, k);
|
||||
|
||||
for (size_t i=1; i<rounds-1; i+=2)
|
||||
{
|
||||
block = VectorEncrypt(block, VectorLoadKey( i*16, keys));
|
||||
block = VectorEncrypt(block, VectorLoadKey((i+1)*16, keys));
|
||||
block = VectorEncrypt(block, VectorLoad( i*16, keys));
|
||||
block = VectorEncrypt(block, VectorLoad((i+1)*16, keys));
|
||||
}
|
||||
|
||||
block = VectorEncrypt(block, VectorLoadKey((rounds-1)*16, keys));
|
||||
block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
|
||||
block = VectorEncrypt(block, VectorLoad((rounds-1)*16, keys));
|
||||
block = VectorEncryptLast(block, VectorLoad(rounds*16, keys));
|
||||
}
|
||||
|
||||
static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
@@ -717,7 +717,7 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
||||
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
||||
|
||||
uint32x4_p k = VectorLoadKey(keys);
|
||||
uint32x4_p k = VectorLoad(keys);
|
||||
block0 = VectorXor(block0, k);
|
||||
block1 = VectorXor(block1, k);
|
||||
block2 = VectorXor(block2, k);
|
||||
@@ -727,7 +727,7 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
|
||||
for (size_t i=1; i<rounds; ++i)
|
||||
{
|
||||
k = VectorLoadKey(i*16, keys);
|
||||
k = VectorLoad(i*16, keys);
|
||||
block0 = VectorEncrypt(block0, k);
|
||||
block1 = VectorEncrypt(block1, k);
|
||||
block2 = VectorEncrypt(block2, k);
|
||||
@@ -736,7 +736,7 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
block5 = VectorEncrypt(block5, k);
|
||||
}
|
||||
|
||||
k = VectorLoadKey(rounds*16, keys);
|
||||
k = VectorLoad(rounds*16, keys);
|
||||
block0 = VectorEncryptLast(block0, k);
|
||||
block1 = VectorEncryptLast(block1, k);
|
||||
block2 = VectorEncryptLast(block2, k);
|
||||
@@ -750,17 +750,17 @@ static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, un
|
||||
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
||||
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
||||
|
||||
uint32x4_p k = VectorLoadKey(rounds*16, keys);
|
||||
uint32x4_p k = VectorLoad(rounds*16, keys);
|
||||
block = VectorXor(block, k);
|
||||
|
||||
for (size_t i=rounds-1; i>1; i-=2)
|
||||
{
|
||||
block = VectorDecrypt(block, VectorLoadKey( i*16, keys));
|
||||
block = VectorDecrypt(block, VectorLoadKey((i-1)*16, keys));
|
||||
block = VectorDecrypt(block, VectorLoad( i*16, keys));
|
||||
block = VectorDecrypt(block, VectorLoad((i-1)*16, keys));
|
||||
}
|
||||
|
||||
block = VectorDecrypt(block, VectorLoadKey(16, keys));
|
||||
block = VectorDecryptLast(block, VectorLoadKey(0, keys));
|
||||
block = VectorDecrypt(block, VectorLoad(16, keys));
|
||||
block = VectorDecryptLast(block, VectorLoad(0, keys));
|
||||
}
|
||||
|
||||
static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
@@ -770,7 +770,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
||||
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
||||
|
||||
uint32x4_p k = VectorLoadKey(rounds*16, keys);
|
||||
uint32x4_p k = VectorLoad(rounds*16, keys);
|
||||
block0 = VectorXor(block0, k);
|
||||
block1 = VectorXor(block1, k);
|
||||
block2 = VectorXor(block2, k);
|
||||
@@ -780,7 +780,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
|
||||
for (size_t i=rounds-1; i>0; --i)
|
||||
{
|
||||
k = VectorLoadKey(i*16, keys);
|
||||
k = VectorLoad(i*16, keys);
|
||||
block0 = VectorDecrypt(block0, k);
|
||||
block1 = VectorDecrypt(block1, k);
|
||||
block2 = VectorDecrypt(block2, k);
|
||||
@@ -789,7 +789,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
block5 = VectorDecrypt(block5, k);
|
||||
}
|
||||
|
||||
k = VectorLoadKey(0, keys);
|
||||
k = VectorLoad(0, keys);
|
||||
block0 = VectorDecryptLast(block0, k);
|
||||
block1 = VectorDecryptLast(block1, k);
|
||||
block2 = VectorDecryptLast(block2, k);
|
||||
@@ -804,60 +804,62 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32*
|
||||
{
|
||||
const size_t rounds = keyLen / 4 + 6;
|
||||
const word32 *rc = s_rconBE;
|
||||
word32 *rkey = rk, temp;
|
||||
|
||||
GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
|
||||
word32 *rk_saved = rk, temp; // unused in big-endian
|
||||
CRYPTOPP_UNUSED(rk_saved);
|
||||
GetUserKey(BIG_ENDIAN_ORDER, rkey, keyLen/4, userKey, keyLen);
|
||||
|
||||
// keySize: m_key allocates 4*(rounds+1) word32's.
|
||||
const size_t keySize = 4*(rounds+1);
|
||||
const word32* end = rk + keySize;
|
||||
const word32* end = rkey + keySize;
|
||||
|
||||
while (true)
|
||||
{
|
||||
temp = rk[keyLen/4-1];
|
||||
temp = rkey[keyLen/4-1];
|
||||
word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
|
||||
(word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
|
||||
rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
|
||||
rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
|
||||
rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
|
||||
rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
|
||||
rkey[keyLen/4] = rkey[0] ^ x ^ *(rc++);
|
||||
rkey[keyLen/4+1] = rkey[1] ^ rkey[keyLen/4];
|
||||
rkey[keyLen/4+2] = rkey[2] ^ rkey[keyLen/4+1];
|
||||
rkey[keyLen/4+3] = rkey[3] ^ rkey[keyLen/4+2];
|
||||
|
||||
if (rk + keyLen/4 + 4 == end)
|
||||
if (rkey + keyLen/4 + 4 == end)
|
||||
break;
|
||||
|
||||
if (keyLen == 24)
|
||||
{
|
||||
rk[10] = rk[ 4] ^ rk[ 9];
|
||||
rk[11] = rk[ 5] ^ rk[10];
|
||||
rkey[10] = rkey[ 4] ^ rkey[ 9];
|
||||
rkey[11] = rkey[ 5] ^ rkey[10];
|
||||
}
|
||||
else if (keyLen == 32)
|
||||
{
|
||||
temp = rk[11];
|
||||
rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
|
||||
rk[13] = rk[ 5] ^ rk[12];
|
||||
rk[14] = rk[ 6] ^ rk[13];
|
||||
rk[15] = rk[ 7] ^ rk[14];
|
||||
temp = rkey[11];
|
||||
rkey[12] = rkey[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
|
||||
rkey[13] = rkey[ 5] ^ rkey[12];
|
||||
rkey[14] = rkey[ 6] ^ rkey[13];
|
||||
rkey[15] = rkey[ 7] ^ rkey[14];
|
||||
}
|
||||
rk += keyLen/4;
|
||||
rkey += keyLen/4;
|
||||
}
|
||||
|
||||
#if defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||
rk = rk_saved;
|
||||
rkey = rk;
|
||||
const uint8x16_p mask = ((uint8x16_p){12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3});
|
||||
const uint8x16_p zero = {0};
|
||||
|
||||
unsigned int i=0;
|
||||
for (i=0; i<rounds; i+=2, rk+=8)
|
||||
for (i=0; i<rounds; i+=2, rkey+=8)
|
||||
{
|
||||
const uint8x16_p d1 = vec_vsx_ld( 0, (uint8_t*)rk);
|
||||
const uint8x16_p d2 = vec_vsx_ld(16, (uint8_t*)rk);
|
||||
vec_vsx_st(vec_perm(d1, zero, mask), 0, (uint8_t*)rk);
|
||||
vec_vsx_st(vec_perm(d2, zero, mask), 16, (uint8_t*)rk);
|
||||
const uint8x16_p d1 = vec_vsx_ld( 0, (uint8_t*)rkey);
|
||||
const uint8x16_p d2 = vec_vsx_ld(16, (uint8_t*)rkey);
|
||||
vec_vsx_st(vec_perm(d1, zero, mask), 0, (uint8_t*)rkey);
|
||||
vec_vsx_st(vec_perm(d2, zero, mask), 16, (uint8_t*)rkey);
|
||||
}
|
||||
|
||||
for ( ; i<rounds+1; i++, rk+=4)
|
||||
vec_vsx_st(vec_perm(vec_vsx_ld(0, (uint8_t*)rk), zero, mask), 0, (uint8_t*)rk);
|
||||
for ( ; i<rounds+1; i++, rkey+=4)
|
||||
{
|
||||
const uint8x16_p d = vec_vsx_ld( 0, (uint8_t*)rkey);
|
||||
vec_vsx_st(vec_perm(d, zero, mask), 0, (uint8_t*)rkey);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user