mirror of https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-23 09:59:42 +00:00

Add 64-bit overload for VecLoadAligned

This commit is contained in:
parent 5017d9c91c
commit 57ba87bdc9
@@ -223,6 +223,7 @@ inline __m128i RotateLeft<16>(const __m128i val)
 using CryptoPP::uint8x16_p;
 using CryptoPP::uint32x4_p;
 using CryptoPP::VecLoad;
+using CryptoPP::VecLoadAligned;
 using CryptoPP::VecStore;
 using CryptoPP::VecPermute;
@@ -834,10 +835,10 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
 // time to better support distros.
 inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input, byte *output, unsigned int rounds)
 {
-    const uint32x4_p state0 = VecLoad(state + 0*4);
-    const uint32x4_p state1 = VecLoad(state + 1*4);
-    const uint32x4_p state2 = VecLoad(state + 2*4);
-    const uint32x4_p state3 = VecLoad(state + 3*4);
+    const uint32x4_p state0 = VecLoadAligned(state + 0*4);
+    const uint32x4_p state1 = VecLoadAligned(state + 1*4);
+    const uint32x4_p state2 = VecLoadAligned(state + 2*4);
+    const uint32x4_p state3 = VecLoadAligned(state + 3*4);
 
     const uint32x4_p CTRS[3] = {
         {1,0,0,0}, {2,0,0,0}, {3,0,0,0}
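For context: ChaCha_OperateKeystream_CORE can switch to the aligned load because Crypto++ keeps the ChaCha state in 16-byte aligned storage. Below is a minimal sketch of the pattern, not taken from the commit, assuming a POWER8 build (-mcpu=power8 -maltivec); alignas(16) stands in for the library's aligned SecBlock storage, and LoadRow is a hypothetical helper:

    #include "ppc_simd.h"
    using CryptoPP::word32;
    using CryptoPP::uint32x4_p;
    using CryptoPP::VecLoadAligned;

    // 16 words = 4 rows of the ChaCha state.
    alignas(16) static const word32 state[16] = {0};

    uint32x4_p LoadRow(unsigned int row)
    {
        // Each row is 4 words = 16 bytes, so stepping by whole rows
        // preserves the 16-byte alignment the aligned load asserts.
        return VecLoadAligned(state + row*4);
    }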
ppc_simd.h (72 lines changed)
@@ -413,7 +413,7 @@ inline uint32x4_p VecLoad(int off, const word32 src[4])
 /// \brief Loads a vector from a double word array
 /// \param src the double word array
 /// \details VecLoad() loads a vector from a double word array.
-/// \details VecLoad() uses POWER8's and VSX's <tt>vec_xl</tt> if available.
+/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
 /// The instruction does not require aligned effective memory addresses.
 /// VecLoad_ALTIVEC() is used if POWER8 or VSX are not available.
 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
@@ -452,7 +452,7 @@ inline uint64x2_p VecLoad(const word64 src[2])
 /// \param src the double word array
 /// \param off offset into the double word array
 /// \details VecLoad() loads a vector from a double word array.
-/// \details VecLoad() uses POWER8's and VSX's <tt>vec_xl</tt> if available.
+/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
 /// The instruction does not require aligned effective memory addresses.
 /// VecLoad_ALTIVEC() is used if POWER8 or VSX are not available.
 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
@@ -605,6 +605,74 @@ inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
 #endif
 }
 
+#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
+
+/// \brief Loads a vector from an aligned double word array
+/// \param src the double word array
+/// \details VecLoadAligned() loads a vector from an aligned double word array.
+/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
+/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
+/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
+/// \par Wraps
+/// vec_ld, vec_xl
+/// \since Crypto++ 8.0
+inline uint64x2_p VecLoadAligned(const word64 src[2])
+{
+    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
+    // word pointers. The ISA lacks loads for short* and char*.
+    // Power9/ISA 3.0 provides vec_xl for all datatypes.
+
+    // GCC and XLC use integer math for the effective address.
+    // LLVM uses pointer math for the effective address.
+    const uintptr_t eff = reinterpret_cast<uintptr_t>(src);
+    CRYPTOPP_ASSERT(eff % 16 == 0);
+
+#if defined(_ARCH_PWR9)
+    return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
+#elif (defined(_ARCH_PWR7) && defined(__VSX__)) || defined(_ARCH_PWR8)
+    // The 32-bit cast is not a typo. Compiler workaround.
+    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
+#else
+    return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
+#endif
+}
+
+/// \brief Loads a vector from an aligned double word array
+/// \param src the double word array
+/// \param off offset into the double word array
+/// \details VecLoadAligned() loads a vector from an aligned double word array.
+/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
+/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
+/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
+/// \par Wraps
+/// vec_ld, vec_xl
+/// \since Crypto++ 8.0
+inline uint64x2_p VecLoadAligned(int off, const word64 src[2])
+{
+    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
+    // word pointers. The ISA lacks loads for short* and char*.
+    // Power9/ISA 3.0 provides vec_xl for all datatypes.
+
+    // GCC and XLC use integer math for the effective address.
+    // LLVM uses pointer math for the effective address.
+    const uintptr_t eff = reinterpret_cast<uintptr_t>(src)+off;
+    CRYPTOPP_ASSERT(eff % 16 == 0);
+
+#if defined(_ARCH_PWR9)
+    return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
+#elif (defined(_ARCH_PWR7) && defined(__VSX__)) || defined(_ARCH_PWR8)
+# if defined(__clang__)
+    // The 32-bit cast is not a typo. Compiler workaround.
+    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(eff));
+# else
+    return (uint64x2_p)vec_xl(off, CONST_V32_CAST(src));
+# endif
+#else
+    return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
+#endif
+}
+
+#endif
+
 /// \brief Loads a vector from a byte array
 /// \param src the byte array
 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
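A hedged usage sketch of the two new overloads, not part of the commit; `table` is a hypothetical 16-byte aligned key table. Both forms assert that the effective address is 16-byte aligned, and `off` is a byte offset, so it must be a multiple of 16 here:

    #include "ppc_simd.h"
    using CryptoPP::word64;
    using CryptoPP::uint64x2_p;
    using CryptoPP::VecLoadAligned;

    alignas(16) static const word64 table[4] = {1, 2, 3, 4};

    void LoadBoth(uint64x2_p &lo, uint64x2_p &hi)
    {
        lo = VecLoadAligned(table);      // table[0..1]
        hi = VecLoadAligned(16, table);  // byte offset 16, i.e. table[2..3]
    }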
@@ -537,6 +537,7 @@ using CryptoPP::uint64x2_p;
 using CryptoPP::VecAnd;
 using CryptoPP::VecXor;
 using CryptoPP::VecLoad;
+using CryptoPP::VecLoadAligned;
 using CryptoPP::VecPermute;
 
 // Rotate left by bit count
@@ -578,8 +579,8 @@ inline void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigne
     for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
     {
         // Round keys are pre-splated in forward direction
-        const uint64x2_p rk1 = VecLoad(subkeys+i*2);
-        const uint64x2_p rk2 = VecLoad(subkeys+i*2+2);
+        const uint64x2_p rk1 = VecLoadAligned(subkeys+i*2);
+        const uint64x2_p rk2 = VecLoadAligned(subkeys+i*2+2);
 
         y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
         x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
@@ -588,7 +589,7 @@ inline void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigne
     if (rounds & 1)
     {
         // Round keys are pre-splated in forward direction
-        const uint64x2_p rk = VecLoad(subkeys+rounds*2-2);
+        const uint64x2_p rk = VecLoadAligned(subkeys+rounds*2-2);
         y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
         std::swap(x1, y1);
     }
@@ -671,8 +672,8 @@ inline void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
     {
         // Round keys are pre-splated in forward direction
-        const uint64x2_p rk1 = VecLoad(subkeys+i*2);
-        const uint64x2_p rk2 = VecLoad(subkeys+i*2+2);
+        const uint64x2_p rk1 = VecLoadAligned(subkeys+i*2);
+        const uint64x2_p rk2 = VecLoadAligned(subkeys+i*2+2);
 
         y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
         y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk1);
@@ -686,7 +687,7 @@ inline void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     if (rounds & 1)
     {
         // Round keys are pre-splated in forward direction
-        const uint64x2_p rk = VecLoad(subkeys+rounds*2-2);
+        const uint64x2_p rk = VecLoadAligned(subkeys+rounds*2-2);
 
         y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
         y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk);
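"Pre-splated" in the hunks above means the key schedule stores each 64-bit round key duplicated into both lanes of a 16-byte aligned slot, which is what makes the straight VecLoadAligned(subkeys+i*2) loads valid; the same layout backs the SPECK128 loops below. A rough sketch of that layout (SplatKeys is a hypothetical helper, not the library's key-schedule code):

    #include "ppc_simd.h"
    using CryptoPP::word64;

    // Duplicate each 64-bit round key into both halves of a 16-byte slot
    // so the encryption loop can issue one aligned vector load per round.
    void SplatKeys(const word64 *rk, word64 *splat /* 16-byte aligned */,
                   unsigned int rounds)
    {
        for (unsigned int i = 0; i < rounds; ++i) {
            splat[i*2+0] = rk[i];
            splat[i*2+1] = rk[i];
        }
    }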
@@ -471,6 +471,7 @@ using CryptoPP::VecAdd;
 using CryptoPP::VecSub;
 using CryptoPP::VecXor;
 using CryptoPP::VecLoad;
+using CryptoPP::VecLoadAligned;
 using CryptoPP::VecPermute;
 
 // Rotate left by bit count
@@ -506,7 +507,7 @@ void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
         // Round keys are pre-splated in forward direction
-        const uint64x2_p rk = VecLoad(subkeys+i*2);
+        const uint64x2_p rk = VecLoadAligned(subkeys+i*2);
 
         x1 = RotateRight64<8>(x1);
         x1 = VecAdd(x1, y1);
@@ -588,7 +589,7 @@ void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     for (int i=0; i < static_cast<int>(rounds); ++i)
    {
         // Round keys are pre-splated in forward direction
-        const uint64x2_p rk = VecLoad(subkeys+i*2);
+        const uint64x2_p rk = VecLoadAligned(subkeys+i*2);
 
         x1 = RotateRight64<8>(x1);
         x2 = RotateRight64<8>(x2);