mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-26 19:30:21 +00:00
Re-add Simon and Speck, enable NEON and Aarch64 (GH #585)
This commit re-adds Simon and Speck. The commit includes NEON, Aarch32 and Aarch64 support.
This commit is contained in:
parent
5da795bf56
commit
e5a362c026
282
simon-simd.cpp
282
simon-simd.cpp
@ -64,6 +64,24 @@ using CryptoPP::vec_swap; // SunCC
|
||||
|
||||
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
|
||||
template <class T>
|
||||
inline T UnpackHigh32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_high_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_high_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline T UnpackLow32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_low_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_low_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
|
||||
{
|
||||
@ -114,16 +132,6 @@ inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
|
||||
}
|
||||
#endif
|
||||
|
||||
// Reverses the byte order within each 32-bit lane when
// CRYPTOPP_LITTLE_ENDIAN is defined; otherwise returns val unchanged.
inline uint32x4_t Shuffle32(const uint32x4_t& val)
{
#if !defined(CRYPTOPP_LITTLE_ENDIAN)
    return val;
#else
    // View the vector as 16 bytes, reverse each group of four bytes,
    // then view the result as four 32-bit words again.
    const uint8x16_t bytes = vreinterpretq_u8_u32(val);
    return vreinterpretq_u32_u8(vrev32q_u8(bytes));
#endif
}
|
||||
|
||||
inline uint32x4_t SIMON64_f(const uint32x4_t& val)
|
||||
{
|
||||
return veorq_u32(RotateLeft32<2>(val),
|
||||
@ -133,15 +141,13 @@ inline uint32x4_t SIMON64_f(const uint32x4_t& val)
|
||||
inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. If only a single block is available then
|
||||
// a Zero block is provided to promote vectorizations.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
@ -160,25 +166,21 @@ inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
|
||||
std::swap(x1, y1);
|
||||
}
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = vzipq_u32(x1, y1).val[0];
|
||||
block1 = vzipq_u32(x1, y1).val[1];
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. If only a single block is available then
|
||||
// a Zero block is provided to promote vectorizations.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
@ -198,32 +200,26 @@ inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
|
||||
}
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = vzipq_u32(x1, y1).val[0];
|
||||
block1 = vzipq_u32(x1, y1).val[1];
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. If only a single block is available then
|
||||
// a Zero block is provided to promote vectorizations.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
x2 = Shuffle32(x2); y2 = Shuffle32(y2);
|
||||
x3 = Shuffle32(x3); y3 = Shuffle32(y3);
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
|
||||
{
|
||||
@ -248,38 +244,30 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
}
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
x2 = Shuffle32(x2); y2 = Shuffle32(y2);
|
||||
x3 = Shuffle32(x3); y3 = Shuffle32(y3);
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = vzipq_u32(x1, y1).val[0];
|
||||
block1 = vzipq_u32(x1, y1).val[1];
|
||||
block2 = vzipq_u32(x2, y2).val[0];
|
||||
block3 = vzipq_u32(x2, y2).val[1];
|
||||
block4 = vzipq_u32(x3, y3).val[0];
|
||||
block5 = vzipq_u32(x3, y3).val[1];
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
block2 = UnpackLow32(y2, x2);
|
||||
block3 = UnpackHigh32(y2, x2);
|
||||
block4 = UnpackLow32(y3, x3);
|
||||
block5 = UnpackHigh32(y3, x3);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. If only a single block is available then
|
||||
// a Zero block is provided to promote vectorizations.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
x2 = Shuffle32(x2); y2 = Shuffle32(y2);
|
||||
x3 = Shuffle32(x3); y3 = Shuffle32(y3);
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
@ -305,17 +293,13 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
|
||||
}
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
x2 = Shuffle32(x2); y2 = Shuffle32(y2);
|
||||
x3 = Shuffle32(x3); y3 = Shuffle32(y3);
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = vzipq_u32(x1, y1).val[0];
|
||||
block1 = vzipq_u32(x1, y1).val[1];
|
||||
block2 = vzipq_u32(x2, y2).val[0];
|
||||
block3 = vzipq_u32(x2, y2).val[1];
|
||||
block4 = vzipq_u32(x3, y3).val[0];
|
||||
block5 = vzipq_u32(x3, y3).val[1];
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
block2 = UnpackLow32(y2, x2);
|
||||
block3 = UnpackHigh32(y2, x2);
|
||||
block4 = UnpackLow32(y3, x3);
|
||||
block5 = UnpackHigh32(y3, x3);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
@ -388,16 +372,6 @@ inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
|
||||
}
|
||||
#endif
|
||||
|
||||
// Reverses the byte order within each 64-bit lane when
// CRYPTOPP_LITTLE_ENDIAN is defined; otherwise returns val unchanged.
inline uint64x2_t Shuffle64(const uint64x2_t& val)
{
#if !defined(CRYPTOPP_LITTLE_ENDIAN)
    return val;
#else
    // View the vector as 16 bytes, reverse each group of eight bytes,
    // then view the result as two 64-bit words again.
    const uint8x16_t bytes = vreinterpretq_u8_u64(val);
    return vreinterpretq_u64_u8(vrev64q_u8(bytes));
#endif
}
|
||||
|
||||
inline uint64x2_t SIMON128_f(const uint64x2_t& val)
|
||||
{
|
||||
return veorq_u64(RotateLeft64<2>(val),
|
||||
@ -407,14 +381,12 @@ inline uint64x2_t SIMON128_f(const uint64x2_t& val)
|
||||
inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint64x2_t x1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t y1 = UnpackHigh64(block0, block1);
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
uint64x2_t x1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t y1 = UnpackLow64(block0, block1);
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
@ -433,30 +405,25 @@ inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
|
||||
std::swap(x1, y1);
|
||||
}
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
|
||||
block0 = UnpackLow64(x1, y1);
|
||||
block1 = UnpackHigh64(x1, y1);
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = UnpackLow64(y1, x1);
|
||||
block1 = UnpackHigh64(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
|
||||
uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint64x2_t x1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t y1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t x2 = UnpackLow64(block2, block3);
|
||||
uint64x2_t y2 = UnpackHigh64(block2, block3);
|
||||
uint64x2_t x3 = UnpackLow64(block4, block5);
|
||||
uint64x2_t y3 = UnpackHigh64(block4, block5);
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
x2 = Shuffle64(x2); y2 = Shuffle64(y2);
|
||||
x3 = Shuffle64(x3); y3 = Shuffle64(y3);
|
||||
uint64x2_t x1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t y1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t x2 = UnpackHigh64(block2, block3);
|
||||
uint64x2_t y2 = UnpackLow64(block2, block3);
|
||||
uint64x2_t x3 = UnpackHigh64(block4, block5);
|
||||
uint64x2_t y3 = UnpackLow64(block4, block5);
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
|
||||
{
|
||||
@ -481,29 +448,24 @@ inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
}
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
x2 = Shuffle64(x2); y2 = Shuffle64(y2);
|
||||
x3 = Shuffle64(x3); y3 = Shuffle64(y3);
|
||||
|
||||
block0 = UnpackLow64(x1, y1);
|
||||
block1 = UnpackHigh64(x1, y1);
|
||||
block2 = UnpackLow64(x2, y2);
|
||||
block3 = UnpackHigh64(x2, y2);
|
||||
block4 = UnpackLow64(x3, y3);
|
||||
block5 = UnpackHigh64(x3, y3);
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = UnpackLow64(y1, x1);
|
||||
block1 = UnpackHigh64(y1, x1);
|
||||
block2 = UnpackLow64(y2, x2);
|
||||
block3 = UnpackHigh64(y2, x2);
|
||||
block4 = UnpackLow64(y3, x3);
|
||||
block5 = UnpackHigh64(y3, x3);
|
||||
}
|
||||
|
||||
inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint64x2_t x1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t y1 = UnpackHigh64(block0, block1);
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
uint64x2_t x1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t y1 = UnpackLow64(block0, block1);
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
@ -523,30 +485,25 @@ inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
|
||||
y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
|
||||
}
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
|
||||
block0 = UnpackLow64(x1, y1);
|
||||
block1 = UnpackHigh64(x1, y1);
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = UnpackLow64(y1, x1);
|
||||
block1 = UnpackHigh64(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
|
||||
uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint64x2_t x1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t y1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t x2 = UnpackLow64(block2, block3);
|
||||
uint64x2_t y2 = UnpackHigh64(block2, block3);
|
||||
uint64x2_t x3 = UnpackLow64(block4, block5);
|
||||
uint64x2_t y3 = UnpackHigh64(block4, block5);
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
x2 = Shuffle64(x2); y2 = Shuffle64(y2);
|
||||
x3 = Shuffle64(x3); y3 = Shuffle64(y3);
|
||||
uint64x2_t x1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t y1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t x2 = UnpackHigh64(block2, block3);
|
||||
uint64x2_t y2 = UnpackLow64(block2, block3);
|
||||
uint64x2_t x3 = UnpackHigh64(block4, block5);
|
||||
uint64x2_t y3 = UnpackLow64(block4, block5);
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
@ -572,16 +529,13 @@ inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
|
||||
y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
|
||||
}
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
x2 = Shuffle64(x2); y2 = Shuffle64(y2);
|
||||
x3 = Shuffle64(x3); y3 = Shuffle64(y3);
|
||||
|
||||
block0 = UnpackLow64(x1, y1);
|
||||
block1 = UnpackHigh64(x1, y1);
|
||||
block2 = UnpackLow64(x2, y2);
|
||||
block3 = UnpackHigh64(x2, y2);
|
||||
block4 = UnpackLow64(x3, y3);
|
||||
block5 = UnpackHigh64(x3, y3);
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = UnpackLow64(y1, x1);
|
||||
block1 = UnpackHigh64(y1, x1);
|
||||
block2 = UnpackLow64(y2, x2);
|
||||
block3 = UnpackHigh64(y2, x2);
|
||||
block4 = UnpackLow64(y3, x3);
|
||||
block5 = UnpackHigh64(y3, x3);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
@ -670,8 +624,8 @@ inline __m128i SIMON128_f(const __m128i& v)
|
||||
inline void GCC_NO_UBSAN SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
__m128i x1 = _mm_unpackhi_epi64(block0, block1);
|
||||
@ -706,8 +660,8 @@ inline void GCC_NO_UBSAN SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
__m128i x1 = _mm_unpackhi_epi64(block0, block1);
|
||||
@ -754,8 +708,8 @@ inline void GCC_NO_UBSAN SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
inline void GCC_NO_UBSAN SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
__m128i x1 = _mm_unpackhi_epi64(block0, block1);
|
||||
@ -791,8 +745,8 @@ inline void GCC_NO_UBSAN SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
__m128i x1 = _mm_unpackhi_epi64(block0, block1);
|
||||
@ -881,8 +835,8 @@ inline __m128i SIMON64_f(const __m128i& v)
|
||||
inline void GCC_NO_UBSAN SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. Thanks to Peter Cordes for help with the
|
||||
// SSE permutes below.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
@ -916,8 +870,8 @@ inline void GCC_NO_UBSAN SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
|
||||
inline void GCC_NO_UBSAN SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. Thanks to Peter Cordes for help with the
|
||||
// SSE permutes below.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
@ -953,8 +907,8 @@ inline void GCC_NO_UBSAN SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. Thanks to Peter Cordes for help with the
|
||||
// SSE permutes below.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
@ -1009,8 +963,8 @@ inline void GCC_NO_UBSAN SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. Thanks to Peter Cordes for help with the
|
||||
// SSE permutes below.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
|
4
simon.h
4
simon.h
@ -17,11 +17,11 @@
|
||||
#include "seckey.h"
|
||||
#include "secblock.h"
|
||||
|
||||
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
|
||||
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
|
||||
# define CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS 1
|
||||
#endif
|
||||
|
||||
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
|
||||
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
|
||||
# define CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS 1
|
||||
#endif
|
||||
|
||||
|
278
speck-simd.cpp
278
speck-simd.cpp
@ -61,6 +61,24 @@ using CryptoPP::word64;
|
||||
|
||||
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
|
||||
template <class T>
|
||||
inline T UnpackHigh32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_high_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_high_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline T UnpackLow32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_low_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_low_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
|
||||
{
|
||||
@ -111,27 +129,15 @@ inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
|
||||
}
|
||||
#endif // Aarch32 or Aarch64
|
||||
|
||||
// Reverses the byte order within each 32-bit lane when
// CRYPTOPP_LITTLE_ENDIAN is defined; otherwise returns val unchanged.
inline uint32x4_t Shuffle32(const uint32x4_t& val)
{
#if !defined(CRYPTOPP_LITTLE_ENDIAN)
    return val;
#else
    // View the vector as 16 bytes, reverse each group of four bytes,
    // then view the result as four 32-bit words again.
    const uint8x16_t bytes = vreinterpretq_u8_u32(val);
    return vreinterpretq_u32_u8(vrev32q_u8(bytes));
#endif
}
|
||||
|
||||
inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
|
||||
for (int i=0; i < static_cast<int>(rounds); ++i)
|
||||
{
|
||||
@ -144,24 +150,20 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
|
||||
y1 = veorq_u32(y1, x1);
|
||||
}
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = vzipq_u32(x1, y1).val[0];
|
||||
block1 = vzipq_u32(x1, y1).val[1];
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
|
||||
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
|
||||
{
|
||||
@ -174,32 +176,26 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
|
||||
x1 = RotateLeft32<8>(x1);
|
||||
}
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = vzipq_u32(x1, y1).val[0];
|
||||
block1 = vzipq_u32(x1, y1).val[1];
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. If only a single block is available then
|
||||
// a Zero block is provided to promote vectorizations.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
x2 = Shuffle32(x2); y2 = Shuffle32(y2);
|
||||
x3 = Shuffle32(x3); y3 = Shuffle32(y3);
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
|
||||
|
||||
for (int i=0; i < static_cast<int>(rounds); ++i)
|
||||
{
|
||||
@ -222,38 +218,30 @@ inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
y3 = veorq_u32(y3, x3);
|
||||
}
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
x2 = Shuffle32(x2); y2 = Shuffle32(y2);
|
||||
x3 = Shuffle32(x3); y3 = Shuffle32(y3);
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = vzipq_u32(x1, y1).val[0];
|
||||
block1 = vzipq_u32(x1, y1).val[1];
|
||||
block2 = vzipq_u32(x2, y2).val[0];
|
||||
block3 = vzipq_u32(x2, y2).val[1];
|
||||
block4 = vzipq_u32(x3, y3).val[0];
|
||||
block5 = vzipq_u32(x3, y3).val[1];
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
block2 = UnpackLow32(y2, x2);
|
||||
block3 = UnpackHigh32(y2, x2);
|
||||
block4 = UnpackLow32(y3, x3);
|
||||
block5 = UnpackHigh32(y3, x3);
|
||||
}
|
||||
|
||||
inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. If only a single block is available then
|
||||
// a Zero block is provided to promote vectorizations.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
x2 = Shuffle32(x2); y2 = Shuffle32(y2);
|
||||
x3 = Shuffle32(x3); y3 = Shuffle32(y3);
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
|
||||
|
||||
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
|
||||
{
|
||||
@ -276,17 +264,13 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
x3 = RotateLeft32<8>(x3);
|
||||
}
|
||||
|
||||
x1 = Shuffle32(x1); y1 = Shuffle32(y1);
|
||||
x2 = Shuffle32(x2); y2 = Shuffle32(y2);
|
||||
x3 = Shuffle32(x3); y3 = Shuffle32(y3);
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = vzipq_u32(x1, y1).val[0];
|
||||
block1 = vzipq_u32(x1, y1).val[1];
|
||||
block2 = vzipq_u32(x2, y2).val[0];
|
||||
block3 = vzipq_u32(x2, y2).val[1];
|
||||
block4 = vzipq_u32(x3, y3).val[0];
|
||||
block5 = vzipq_u32(x3, y3).val[1];
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
block2 = UnpackLow32(y2, x2);
|
||||
block3 = UnpackHigh32(y2, x2);
|
||||
block4 = UnpackLow32(y3, x3);
|
||||
block5 = UnpackHigh32(y3, x3);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
@ -359,27 +343,15 @@ inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
|
||||
}
|
||||
#endif
|
||||
|
||||
inline uint64x2_t Shuffle64(const uint64x2_t& val)
|
||||
{
|
||||
#if defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||
return vreinterpretq_u64_u8(
|
||||
vrev64q_u8(vreinterpretq_u8_u64(val)));
|
||||
#else
|
||||
return val;
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint64x2_t x1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t y1 = UnpackHigh64(block0, block1);
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
uint64x2_t x1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t y1 = UnpackLow64(block0, block1);
|
||||
|
||||
for (int i=0; i < static_cast<int>(rounds); ++i)
|
||||
{
|
||||
@ -392,31 +364,25 @@ inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
|
||||
y1 = veorq_u64(y1, x1);
|
||||
}
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = UnpackLow64(x1, y1);
|
||||
block1 = UnpackHigh64(x1, y1);
|
||||
block0 = UnpackLow64(y1, x1);
|
||||
block1 = UnpackHigh64(y1, x1);
|
||||
}
|
||||
|
||||
inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
|
||||
uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint64x2_t x1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t y1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t x2 = UnpackLow64(block2, block3);
|
||||
uint64x2_t y2 = UnpackHigh64(block2, block3);
|
||||
uint64x2_t x3 = UnpackLow64(block4, block5);
|
||||
uint64x2_t y3 = UnpackHigh64(block4, block5);
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
x2 = Shuffle64(x2); y2 = Shuffle64(y2);
|
||||
x3 = Shuffle64(x3); y3 = Shuffle64(y3);
|
||||
uint64x2_t x1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t y1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t x2 = UnpackHigh64(block2, block3);
|
||||
uint64x2_t y2 = UnpackLow64(block2, block3);
|
||||
uint64x2_t x3 = UnpackHigh64(block4, block5);
|
||||
uint64x2_t y3 = UnpackLow64(block4, block5);
|
||||
|
||||
for (int i=0; i < static_cast<int>(rounds); ++i)
|
||||
{
|
||||
@ -439,30 +405,24 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
|
||||
y3 = veorq_u64(y3, x3);
|
||||
}
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
x2 = Shuffle64(x2); y2 = Shuffle64(y2);
|
||||
x3 = Shuffle64(x3); y3 = Shuffle64(y3);
|
||||
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = UnpackLow64(x1, y1);
|
||||
block1 = UnpackHigh64(x1, y1);
|
||||
block2 = UnpackLow64(x2, y2);
|
||||
block3 = UnpackHigh64(x2, y2);
|
||||
block4 = UnpackLow64(x3, y3);
|
||||
block5 = UnpackHigh64(x3, y3);
|
||||
block0 = UnpackLow64(y1, x1);
|
||||
block1 = UnpackHigh64(y1, x1);
|
||||
block2 = UnpackLow64(y2, x2);
|
||||
block3 = UnpackHigh64(y2, x2);
|
||||
block4 = UnpackLow64(y3, x3);
|
||||
block5 = UnpackHigh64(y3, x3);
|
||||
}
|
||||
|
||||
inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint64x2_t x1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t y1 = UnpackHigh64(block0, block1);
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
uint64x2_t x1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t y1 = UnpackLow64(block0, block1);
|
||||
|
||||
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
|
||||
{
|
||||
@ -475,31 +435,25 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
|
||||
x1 = RotateLeft64<8>(x1);
|
||||
}
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = UnpackLow64(x1, y1);
|
||||
block1 = UnpackHigh64(x1, y1);
|
||||
block0 = UnpackLow64(y1, x1);
|
||||
block1 = UnpackHigh64(y1, x1);
|
||||
}
|
||||
|
||||
inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
|
||||
uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint64x2_t x1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t y1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t x2 = UnpackLow64(block2, block3);
|
||||
uint64x2_t y2 = UnpackHigh64(block2, block3);
|
||||
uint64x2_t x3 = UnpackLow64(block4, block5);
|
||||
uint64x2_t y3 = UnpackHigh64(block4, block5);
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
x2 = Shuffle64(x2); y2 = Shuffle64(y2);
|
||||
x3 = Shuffle64(x3); y3 = Shuffle64(y3);
|
||||
uint64x2_t x1 = UnpackHigh64(block0, block1);
|
||||
uint64x2_t y1 = UnpackLow64(block0, block1);
|
||||
uint64x2_t x2 = UnpackHigh64(block2, block3);
|
||||
uint64x2_t y2 = UnpackLow64(block2, block3);
|
||||
uint64x2_t x3 = UnpackHigh64(block4, block5);
|
||||
uint64x2_t y3 = UnpackLow64(block4, block5);
|
||||
|
||||
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
|
||||
{
|
||||
@ -522,17 +476,13 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
|
||||
x3 = RotateLeft64<8>(x3);
|
||||
}
|
||||
|
||||
x1 = Shuffle64(x1); y1 = Shuffle64(y1);
|
||||
x2 = Shuffle64(x2); y2 = Shuffle64(y2);
|
||||
x3 = Shuffle64(x3); y3 = Shuffle64(y3);
|
||||
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = UnpackLow64(x1, y1);
|
||||
block1 = UnpackHigh64(x1, y1);
|
||||
block2 = UnpackLow64(x2, y2);
|
||||
block3 = UnpackHigh64(x2, y2);
|
||||
block4 = UnpackLow64(x3, y3);
|
||||
block5 = UnpackHigh64(x3, y3);
|
||||
block0 = UnpackLow64(y1, x1);
|
||||
block1 = UnpackHigh64(y1, x1);
|
||||
block2 = UnpackLow64(y2, x2);
|
||||
block3 = UnpackHigh64(y2, x2);
|
||||
block4 = UnpackLow64(y3, x3);
|
||||
block5 = UnpackHigh64(y3, x3);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
@ -605,8 +555,8 @@ inline __m128i RotateRight64<8>(const __m128i& val)
|
||||
inline void GCC_NO_UBSAN SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
__m128i x1 = _mm_unpackhi_epi64(block0, block1);
|
||||
@ -633,8 +583,8 @@ inline void GCC_NO_UBSAN SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
__m128i x1 = _mm_unpackhi_epi64(block0, block1);
|
||||
@ -678,8 +628,8 @@ inline void GCC_NO_UBSAN SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
inline void GCC_NO_UBSAN SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
__m128i x1 = _mm_unpackhi_epi64(block0, block1);
|
||||
@ -706,8 +656,8 @@ inline void GCC_NO_UBSAN SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word64 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following.
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
__m128i x1 = _mm_unpackhi_epi64(block0, block1);
|
||||
@ -785,8 +735,8 @@ inline __m128i RotateRight32<8>(const __m128i& val)
|
||||
inline void GCC_NO_UBSAN SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. Thanks to Peter Cordes for help with the
|
||||
// SSE permutes below.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
@ -815,8 +765,8 @@ inline void GCC_NO_UBSAN SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
|
||||
inline void GCC_NO_UBSAN SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. Thanks to Peter Cordes for help with the
|
||||
// SSE permutes below.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
@ -846,8 +796,8 @@ inline void GCC_NO_UBSAN SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. Thanks to Peter Cordes for help with the
|
||||
// SSE permutes below.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
@ -901,8 +851,8 @@ inline void GCC_NO_UBSAN SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// Rearrange the data for vectorization. The incoming data was read from
|
||||
// a big-endian byte array. Depending on the number of blocks it needs to
|
||||
// Rearrange the data for vectorization. The incoming data was read into
|
||||
// a little-endian word array. Depending on the number of blocks it needs to
|
||||
// be permuted to the following. Thanks to Peter Cordes for help with the
|
||||
// SSE permutes below.
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
|
4
speck.h
4
speck.h
@ -17,11 +17,11 @@
|
||||
#include "seckey.h"
|
||||
#include "secblock.h"
|
||||
|
||||
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
|
||||
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
|
||||
# define CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS 1
|
||||
#endif
|
||||
|
||||
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
|
||||
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
|
||||
# define CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS 1
|
||||
#endif
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user