mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-23 09:59:42 +00:00
Fix Simon-64 CTR mode
This fixes CTR mode for Simon-64. We were only incrementing half the counters. We still have Speck-64 to cleanup.
This commit is contained in:
parent
07f2a4fc3f
commit
02037b5ce6
115
simon-simd.cpp
115
simon-simd.cpp
@ -52,9 +52,13 @@ using CryptoPP::BlockTransformation;
|
||||
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
|
||||
#if defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||
const word32 s_one64[] = {0, 1<<24, 0, 1<<24};
|
||||
const word32 s_zero[] = {0, 0, 0, 0};
|
||||
const word32 s_one64_1b[] = {0, 0, 0, 1<<24}; // Only second 8-byte block is incremented after loading
|
||||
const word32 s_one64_2b[] = {0, 2<<24, 0, 2<<24}; // Routine step. Both 8-byte block are incremented
|
||||
#else
|
||||
const word32 s_one64[] = {0, 1, 0, 1};
|
||||
const word32 s_zero[] = {0, 0, 0, 0};
|
||||
const word32 s_one64_1b[] = {0, 0, 0, 1};
|
||||
const word32 s_one64_2b[] = {0, 2, 0, 2};
|
||||
#endif
|
||||
|
||||
template <unsigned int R>
|
||||
@ -125,30 +129,6 @@ inline uint32x4_t SIMON64_f(const uint32x4_t& val)
|
||||
vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline word32* Ptr32(T* ptr)
|
||||
{
|
||||
return reinterpret_cast<word32*>(ptr);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline const word32* Ptr32(const T* ptr)
|
||||
{
|
||||
return reinterpret_cast<const word32*>(ptr);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline word64* Ptr64(T* ptr)
|
||||
{
|
||||
return reinterpret_cast<word64*>(ptr);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline const word64* Ptr64(const T* ptr)
|
||||
{
|
||||
return reinterpret_cast<const word64*>(ptr);
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
@ -388,25 +368,40 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||
|
||||
if (flags & BlockTransformation::BT_AllowParallel)
|
||||
{
|
||||
// Load these magic values once. Analysis claims be1 and be2
|
||||
// may be uninitialized, but they are when the block is a ctr.
|
||||
uint32x4_t be1, be2;
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
{
|
||||
be1 = vld1q_u32(s_one64_1b);
|
||||
be2 = vld1q_u32(s_one64_2b);
|
||||
}
|
||||
|
||||
while (length >= 6*neonBlockSize)
|
||||
{
|
||||
uint32x4_t block0, block1, block2, block3, block4, block5;
|
||||
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
|
||||
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
{
|
||||
const uint32x4_t be1 = vld1q_u32(s_one64);
|
||||
block1 = vaddq_u32(block0, be1);
|
||||
block2 = vaddq_u32(block1, be1);
|
||||
block3 = vaddq_u32(block2, be1);
|
||||
block4 = vaddq_u32(block3, be1);
|
||||
block5 = vaddq_u32(block4, be1);
|
||||
vst1q_u8(const_cast<byte*>(inBlocks),
|
||||
vreinterpretq_u8_u32(vaddq_u32(block5, be1)));
|
||||
// For 64-bit block ciphers we need to load the initial single CTR block.
|
||||
// After the dup load we have two counters in the XMM word. Then we need
|
||||
// to increment the low ctr by 0 and the high ctr by 1.
|
||||
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
|
||||
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
|
||||
|
||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||
block1 = vaddq_u32(be2, block0);
|
||||
block2 = vaddq_u32(be2, block1);
|
||||
block3 = vaddq_u32(be2, block2);
|
||||
block4 = vaddq_u32(be2, block3);
|
||||
block5 = vaddq_u32(be2, block4);
|
||||
|
||||
vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
|
||||
vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
|
||||
}
|
||||
else
|
||||
{
|
||||
const int inc = static_cast<int>(inIncrement);
|
||||
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
|
||||
block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
|
||||
block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+2*inc));
|
||||
block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+3*inc));
|
||||
@ -456,18 +451,24 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||
while (length >= 2*neonBlockSize)
|
||||
{
|
||||
uint32x4_t block0, block1;
|
||||
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
|
||||
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
{
|
||||
const uint32x4_t be1 = vld1q_u32(s_one64);
|
||||
block1 = vaddq_u32(block0, be1);
|
||||
vst1q_u8(const_cast<byte*>(inBlocks),
|
||||
vreinterpretq_u8_u32(vaddq_u32(block1, be1)));
|
||||
// For 64-bit block ciphers we need to load the initial single CTR block.
|
||||
// After the dup load we have two counters in the XMM word. Then we need
|
||||
// to increment the low ctr by 0 and the high ctr by 1.
|
||||
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
|
||||
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
|
||||
|
||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||
block1 = vaddq_u32(be2, block0);
|
||||
|
||||
vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
|
||||
vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
|
||||
}
|
||||
else
|
||||
{
|
||||
const int inc = static_cast<int>(inIncrement);
|
||||
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
|
||||
block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
|
||||
inBlocks += 2*inc;
|
||||
}
|
||||
@ -521,16 +522,14 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||
|
||||
while (length >= blockSize)
|
||||
{
|
||||
uint32x4_t block, zero = {0,0,0,0};
|
||||
block = vsetq_lane_u32(Ptr32(inBlocks)[0], block, 0);
|
||||
block = vsetq_lane_u32(Ptr32(inBlocks)[1], block, 1);
|
||||
uint32x4_t zero = vld1q_u32(s_zero);
|
||||
uint32x4_t block = vreinterpretq_u32_u64(vld1q_dup_u64(
|
||||
reinterpret_cast<const word64*>(inBlocks)));
|
||||
|
||||
if (flags & BlockTransformation::BT_XorInput)
|
||||
{
|
||||
uint32x4_t x;
|
||||
x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
|
||||
x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
|
||||
block = veorq_u32(block, x);
|
||||
block = veorq_u32(block, vreinterpretq_u32_u64(
|
||||
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
|
||||
}
|
||||
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
@ -540,16 +539,12 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||
|
||||
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
|
||||
{
|
||||
uint32x4_t x;
|
||||
x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
|
||||
x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
|
||||
block = veorq_u32(block, x);
|
||||
block = veorq_u32(block, vreinterpretq_u32_u64(
|
||||
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
|
||||
}
|
||||
|
||||
word32 t[2];
|
||||
t[0] = vgetq_lane_u32(block, 0);
|
||||
t[1] = vgetq_lane_u32(block, 1);
|
||||
std::memcpy(outBlocks, t, sizeof(t));
|
||||
vst1_u8(const_cast<byte*>(outBlocks),
|
||||
vget_low_u8(vreinterpretq_u8_u32(block)));
|
||||
|
||||
inBlocks += inIncrement;
|
||||
outBlocks += outIncrement;
|
||||
@ -1762,7 +1757,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
|
||||
|
||||
if (flags & BlockTransformation::BT_AllowParallel)
|
||||
{
|
||||
// Load these magic value once. Analysis claims be1 and be2
|
||||
// Load these magic values once. Analysis claims be1 and be2
|
||||
// may be uninitialized, but they are when the block is a ctr.
|
||||
__m128i be1, be2;
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
@ -1782,7 +1777,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
|
||||
block0 = _mm_add_epi32(be1, _mm_castpd_si128(
|
||||
_mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
|
||||
|
||||
// After initial increment both counters increment by 1.
|
||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||
block1 = _mm_add_epi32(be2, block0);
|
||||
block2 = _mm_add_epi32(be2, block1);
|
||||
block3 = _mm_add_epi32(be2, block2);
|
||||
@ -1872,7 +1867,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
|
||||
block0 = _mm_add_epi32(be1, _mm_castpd_si128(
|
||||
_mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
|
||||
|
||||
// After initial increment both counters increment by 1.
|
||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||
block1 = _mm_add_epi32(be2, block0);
|
||||
|
||||
// Store the next counter.
|
||||
|
115
speck-simd.cpp
115
speck-simd.cpp
@ -50,9 +50,13 @@ using CryptoPP::BlockTransformation;
|
||||
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
|
||||
#if defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||
const word32 s_one64[] = {0, 1<<24, 0, 2<<24};
|
||||
const word32 s_zero[] = {0, 0, 0, 0};
|
||||
const word32 s_one64_1b[] = {0, 0, 0, 1<<24}; // Only second 8-byte block is incremented after loading
|
||||
const word32 s_one64_2b[] = {0, 2<<24, 0, 2<<24}; // Routine step. Both 8-byte block are incremented
|
||||
#else
|
||||
const word32 s_one64[] = {0, 2, 0, 1};
|
||||
const word32 s_zero[] = {0, 0, 0, 0};
|
||||
const word32 s_one64_1b[] = {0, 0, 0, 1};
|
||||
const word32 s_one64_2b[] = {0, 2, 0, 2};
|
||||
#endif
|
||||
|
||||
template <unsigned int R>
|
||||
@ -117,30 +121,6 @@ inline uint32x4_t Shuffle32(const uint32x4_t& val)
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline word32* Ptr32(T* ptr)
|
||||
{
|
||||
return reinterpret_cast<word32*>(ptr);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline const word32* Ptr32(const T* ptr)
|
||||
{
|
||||
return reinterpret_cast<const word32*>(ptr);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline word64* Ptr64(T* ptr)
|
||||
{
|
||||
return reinterpret_cast<word64*>(ptr);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline const word64* Ptr64(const T* ptr)
|
||||
{
|
||||
return reinterpret_cast<const word64*>(ptr);
|
||||
}
|
||||
|
||||
inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
@ -360,25 +340,40 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||
|
||||
if (flags & BlockTransformation::BT_AllowParallel)
|
||||
{
|
||||
// Load these magic values once. Analysis claims be1 and be2
|
||||
// may be uninitialized, but they are when the block is a ctr.
|
||||
uint32x4_t be1, be2;
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
{
|
||||
be1 = vld1q_u32(s_one64_1b);
|
||||
be2 = vld1q_u32(s_one64_2b);
|
||||
}
|
||||
|
||||
while (length >= 6*neonBlockSize)
|
||||
{
|
||||
uint32x4_t block0, block1, block2, block3, block4, block5;
|
||||
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
|
||||
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
{
|
||||
const uint32x4_t be1 = vld1q_u32(s_one64);
|
||||
block1 = vaddq_u32(block0, be1);
|
||||
block2 = vaddq_u32(block1, be1);
|
||||
block3 = vaddq_u32(block2, be1);
|
||||
block4 = vaddq_u32(block3, be1);
|
||||
block5 = vaddq_u32(block4, be1);
|
||||
vst1q_u8(const_cast<byte*>(inBlocks),
|
||||
vreinterpretq_u8_u32(vaddq_u32(block5, be1)));
|
||||
// For 64-bit block ciphers we need to load the initial single CTR block.
|
||||
// After the dup load we have two counters in the XMM word. Then we need
|
||||
// to increment the low ctr by 0 and the high ctr by 1.
|
||||
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
|
||||
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
|
||||
|
||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||
block1 = vaddq_u32(be2, block0);
|
||||
block2 = vaddq_u32(be2, block1);
|
||||
block3 = vaddq_u32(be2, block2);
|
||||
block4 = vaddq_u32(be2, block3);
|
||||
block5 = vaddq_u32(be2, block4);
|
||||
|
||||
vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
|
||||
vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
|
||||
}
|
||||
else
|
||||
{
|
||||
const int inc = static_cast<int>(inIncrement);
|
||||
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
|
||||
block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
|
||||
block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+2*inc));
|
||||
block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+3*inc));
|
||||
@ -428,18 +423,24 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||
while (length >= 2*neonBlockSize)
|
||||
{
|
||||
uint32x4_t block0, block1;
|
||||
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
|
||||
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
{
|
||||
const uint32x4_t be1 = vld1q_u32(s_one64);
|
||||
block1 = vaddq_u32(block0, be1);
|
||||
vst1q_u8(const_cast<byte*>(inBlocks),
|
||||
vreinterpretq_u8_u32(vaddq_u32(block1, be1)));
|
||||
// For 64-bit block ciphers we need to load the initial single CTR block.
|
||||
// After the dup load we have two counters in the XMM word. Then we need
|
||||
// to increment the low ctr by 0 and the high ctr by 1.
|
||||
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
|
||||
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
|
||||
|
||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||
block1 = vaddq_u32(be2, block0);
|
||||
|
||||
vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
|
||||
vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
|
||||
}
|
||||
else
|
||||
{
|
||||
const int inc = static_cast<int>(inIncrement);
|
||||
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
|
||||
block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
|
||||
inBlocks += 2*inc;
|
||||
}
|
||||
@ -493,16 +494,14 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||
|
||||
while (length >= blockSize)
|
||||
{
|
||||
uint32x4_t block, zero = {0,0,0,0};
|
||||
block = vsetq_lane_u32(Ptr32(inBlocks)[0], block, 0);
|
||||
block = vsetq_lane_u32(Ptr32(inBlocks)[1], block, 1);
|
||||
uint32x4_t zero = vld1q_u32(s_zero);
|
||||
uint32x4_t block = vreinterpretq_u32_u64(vld1q_dup_u64(
|
||||
reinterpret_cast<const word64*>(inBlocks)));
|
||||
|
||||
if (flags & BlockTransformation::BT_XorInput)
|
||||
{
|
||||
uint32x4_t x;
|
||||
x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
|
||||
x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
|
||||
block = veorq_u32(block, x);
|
||||
block = veorq_u32(block, vreinterpretq_u32_u64(
|
||||
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
|
||||
}
|
||||
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
@ -512,16 +511,12 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||
|
||||
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
|
||||
{
|
||||
uint32x4_t x;
|
||||
x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
|
||||
x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
|
||||
block = veorq_u32(block, x);
|
||||
block = veorq_u32(block, vreinterpretq_u32_u64(
|
||||
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
|
||||
}
|
||||
|
||||
word32 t[2];
|
||||
t[0] = vgetq_lane_u32(block, 0);
|
||||
t[1] = vgetq_lane_u32(block, 1);
|
||||
std::memcpy(outBlocks, t, sizeof(t));
|
||||
vst1_u8(const_cast<byte*>(outBlocks),
|
||||
vget_low_u8(vreinterpretq_u8_u32(block)));
|
||||
|
||||
inBlocks += inIncrement;
|
||||
outBlocks += outIncrement;
|
||||
@ -1658,7 +1653,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
|
||||
|
||||
if (flags & BlockTransformation::BT_AllowParallel)
|
||||
{
|
||||
// Load these magic value once. Analysis claims be1 and be2
|
||||
// Load these magic values once. Analysis claims be1 and be2
|
||||
// may be uninitialized, but they are when the block is a ctr.
|
||||
__m128i be1, be2;
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
@ -1678,7 +1673,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
|
||||
block0 = _mm_add_epi32(be1, _mm_castpd_si128(
|
||||
_mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
|
||||
|
||||
// After initial increment both counters increment by 1.
|
||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||
block1 = _mm_add_epi32(be2, block0);
|
||||
block2 = _mm_add_epi32(be2, block1);
|
||||
block3 = _mm_add_epi32(be2, block2);
|
||||
@ -1768,7 +1763,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
|
||||
block0 = _mm_add_epi32(be1, _mm_castpd_si128(
|
||||
_mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
|
||||
|
||||
// After initial increment both counters increment by 1.
|
||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||
block1 = _mm_add_epi32(be2, block0);
|
||||
|
||||
// Store the next counter.
|
||||
|
Loading…
Reference in New Issue
Block a user