Fix Simon-64 CTR mode

This fixes CTR mode for Simon-64. We were only incrementing half the counters.

We still have Speck-64 to clean up.
Jeffrey Walton 2017-12-07 19:45:32 -05:00
parent 07f2a4fc3f
commit 02037b5ce6
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 110 additions and 120 deletions
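For context before the diffs: each 128-bit SIMD vector packs two consecutive 8-byte counter blocks, so CTR mode must seed the pair as {ctr, ctr+1} and then advance both halves by 2 per vector. A minimal scalar sketch of that layout follows (types and helper names are illustrative only, not library code; counter byte order is glossed over):

    #include <cstdint>

    // Scalar model of the packed-counter scheme in the kernels below.
    // A 128-bit vector holds two consecutive 8-byte CTR blocks. The real
    // code expresses the increments with byte-swapped lane constants
    // (s_one64_1b, s_one64_2b); carries between bytes are ignored here.
    struct CtrPair { uint64_t lo, hi; };

    // Seed: duplicate the single input counter, bump only the high copy.
    inline CtrPair SeedPair(uint64_t ctr) {
        return CtrPair{ ctr, ctr + 1 };        // the {0,1} initial increment
    }

    // Step: both packed counters advance by 2 per vector. The old code
    // advanced only half of them, so keystream blocks were reused.
    inline CtrPair NextPair(const CtrPair& p) {
        return CtrPair{ p.lo + 2, p.hi + 2 };  // the {2,2} routine step
    }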

View File

@@ -52,9 +52,13 @@ using CryptoPP::BlockTransformation;
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
#if defined(CRYPTOPP_LITTLE_ENDIAN)
const word32 s_one64[] = {0, 1<<24, 0, 1<<24};
const word32 s_zero[] = {0, 0, 0, 0};
const word32 s_one64_1b[] = {0, 0, 0, 1<<24}; // Only the second 8-byte block is incremented after loading
const word32 s_one64_2b[] = {0, 2<<24, 0, 2<<24}; // Routine step. Both 8-byte blocks are incremented
#else
const word32 s_one64[] = {0, 1, 0, 1};
const word32 s_zero[] = {0, 0, 0, 0};
const word32 s_one64_1b[] = {0, 0, 0, 1};
const word32 s_one64_2b[] = {0, 2, 0, 2};
#endif
template <unsigned int R>
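A note on the table values (my explanation, not text from the commit): the CTR block is a big-endian byte string, so on a little-endian CPU its least significant byte lands in the most significant byte of a word32 lane, and adding 1<<24 to that lane adds 1 to the counter. A sketch, ignoring carries between bytes:

    #include <cstdint>

    // Why the little-endian table uses 1<<24: big-endian counter bytes
    // {..., 0x05} load into a little-endian word32 lane as 0x05000000,
    // so lane + (1u<<24) gives 0x06000000, i.e. counter bytes {..., 0x06}.
    // Carry out of the low counter byte is not handled in this sketch.
    inline uint32_t BumpBigEndianLane(uint32_t leLane) {
        return leLane + (1u << 24);
    }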
@@ -125,30 +129,6 @@ inline uint32x4_t SIMON64_f(const uint32x4_t& val)
vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
template <typename T>
inline word32* Ptr32(T* ptr)
{
return reinterpret_cast<word32*>(ptr);
}
template <typename T>
inline const word32* Ptr32(const T* ptr)
{
return reinterpret_cast<const word32*>(ptr);
}
template <typename T>
inline word64* Ptr64(T* ptr)
{
return reinterpret_cast<word64*>(ptr);
}
template <typename T>
inline const word64* Ptr64(const T* ptr)
{
return reinterpret_cast<const word64*>(ptr);
}
inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
const word32 *subkeys, unsigned int rounds)
{
@@ -388,25 +368,40 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
if (flags & BlockTransformation::BT_AllowParallel)
{
// Load these magic values once. Analysis claims be1 and be2 may be
// used uninitialized, but they are initialized when the block is a ctr.
uint32x4_t be1, be2;
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
be1 = vld1q_u32(s_one64_1b);
be2 = vld1q_u32(s_one64_2b);
}
while (length >= 6*neonBlockSize)
{
uint32x4_t block0, block1, block2, block3, block4, block5;
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const uint32x4_t be1 = vld1q_u32(s_one64);
block1 = vaddq_u32(block0, be1);
block2 = vaddq_u32(block1, be1);
block3 = vaddq_u32(block2, be1);
block4 = vaddq_u32(block3, be1);
block5 = vaddq_u32(block4, be1);
vst1q_u8(const_cast<byte*>(inBlocks),
vreinterpretq_u8_u32(vaddq_u32(block5, be1)));
// For 64-bit block ciphers we need to load the initial single CTR block.
// After the dup load we have two counters in the 128-bit vector. Then we
// need to increment the low ctr by 0 and the high ctr by 1.
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
// After initial increment of {0,1} remaining counters increment by {2,2}.
block1 = vaddq_u32(be2, block0);
block2 = vaddq_u32(be2, block1);
block3 = vaddq_u32(be2, block2);
block4 = vaddq_u32(be2, block3);
block5 = vaddq_u32(be2, block4);
vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
}
else
{
const int inc = static_cast<int>(inIncrement);
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+2*inc));
block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+3*inc));
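Isolating the new seeding step above (a sketch assuming <arm_neon.h>, little-endian lanes, and the tables from the first hunk; helper names are mine):

    #include <arm_neon.h>
    #include <cstdint>

    static const uint32_t kOne1b[4] = {0, 0, 0, 1u << 24};        // high copy +1
    static const uint32_t kOne2b[4] = {0, 2u << 24, 0, 2u << 24}; // both +2

    inline uint32x4_t SeedCtrPair(const uint8_t* ctrBlock) {
        // Duplicate the lone 8-byte counter into both vector halves, then
        // increment only the second copy: the lanes now hold {ctr, ctr+1}.
        const uint32x4_t two = vreinterpretq_u32_u64(
            vld1q_dup_u64(reinterpret_cast<const uint64_t*>(ctrBlock)));
        return vaddq_u32(two, vld1q_u32(kOne1b));
    }

    inline uint32x4_t NextCtrPair(const uint32x4_t& prev) {
        // Both packed counters advance by 2 per 128-bit vector.
        return vaddq_u32(prev, vld1q_u32(kOne2b));
    }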
@@ -456,18 +451,24 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
while (length >= 2*neonBlockSize)
{
uint32x4_t block0, block1;
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const uint32x4_t be1 = vld1q_u32(s_one64);
block1 = vaddq_u32(block0, be1);
vst1q_u8(const_cast<byte*>(inBlocks),
vreinterpretq_u8_u32(vaddq_u32(block1, be1)));
// For 64-bit block ciphers we need to load the initial single CTR block.
// After the dup load we have two counters in the 128-bit vector. Then we
// need to increment the low ctr by 0 and the high ctr by 1.
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
// After initial increment of {0,1} remaining counters increment by {2,2}.
block1 = vaddq_u32(be2, block0);
vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
}
else
{
const int inc = static_cast<int>(inIncrement);
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
inBlocks += 2*inc;
}
@@ -521,16 +522,14 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
while (length >= blockSize)
{
uint32x4_t block, zero = {0,0,0,0};
block = vsetq_lane_u32(Ptr32(inBlocks)[0], block, 0);
block = vsetq_lane_u32(Ptr32(inBlocks)[1], block, 1);
uint32x4_t zero = vld1q_u32(s_zero);
uint32x4_t block = vreinterpretq_u32_u64(vld1q_dup_u64(
reinterpret_cast<const word64*>(inBlocks)));
if (flags & BlockTransformation::BT_XorInput)
{
uint32x4_t x;
x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
block = veorq_u32(block, x);
block = veorq_u32(block, vreinterpretq_u32_u64(
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
}
if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -540,16 +539,12 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
uint32x4_t x;
x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
block = veorq_u32(block, x);
block = veorq_u32(block, vreinterpretq_u32_u64(
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
}
word32 t[2];
t[0] = vgetq_lane_u32(block, 0);
t[1] = vgetq_lane_u32(block, 1);
std::memcpy(outBlocks, t, sizeof(t));
vst1_u8(const_cast<byte*>(outBlocks),
vget_low_u8(vreinterpretq_u8_u32(block)));
inBlocks += inIncrement;
outBlocks += outIncrement;
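The single-block tail above uses the same dup-load trick; a reduced sketch (hypothetical helper, cipher rounds elided):

    #include <arm_neon.h>
    #include <cstdint>

    // The lone 8-byte block is dup-loaded so the two-block kernel can
    // run on it, and only the low half of the result is stored back.
    inline void ProcessOneBlockSketch(const uint8_t* in, uint8_t* out) {
        uint32x4_t block = vreinterpretq_u32_u64(
            vld1q_dup_u64(reinterpret_cast<const uint64_t*>(in)));
        // ... SIMON64 rounds would transform 'block' here ...
        vst1_u8(out, vget_low_u8(vreinterpretq_u8_u32(block)));
    }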
@@ -1762,7 +1757,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
if (flags & BlockTransformation::BT_AllowParallel)
{
// Load these magic value once. Analysis claims be1 and be2
// Load these magic values once. Analysis claims be1 and be2
// may be uninitialized, but they are when the block is a ctr.
__m128i be1, be2;
if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -1782,7 +1777,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
block0 = _mm_add_epi32(be1, _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
// After initial increment both counters increment by 1.
// After initial increment of {0,1} remaining counters increment by {2,2}.
block1 = _mm_add_epi32(be2, block0);
block2 = _mm_add_epi32(be2, block1);
block3 = _mm_add_epi32(be2, block2);
@@ -1872,7 +1867,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
block0 = _mm_add_epi32(be1, _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
// After initial increment both counters increment by 1.
// After initial increment of {0,1} remaining counters increment by {2,2}.
block1 = _mm_add_epi32(be2, block0);
// Store the next counter.
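The SSE4.1 hunks follow the same scheme; here is the counter handling in isolation (only SSE3's _mm_loaddup_pd is actually required; helper names are mine):

    #include <pmmintrin.h>  // SSE3: _mm_loaddup_pd
    #include <cstdint>

    // _mm_set_epi32 takes lanes high-to-low, so these constants match
    // the s_one64_1b/s_one64_2b tables in the NEON section.
    inline __m128i SeedCtrPairSSE(const uint8_t* ctrBlock) {
        const __m128i be1 = _mm_set_epi32(1 << 24, 0, 0, 0);       // high copy +1
        const __m128i two = _mm_castpd_si128(
            _mm_loaddup_pd(reinterpret_cast<const double*>(ctrBlock)));
        return _mm_add_epi32(two, be1);
    }

    inline __m128i NextCtrPairSSE(const __m128i prev) {
        const __m128i be2 = _mm_set_epi32(2 << 24, 0, 2 << 24, 0); // both +2
        return _mm_add_epi32(prev, be2);
    }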

View File

@@ -50,9 +50,13 @@ using CryptoPP::BlockTransformation;
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
#if defined(CRYPTOPP_LITTLE_ENDIAN)
const word32 s_one64[] = {0, 1<<24, 0, 2<<24};
const word32 s_zero[] = {0, 0, 0, 0};
const word32 s_one64_1b[] = {0, 0, 0, 1<<24}; // Only the second 8-byte block is incremented after loading
const word32 s_one64_2b[] = {0, 2<<24, 0, 2<<24}; // Routine step. Both 8-byte blocks are incremented
#else
const word32 s_one64[] = {0, 2, 0, 1};
const word32 s_zero[] = {0, 0, 0, 0};
const word32 s_one64_1b[] = {0, 0, 0, 1};
const word32 s_one64_2b[] = {0, 2, 0, 2};
#endif
template <unsigned int R>
@@ -117,30 +121,6 @@ inline uint32x4_t Shuffle32(const uint32x4_t& val)
#endif
}
template <typename T>
inline word32* Ptr32(T* ptr)
{
return reinterpret_cast<word32*>(ptr);
}
template <typename T>
inline const word32* Ptr32(const T* ptr)
{
return reinterpret_cast<const word32*>(ptr);
}
template <typename T>
inline word64* Ptr64(T* ptr)
{
return reinterpret_cast<word64*>(ptr);
}
template <typename T>
inline const word64* Ptr64(const T* ptr)
{
return reinterpret_cast<const word64*>(ptr);
}
inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
const word32 *subkeys, unsigned int rounds)
{
@@ -360,25 +340,40 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
if (flags & BlockTransformation::BT_AllowParallel)
{
// Load these magic values once. Analysis claims be1 and be2 may be
// used uninitialized, but they are initialized when the block is a ctr.
uint32x4_t be1, be2;
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
be1 = vld1q_u32(s_one64_1b);
be2 = vld1q_u32(s_one64_2b);
}
while (length >= 6*neonBlockSize)
{
uint32x4_t block0, block1, block2, block3, block4, block5;
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const uint32x4_t be1 = vld1q_u32(s_one64);
block1 = vaddq_u32(block0, be1);
block2 = vaddq_u32(block1, be1);
block3 = vaddq_u32(block2, be1);
block4 = vaddq_u32(block3, be1);
block5 = vaddq_u32(block4, be1);
vst1q_u8(const_cast<byte*>(inBlocks),
vreinterpretq_u8_u32(vaddq_u32(block5, be1)));
// For 64-bit block ciphers we need to load the initial single CTR block.
// After the dup load we have two counters in the 128-bit vector. Then we
// need to increment the low ctr by 0 and the high ctr by 1.
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
// After initial increment of {0,1} remaining counters increment by {2,2}.
block1 = vaddq_u32(be2, block0);
block2 = vaddq_u32(be2, block1);
block3 = vaddq_u32(be2, block2);
block4 = vaddq_u32(be2, block3);
block5 = vaddq_u32(be2, block4);
vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
}
else
{
const int inc = static_cast<int>(inIncrement);
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+2*inc));
block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+3*inc));
@@ -428,18 +423,24 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
while (length >= 2*neonBlockSize)
{
uint32x4_t block0, block1;
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const uint32x4_t be1 = vld1q_u32(s_one64);
block1 = vaddq_u32(block0, be1);
vst1q_u8(const_cast<byte*>(inBlocks),
vreinterpretq_u8_u32(vaddq_u32(block1, be1)));
// For 64-bit block ciphers we need to load the initial single CTR block.
// After the dup load we have two counters in the 128-bit vector. Then we
// need to increment the low ctr by 0 and the high ctr by 1.
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
// After initial increment of {0,1} remaining counters increment by {2,2}.
block1 = vaddq_u32(be2, block0);
vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
}
else
{
const int inc = static_cast<int>(inIncrement);
block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
inBlocks += 2*inc;
}
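Worth noting in passing: the old counter write-back (vst1q_u8) stored a full 16 bytes over the 8-byte counter buffer, while the new vst1_u8/vget_low_u8 pair stores exactly the next 8-byte counter. The corrected store in isolation (hypothetical helper name):

    #include <arm_neon.h>
    #include <cstdint>

    // The counter buffer of a 64-bit block cipher is only 8 bytes, so
    // store just the low half of the vector holding the next counter.
    inline void StoreNextCounter(uint8_t* ctr, uint32x4_t next) {
        vst1_u8(ctr, vget_low_u8(vreinterpretq_u8_u32(next)));
    }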
@@ -493,16 +494,14 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
while (length >= blockSize)
{
uint32x4_t block, zero = {0,0,0,0};
block = vsetq_lane_u32(Ptr32(inBlocks)[0], block, 0);
block = vsetq_lane_u32(Ptr32(inBlocks)[1], block, 1);
uint32x4_t zero = vld1q_u32(s_zero);
uint32x4_t block = vreinterpretq_u32_u64(vld1q_dup_u64(
reinterpret_cast<const word64*>(inBlocks)));
if (flags & BlockTransformation::BT_XorInput)
{
uint32x4_t x;
x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
block = veorq_u32(block, x);
block = veorq_u32(block, vreinterpretq_u32_u64(
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
}
if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -512,16 +511,12 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
uint32x4_t x;
x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
block = veorq_u32(block, x);
block = veorq_u32(block, vreinterpretq_u32_u64(
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
}
word32 t[2];
t[0] = vgetq_lane_u32(block, 0);
t[1] = vgetq_lane_u32(block, 1);
std::memcpy(outBlocks, t, sizeof(t));
vst1_u8(const_cast<byte*>(outBlocks),
vget_low_u8(vreinterpretq_u8_u32(block)));
inBlocks += inIncrement;
outBlocks += outIncrement;
@@ -1658,7 +1653,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
if (flags & BlockTransformation::BT_AllowParallel)
{
// Load these magic value once. Analysis claims be1 and be2
// Load these magic values once. Analysis claims be1 and be2
// may be uninitialized, but they are when the block is a ctr.
__m128i be1, be2;
if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -1678,7 +1673,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
block0 = _mm_add_epi32(be1, _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
// After initial increment both counters increment by 1.
// After initial increment of {0,1} remaining counters increment by {2,2}.
block1 = _mm_add_epi32(be2, block0);
block2 = _mm_add_epi32(be2, block1);
block3 = _mm_add_epi32(be2, block2);
@@ -1768,7 +1763,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
block0 = _mm_add_epi32(be1, _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
// After initial increment both counters increment by 1.
// After initial increment of {0,1} remaining counters increment by {2,2}.
block1 = _mm_add_epi32(be2, block0);
// Store the next counter.
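Finally, a hedged regression sketch for the fix (the Crypto++ class, header, and key-size choices here are assumptions based on the 6.x tree, not part of this commit):

    #include "cryptlib.h"
    #include "modes.h"
    #include "simon.h"
    #include <cstring>
    #include <iostream>

    // Any counter that fails to advance shows up as a repeated 8-byte
    // keystream block, because a block cipher maps distinct inputs to
    // distinct outputs under one key. Assumes SIMON64 takes a 12-byte
    // key and an 8-byte IV.
    int main() {
        using namespace CryptoPP;
        const byte key[12] = {0};
        const byte iv[8] = {0};
        byte zeroes[32] = {0}, ks[32];
        CTR_Mode<SIMON64>::Encryption enc;
        enc.SetKeyWithIV(key, sizeof(key), iv, sizeof(iv));
        enc.ProcessData(ks, zeroes, sizeof(zeroes));
        for (int i = 0; i < 4; ++i)
            for (int j = i + 1; j < 4; ++j)
                if (std::memcmp(ks + 8*i, ks + 8*j, 8) == 0)
                    std::cout << "FAILED: keystream blocks " << i
                              << " and " << j << " repeat\n";
        return 0;
    }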