From 02037b5ce659269579b6556ec6eea20929d71253 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Thu, 7 Dec 2017 19:45:32 -0500
Subject: [PATCH] Fix Simon-64 CTR mode

This fixes CTR mode for Simon-64. We were only incrementing half the
counters. We still have Speck-64 to clean up.
---
 simon-simd.cpp | 115 +++++++++++++++++++++++--------------------------
 speck-simd.cpp | 115 +++++++++++++++++++++++--------------------------
 2 files changed, 110 insertions(+), 120 deletions(-)

diff --git a/simon-simd.cpp b/simon-simd.cpp
index 17883fc9..d0738398 100644
--- a/simon-simd.cpp
+++ b/simon-simd.cpp
@@ -52,9 +52,13 @@ using CryptoPP::BlockTransformation;
 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
 
 #if defined(CRYPTOPP_LITTLE_ENDIAN)
-const word32 s_one64[] = {0, 1<<24, 0, 1<<24};
+const word32 s_zero[] = {0, 0, 0, 0};
+const word32 s_one64_1b[] = {0, 0, 0, 1<<24};      // Only the second 8-byte block is incremented after loading
+const word32 s_one64_2b[] = {0, 2<<24, 0, 2<<24};  // Routine step. Both 8-byte blocks are incremented
 #else
-const word32 s_one64[] = {0, 1, 0, 1};
+const word32 s_zero[] = {0, 0, 0, 0};
+const word32 s_one64_1b[] = {0, 0, 0, 1};
+const word32 s_one64_2b[] = {0, 2, 0, 2};
 #endif
 
 template <unsigned int R>
@@ -125,30 +129,6 @@ inline uint32x4_t SIMON64_f(const uint32x4_t& val)
         vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
 }
 
-template <class T>
-inline word32* Ptr32(T* ptr)
-{
-    return reinterpret_cast<word32*>(ptr);
-}
-
-template <class T>
-inline const word32* Ptr32(const T* ptr)
-{
-    return reinterpret_cast<const word32*>(ptr);
-}
-
-template <class T>
-inline word64* Ptr64(T* ptr)
-{
-    return reinterpret_cast<word64*>(ptr);
-}
-
-template <class T>
-inline const word64* Ptr64(const T* ptr)
-{
-    return reinterpret_cast<const word64*>(ptr);
-}
-
 inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
     const word32 *subkeys, unsigned int rounds)
 {
@@ -388,25 +368,40 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
 
     if (flags & BlockTransformation::BT_AllowParallel)
     {
+        // Load these magic values once. Analysis claims be1 and be2 may be
+        // uninitialized, but they are initialized when the block is a counter.
+        uint32x4_t be1, be2;
+        if (flags & BlockTransformation::BT_InBlockIsCounter)
+        {
+            be1 = vld1q_u32(s_one64_1b);
+            be2 = vld1q_u32(s_one64_2b);
+        }
+
         while (length >= 6*neonBlockSize)
         {
             uint32x4_t block0, block1, block2, block3, block4, block5;
-            block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
-
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
-                const uint32x4_t be1 = vld1q_u32(s_one64);
-                block1 = vaddq_u32(block0, be1);
-                block2 = vaddq_u32(block1, be1);
-                block3 = vaddq_u32(block2, be1);
-                block4 = vaddq_u32(block3, be1);
-                block5 = vaddq_u32(block4, be1);
-                vst1q_u8(const_cast<byte*>(inBlocks),
-                    vreinterpretq_u8_u32(vaddq_u32(block5, be1)));
+                // For 64-bit block ciphers we need to load the initial single CTR block.
+                // After the dup load we have two counters in the NEON vector. Then we need
+                // to increment the low ctr by 0 and the high ctr by 1.
+                block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
+                    vld1q_dup_u64(reinterpret_cast<const uint64_t*>(inBlocks))));
+
+                // After the initial increment of {0,1} the remaining counters increment by {2,2}.
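+                // (Each 128-bit vector carries two consecutive 8-byte counter blocks, hence the step of two.)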
+                block1 = vaddq_u32(be2, block0);
+                block2 = vaddq_u32(be2, block1);
+                block3 = vaddq_u32(be2, block2);
+                block4 = vaddq_u32(be2, block3);
+                block5 = vaddq_u32(be2, block4);
+
+                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
+                    vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
             }
             else
             {
                 const int inc = static_cast<int>(inIncrement);
+                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
                 block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
                 block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+2*inc));
                 block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+3*inc));
@@ -456,18 +451,24 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
         while (length >= 2*neonBlockSize)
         {
             uint32x4_t block0, block1;
-            block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
-
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
-                const uint32x4_t be1 = vld1q_u32(s_one64);
-                block1 = vaddq_u32(block0, be1);
-                vst1q_u8(const_cast<byte*>(inBlocks),
-                    vreinterpretq_u8_u32(vaddq_u32(block1, be1)));
+                // For 64-bit block ciphers we need to load the initial single CTR block.
+                // After the dup load we have two counters in the NEON vector. Then we need
+                // to increment the low ctr by 0 and the high ctr by 1.
+                block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
+                    vld1q_dup_u64(reinterpret_cast<const uint64_t*>(inBlocks))));
+
+                // After the initial increment of {0,1} the remaining counters increment by {2,2}.
+                block1 = vaddq_u32(be2, block0);
+
+                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
+                    vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
             }
             else
             {
                 const int inc = static_cast<int>(inIncrement);
+                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
                 block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
                 inBlocks += 2*inc;
             }
@@ -521,16 +522,14 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
 
     while (length >= blockSize)
     {
-        uint32x4_t block, zero = {0,0,0,0};
-        block = vsetq_lane_u32(Ptr32(inBlocks)[0], block, 0);
-        block = vsetq_lane_u32(Ptr32(inBlocks)[1], block, 1);
+        uint32x4_t zero = vld1q_u32(s_zero);
+        uint32x4_t block = vreinterpretq_u32_u64(vld1q_dup_u64(
+            reinterpret_cast<const uint64_t*>(inBlocks)));
 
         if (flags & BlockTransformation::BT_XorInput)
         {
-            uint32x4_t x;
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
-            block = veorq_u32(block, x);
+            block = veorq_u32(block, vreinterpretq_u32_u64(
+                vld1q_dup_u64(reinterpret_cast<const uint64_t*>(xorBlocks))));
         }
 
         if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -540,16 +539,12 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
 
         if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
         {
-            uint32x4_t x;
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
-            block = veorq_u32(block, x);
+            block = veorq_u32(block, vreinterpretq_u32_u64(
+                vld1q_dup_u64(reinterpret_cast<const uint64_t*>(xorBlocks))));
         }
 
-        word32 t[2];
-        t[0] = vgetq_lane_u32(block, 0);
-        t[1] = vgetq_lane_u32(block, 1);
-        std::memcpy(outBlocks, t, sizeof(t));
+        vst1_u8(const_cast<byte*>(outBlocks),
+            vget_low_u8(vreinterpretq_u8_u32(block)));
 
         inBlocks += inIncrement;
         outBlocks += outIncrement;
@@ -1762,7 +1757,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
 
     if (flags & BlockTransformation::BT_AllowParallel)
     {
-        // Load these magic value once. Analysis claims be1 and be2
+        // Load these magic values once. Analysis claims be1 and be2
         // may be uninitialized, but they are when the block is a ctr.
         __m128i be1, be2;
         if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -1782,7 +1777,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
                 block0 = _mm_add_epi32(be1, _mm_castpd_si128(
                     _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
 
-                // After initial increment both counters increment by 1.
+                // After the initial increment of {0,1} the remaining counters increment by {2,2}.
                 block1 = _mm_add_epi32(be2, block0);
                 block2 = _mm_add_epi32(be2, block1);
                 block3 = _mm_add_epi32(be2, block2);
@@ -1872,7 +1867,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
                 block0 = _mm_add_epi32(be1, _mm_castpd_si128(
                     _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
 
-                // After initial increment both counters increment by 1.
+                // After the initial increment of {0,1} the remaining counters increment by {2,2}.
                 block1 = _mm_add_epi32(be2, block0);
 
                 // Store the next counter.
diff --git a/speck-simd.cpp b/speck-simd.cpp
index 0ab5d403..03afc2a0 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -50,9 +50,13 @@ using CryptoPP::BlockTransformation;
 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
 
 #if defined(CRYPTOPP_LITTLE_ENDIAN)
-const word32 s_one64[] = {0, 1<<24, 0, 2<<24};
+const word32 s_zero[] = {0, 0, 0, 0};
+const word32 s_one64_1b[] = {0, 0, 0, 1<<24};      // Only the second 8-byte block is incremented after loading
+const word32 s_one64_2b[] = {0, 2<<24, 0, 2<<24};  // Routine step. Both 8-byte blocks are incremented
 #else
-const word32 s_one64[] = {0, 2, 0, 1};
+const word32 s_zero[] = {0, 0, 0, 0};
+const word32 s_one64_1b[] = {0, 0, 0, 1};
+const word32 s_one64_2b[] = {0, 2, 0, 2};
 #endif
 
 template <unsigned int R>
@@ -117,30 +121,6 @@ inline uint32x4_t Shuffle32(const uint32x4_t& val)
 #endif
 }
 
-template <class T>
-inline word32* Ptr32(T* ptr)
-{
-    return reinterpret_cast<word32*>(ptr);
-}
-
-template <class T>
-inline const word32* Ptr32(const T* ptr)
-{
-    return reinterpret_cast<const word32*>(ptr);
-}
-
-template <class T>
-inline word64* Ptr64(T* ptr)
-{
-    return reinterpret_cast<word64*>(ptr);
-}
-
-template <class T>
-inline const word64* Ptr64(const T* ptr)
-{
-    return reinterpret_cast<const word64*>(ptr);
-}
-
 inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
     const word32 *subkeys, unsigned int rounds)
 {
@@ -360,25 +340,40 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
 
     if (flags & BlockTransformation::BT_AllowParallel)
    {
+        // Load these magic values once. Analysis claims be1 and be2 may be
+        // uninitialized, but they are initialized when the block is a counter.
+        uint32x4_t be1, be2;
+        if (flags & BlockTransformation::BT_InBlockIsCounter)
+        {
+            be1 = vld1q_u32(s_one64_1b);
+            be2 = vld1q_u32(s_one64_2b);
+        }
+
         while (length >= 6*neonBlockSize)
         {
             uint32x4_t block0, block1, block2, block3, block4, block5;
-            block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
-
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
-                const uint32x4_t be1 = vld1q_u32(s_one64);
-                block1 = vaddq_u32(block0, be1);
-                block2 = vaddq_u32(block1, be1);
-                block3 = vaddq_u32(block2, be1);
-                block4 = vaddq_u32(block3, be1);
-                block5 = vaddq_u32(block4, be1);
-                vst1q_u8(const_cast<byte*>(inBlocks),
-                    vreinterpretq_u8_u32(vaddq_u32(block5, be1)));
+                // For 64-bit block ciphers we need to load the initial single CTR block.
+                // After the dup load we have two counters in the NEON vector. Then we need
+                // to increment the low ctr by 0 and the high ctr by 1.
+                block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
+                    vld1q_dup_u64(reinterpret_cast<const uint64_t*>(inBlocks))));
+
+                // After the initial increment of {0,1} the remaining counters increment by {2,2}.
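+                // block0 now holds counters n and n+1. The blocks below hold n+2 ... n+11,
+                // and the store that follows writes n+12 back as the next counter.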
+                block1 = vaddq_u32(be2, block0);
+                block2 = vaddq_u32(be2, block1);
+                block3 = vaddq_u32(be2, block2);
+                block4 = vaddq_u32(be2, block3);
+                block5 = vaddq_u32(be2, block4);
+
+                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
+                    vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
             }
             else
             {
                 const int inc = static_cast<int>(inIncrement);
+                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
                 block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
                 block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+2*inc));
                 block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+3*inc));
@@ -428,18 +423,24 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
         while (length >= 2*neonBlockSize)
         {
             uint32x4_t block0, block1;
-            block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
-
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
-                const uint32x4_t be1 = vld1q_u32(s_one64);
-                block1 = vaddq_u32(block0, be1);
-                vst1q_u8(const_cast<byte*>(inBlocks),
-                    vreinterpretq_u8_u32(vaddq_u32(block1, be1)));
+                // For 64-bit block ciphers we need to load the initial single CTR block.
+                // After the dup load we have two counters in the NEON vector. Then we need
+                // to increment the low ctr by 0 and the high ctr by 1.
+                block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
+                    vld1q_dup_u64(reinterpret_cast<const uint64_t*>(inBlocks))));
+
+                // After the initial increment of {0,1} the remaining counters increment by {2,2}.
+                block1 = vaddq_u32(be2, block0);
+
+                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
+                    vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
             }
             else
             {
                 const int inc = static_cast<int>(inIncrement);
+                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
                 block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
                 inBlocks += 2*inc;
             }
@@ -493,16 +494,14 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
 
     while (length >= blockSize)
     {
-        uint32x4_t block, zero = {0,0,0,0};
-        block = vsetq_lane_u32(Ptr32(inBlocks)[0], block, 0);
-        block = vsetq_lane_u32(Ptr32(inBlocks)[1], block, 1);
+        uint32x4_t zero = vld1q_u32(s_zero);
+        uint32x4_t block = vreinterpretq_u32_u64(vld1q_dup_u64(
+            reinterpret_cast<const uint64_t*>(inBlocks)));
 
         if (flags & BlockTransformation::BT_XorInput)
         {
-            uint32x4_t x;
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
-            block = veorq_u32(block, x);
+            block = veorq_u32(block, vreinterpretq_u32_u64(
+                vld1q_dup_u64(reinterpret_cast<const uint64_t*>(xorBlocks))));
         }
 
         if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -512,16 +511,12 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
 
         if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
        {
-            uint32x4_t x;
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
-            block = veorq_u32(block, x);
+            block = veorq_u32(block, vreinterpretq_u32_u64(
+                vld1q_dup_u64(reinterpret_cast<const uint64_t*>(xorBlocks))));
         }
 
-        word32 t[2];
-        t[0] = vgetq_lane_u32(block, 0);
-        t[1] = vgetq_lane_u32(block, 1);
-        std::memcpy(outBlocks, t, sizeof(t));
+        vst1_u8(const_cast<byte*>(outBlocks),
+            vget_low_u8(vreinterpretq_u8_u32(block)));
 
         inBlocks += inIncrement;
         outBlocks += outIncrement;
@@ -1658,7 +1653,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
 
     if (flags & BlockTransformation::BT_AllowParallel)
     {
-        // Load these magic value once. Analysis claims be1 and be2
+        // Load these magic values once. Analysis claims be1 and be2
         // may be uninitialized, but they are when the block is a ctr.
         __m128i be1, be2;
         if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -1678,7 +1673,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
                 block0 = _mm_add_epi32(be1, _mm_castpd_si128(
                     _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
 
-                // After initial increment both counters increment by 1.
+                // After the initial increment of {0,1} the remaining counters increment by {2,2}.
                 block1 = _mm_add_epi32(be2, block0);
                 block2 = _mm_add_epi32(be2, block1);
                 block3 = _mm_add_epi32(be2, block2);
@@ -1768,7 +1763,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
                 block0 = _mm_add_epi32(be1, _mm_castpd_si128(
                     _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
 
-                // After initial increment both counters increment by 1.
+                // After the initial increment of {0,1} the remaining counters increment by {2,2}.
                 block1 = _mm_add_epi32(be2, block0);
 
                 // Store the next counter.
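
Reviewer note: the sketch below is a scalar model of the counter scheme this patch introduces, not part of the patch and not the Crypto++ API. The starting value (42), the plain uint64_t counters, and the omission of byte-order handling (the 1<<24 / 2<<24 lane constants) are simplifications for illustration only. It checks that one pass of the six-vector loop covers twelve consecutive counters and writes counter+12 back as the next counter.

// Scalar sketch of the per-vector counter arithmetic in the patched
// six-vector loop. Counters are modeled as plain uint64_t values; the real
// code keeps them as big-endian bytes inside NEON/SSE registers.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <utility>

int main()
{
    const uint64_t ctr = 42;                  // hypothetical starting counter
    std::pair<uint64_t, uint64_t> block[6];   // {low half, high half} of each vector

    block[0] = { ctr + 0, ctr + 1 };          // dup-load, then add be1 = {0,1}
    for (int i = 1; i < 6; ++i)               // later vectors add be2 = {2,2}
        block[i] = { block[i-1].first + 2, block[i-1].second + 2 };

    // One pass must produce twelve consecutive counter values...
    for (int i = 0; i < 6; ++i) {
        assert(block[i].first  == ctr + 2*i);
        assert(block[i].second == ctr + 2*i + 1);
    }
    // ...and the loop stores the low half of block[5] + be2 as the next counter.
    assert(block[5].first + 2 == ctr + 12);

    std::cout << "next counter after one pass: " << (block[5].first + 2) << "\n";
    return 0;
}

The same arithmetic applies to the shorter loop that processes two vectors at a time: four counters consumed per pass and counter+4 written back.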