From f5784c1634a346ed6afbaaef51ce2c30507f61b5 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Wed, 22 Nov 2017 17:35:59 -0500
Subject: [PATCH] Update comments

---
 speck-simd.cpp | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/speck-simd.cpp b/speck-simd.cpp
index 43d081f9..ea32f6d2 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -65,7 +65,8 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
     // Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a
     // __m128i. We can't SSE over them, so we rearrange the data to allow packed operations.
     // Its also easier to permute them in SPECK128_Enc_Block rather than the calling code.
-    // SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy.
+    // SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy. The zero block below is a
+    // "don't care". It is present so we can vectorize SPECK128_Enc_Block.
     __m128i block1 = _mm_setzero_si128();
     __m128i x1 = _mm_unpacklo_epi64(block0, block1);
     __m128i y1 = _mm_unpackhi_epi64(block0, block1);
@@ -76,11 +77,12 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
 
     for (size_t i=0; static_cast<int>(i)<rounds; ++i)
     {
-        const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i)));
+        const __m128i rk = _mm_castpd_si128(
+            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
 
         x1 = RotateRight64<8>(x1);
         x1 = _mm_add_epi64(x1, y1);
-        x1 = _mm_xor_si128(x1, k1);
+        x1 = _mm_xor_si128(x1, rk);
         y1 = RotateLeft64<3>(y1);
         y1 = _mm_xor_si128(y1, x1);
     }
@@ -89,7 +91,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
     y1 = _mm_shuffle_epi8(y1, mask);
 
     block0 = _mm_unpacklo_epi64(x1, y1);
-    block1 = _mm_unpackhi_epi64(x1, y1);
+    // block1 = _mm_unpackhi_epi64(x1, y1);
 }
 
 inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
@@ -112,14 +114,15 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
 
     for (size_t i=0; static_cast<int>(i)<rounds; ++i)
     {
-        const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i)));
+        const __m128i rk = _mm_castpd_si128(
+            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
 
         x1 = RotateRight64<8>(x1);
         x2 = RotateRight64<8>(x2);
         x1 = _mm_add_epi64(x1, y1);
         x2 = _mm_add_epi64(x2, y2);
-        x1 = _mm_xor_si128(x1, k1);
-        x2 = _mm_xor_si128(x2, k1);
+        x1 = _mm_xor_si128(x1, rk);
+        x2 = _mm_xor_si128(x2, rk);
         y1 = RotateLeft64<3>(y1);
         y2 = RotateLeft64<3>(y2);
         y1 = _mm_xor_si128(y1, x1);
@@ -142,7 +145,8 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
     // Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a
     // __m128i. We can't SSE over them, so we rearrange the data to allow packed operations.
    // Its also easier to permute them in SPECK128_Dec_Block rather than the calling code.
-    // SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy.
+    // SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy. The zero block below is a
+    // "don't care". It is present so we can vectorize SPECK128_Dec_Block.
     __m128i block1 = _mm_setzero_si128();
     __m128i x1 = _mm_unpacklo_epi64(block0, block1);
     __m128i y1 = _mm_unpackhi_epi64(block0, block1);
@@ -153,11 +157,12 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
 
     for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
     {
-        const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i)));
+        const __m128i rk = _mm_castpd_si128(
+            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
 
         y1 = _mm_xor_si128(y1, x1);
         y1 = RotateRight64<3>(y1);
-        x1 = _mm_xor_si128(x1, k1);
+        x1 = _mm_xor_si128(x1, rk);
         x1 = _mm_sub_epi64(x1, y1);
         x1 = RotateLeft64<8>(x1);
     }
@@ -166,7 +171,7 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
     y1 = _mm_shuffle_epi8(y1, mask);
 
     block0 = _mm_unpacklo_epi64(x1, y1);
-    block1 = _mm_unpackhi_epi64(x1, y1);
+    // block1 = _mm_unpackhi_epi64(x1, y1);
 }
 
 inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
@@ -189,14 +194,15 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
 
     for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
     {
-        const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i)));
+        const __m128i rk = _mm_castpd_si128(
+            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
 
         y1 = _mm_xor_si128(y1, x1);
         y2 = _mm_xor_si128(y2, x2);
         y1 = RotateRight64<3>(y1);
         y2 = RotateRight64<3>(y2);
-        x1 = _mm_xor_si128(x1, k1);
-        x2 = _mm_xor_si128(x2, k1);
+        x1 = _mm_xor_si128(x1, rk);
+        x2 = _mm_xor_si128(x2, rk);
         x1 = _mm_sub_epi64(x1, y1);
         x2 = _mm_sub_epi64(x2, y2);
         x1 = RotateLeft64<8>(x1);
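
Note (not part of the patch): for readers tracing the intrinsic sequences above, the scalar sketch below shows the SPECK-128 round pair that SPECK128_Enc_Block and SPECK128_Dec_Block vectorize, using the rotate amounts and operation order visible in the diff. The helper names RotR64, RotL64, SpeckEncRound and SpeckDecRound are illustrative only and do not appear in speck-simd.cpp.

#include <cstdint>

// Rotations on 64-bit words; n is assumed to be 1..63 (only 8 and 3 are used below).
inline uint64_t RotR64(uint64_t v, unsigned n) { return (v >> n) | (v << (64 - n)); }
inline uint64_t RotL64(uint64_t v, unsigned n) { return (v << n) | (v >> (64 - n)); }

// One SPECK-128 encryption round:
//   x = (ROTR(x, 8) + y) ^ rk;  y = ROTL(y, 3) ^ x
// This mirrors RotateRight64<8>, _mm_add_epi64, _mm_xor_si128 with the
// broadcast subkey rk, then RotateLeft64<3> and _mm_xor_si128 in the patch.
inline void SpeckEncRound(uint64_t& x, uint64_t& y, uint64_t rk)
{
    x = RotR64(x, 8);
    x += y;
    x ^= rk;
    y = RotL64(y, 3);
    y ^= x;
}

// One SPECK-128 decryption round, undoing the encryption steps in reverse order:
//   y = ROTR(y ^ x, 3);  x = ROTL((x ^ rk) - y, 8)
inline void SpeckDecRound(uint64_t& x, uint64_t& y, uint64_t rk)
{
    y ^= x;
    y = RotR64(y, 3);
    x ^= rk;
    x -= y;
    x = RotL64(x, 8);
}

In the SSSE3 code each __m128i lane holds the corresponding 64-bit word of a different block, which is why the single-block functions pack the real block with a zero "don't care" block before the round loop, and why the renamed rk subkey is duplicated into both lanes with _mm_loaddup_pd.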