Re-add Simon and Speck, enable NEON and Aarch64 (GH #585)

This commit re-adds Simon and Speck. The commit includes NEON support for Aarch32 and Aarch64.
2024-11-26 19:30:21 +00:00 · 2018-02-19 04:47:19 -05:00 · 2018-02-19 04:47:19 -05:00 · e5a362c026
commit e5a362c026
parent 5da795bf56
4 changed files with 236 additions and 332 deletions
--- a/simon-simd.cpp
+++ b/simon-simd.cpp
@ -64,6 +64,24 @@ using CryptoPP::vec_swap;  // SunCC

 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)

+template <class T>
+inline T UnpackHigh32(const T& a, const T& b)  // interleaves the upper 32-bit lanes of a and b
+{
+    const uint32x2_t x(vget_high_u32((uint32x4_t)a));  // x = [a2 a3], the upper two 32-bit lanes of a
+    const uint32x2_t y(vget_high_u32((uint32x4_t)b));  // y = [b2 b3], the upper two 32-bit lanes of b
+    const uint32x2x2_t r = vzip_u32(x, y);             // r.val[0] = [a2 b2], r.val[1] = [a3 b3]
+    return (T)vcombine_u32(r.val[0], r.val[1]);        // result = [a2 b2 a3 b3], cast back to T
+}
+
+template <class T>
+inline T UnpackLow32(const T& a, const T& b)  // interleaves the lower 32-bit lanes of a and b
+{
+    const uint32x2_t x(vget_low_u32((uint32x4_t)a));  // x = [a0 a1], the lower two 32-bit lanes of a
+    const uint32x2_t y(vget_low_u32((uint32x4_t)b));  // y = [b0 b1], the lower two 32-bit lanes of b
+    const uint32x2x2_t r = vzip_u32(x, y);            // r.val[0] = [a0 b0], r.val[1] = [a1 b1]
+    return (T)vcombine_u32(r.val[0], r.val[1]);       // result = [a0 b0 a1 b1], cast back to T
+}
+
 template <unsigned int R>
 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 {
@ -114,16 +132,6 @@ inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
 }
 #endif

-inline uint32x4_t Shuffle32(const uint32x4_t& val)
-{
-#if defined(CRYPTOPP_LITTLE_ENDIAN)
-    return vreinterpretq_u32_u8(
-        vrev32q_u8(vreinterpretq_u8_u32(val)));
-#else
-    return val;
-#endif
-}
-
 inline uint32x4_t SIMON64_f(const uint32x4_t& val)
 {
    return veorq_u32(RotateLeft32<2>(val),
@ -133,15 +141,13 @@ inline uint32x4_t SIMON64_f(const uint32x4_t& val)
 inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. If only a single block is available then
    // a Zero block is provided to promote vectorizations.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
@ -160,25 +166,21 @@ inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
        std::swap(x1, y1);
    }

-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
 }

 inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. If only a single block is available then
    // a Zero block is provided to promote vectorizations.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    if (rounds & 1)
    {
@ -198,32 +200,26 @@ inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
    }

-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
 }

 inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. If only a single block is available then
    // a Zero block is provided to promote vectorizations.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
@ -248,38 +244,30 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
-
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
-    block2 = vzipq_u32(x2, y2).val[0];
-    block3 = vzipq_u32(x2, y2).val[1];
-    block4 = vzipq_u32(x3, y3).val[0];
-    block5 = vzipq_u32(x3, y3).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
+    block2 = UnpackLow32(y2, x2);
+    block3 = UnpackHigh32(y2, x2);
+    block4 = UnpackLow32(y3, x3);
+    block5 = UnpackHigh32(y3, x3);
 }

 inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. If only a single block is available then
    // a Zero block is provided to promote vectorizations.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    if (rounds & 1)
    {
@ -305,17 +293,13 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
    }

-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
-
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
-    block2 = vzipq_u32(x2, y2).val[0];
-    block3 = vzipq_u32(x2, y2).val[1];
-    block4 = vzipq_u32(x3, y3).val[0];
-    block5 = vzipq_u32(x3, y3).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
+    block2 = UnpackLow32(y2, x2);
+    block3 = UnpackHigh32(y2, x2);
+    block4 = UnpackLow32(y3, x3);
+    block5 = UnpackHigh32(y3, x3);
 }

 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
@ -388,16 +372,6 @@ inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
 }
 #endif

-inline uint64x2_t Shuffle64(const uint64x2_t& val)
-{
-#if defined(CRYPTOPP_LITTLE_ENDIAN)
-    return vreinterpretq_u64_u8(
-        vrev64q_u8(vreinterpretq_u8_u64(val)));
-#else
-    return val;
-#endif
-}
-
 inline uint64x2_t SIMON128_f(const uint64x2_t& val)
 {
    return veorq_u64(RotateLeft64<2>(val),
@ -407,14 +381,12 @@ inline uint64x2_t SIMON128_f(const uint64x2_t& val)
 inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
@ -433,30 +405,25 @@ inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
        std::swap(x1, y1);
    }

-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
 }

 inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-    uint64x2_t x2 = UnpackLow64(block2, block3);
-    uint64x2_t y2 = UnpackHigh64(block2, block3);
-    uint64x2_t x3 = UnpackLow64(block4, block5);
-    uint64x2_t y3 = UnpackHigh64(block4, block5);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
+    uint64x2_t x2 = UnpackHigh64(block2, block3);
+    uint64x2_t y2 = UnpackLow64(block2, block3);
+    uint64x2_t x3 = UnpackHigh64(block4, block5);
+    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
@ -481,29 +448,24 @@ inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
-
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
-    block2 = UnpackLow64(x2, y2);
-    block3 = UnpackHigh64(x2, y2);
-    block4 = UnpackLow64(x3, y3);
-    block5 = UnpackHigh64(x3, y3);
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
+    block2 = UnpackLow64(y2, x2);
+    block3 = UnpackHigh64(y2, x2);
+    block4 = UnpackLow64(y3, x3);
+    block5 = UnpackHigh64(y3, x3);
 }

 inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);

    if (rounds & 1)
    {
@ -523,30 +485,25 @@ inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
    }

-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
 }

 inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-    uint64x2_t x2 = UnpackLow64(block2, block3);
-    uint64x2_t y2 = UnpackHigh64(block2, block3);
-    uint64x2_t x3 = UnpackLow64(block4, block5);
-    uint64x2_t y3 = UnpackHigh64(block4, block5);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
+    uint64x2_t x2 = UnpackHigh64(block2, block3);
+    uint64x2_t y2 = UnpackLow64(block2, block3);
+    uint64x2_t x3 = UnpackHigh64(block4, block5);
+    uint64x2_t y3 = UnpackLow64(block4, block5);

    if (rounds & 1)
    {
@ -572,16 +529,13 @@ inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
    }

-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
-
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
-    block2 = UnpackLow64(x2, y2);
-    block3 = UnpackHigh64(x2, y2);
-    block4 = UnpackLow64(x3, y3);
-    block5 = UnpackHigh64(x3, y3);
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
+    block2 = UnpackLow64(y2, x2);
+    block3 = UnpackHigh64(y2, x2);
+    block4 = UnpackLow64(y3, x3);
+    block5 = UnpackHigh64(y3, x3);
 }

 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
@ -670,8 +624,8 @@ inline __m128i SIMON128_f(const __m128i& v)
 inline void GCC_NO_UBSAN SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@ -706,8 +660,8 @@ inline void GCC_NO_UBSAN SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@ -754,8 +708,8 @@ inline void GCC_NO_UBSAN SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
 inline void GCC_NO_UBSAN SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@ -791,8 +745,8 @@ inline void GCC_NO_UBSAN SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@ -881,8 +835,8 @@ inline __m128i SIMON64_f(const __m128i& v)
 inline void GCC_NO_UBSAN SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. Thanks to Peter Cordes for help with the
    // SSE permutes below.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@ -916,8 +870,8 @@ inline void GCC_NO_UBSAN SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
 inline void GCC_NO_UBSAN SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. Thanks to Peter Cordes for help with the
    // SSE permutes below.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@ -953,8 +907,8 @@ inline void GCC_NO_UBSAN SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. Thanks to Peter Cordes for help with the
    // SSE permutes below.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@ -1009,8 +963,8 @@ inline void GCC_NO_UBSAN SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. Thanks to Peter Cordes for help with the
    // SSE permutes below.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
--- a/simon.h
+++ b/simon.h
@ -17,11 +17,11 @@
 #include "seckey.h"
 #include "secblock.h"

-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
 # define CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS 1
 #endif

-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
 # define CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS 1
 #endif

--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@ -61,6 +61,24 @@ using CryptoPP::word64;

 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)

+template <class T>
+inline T UnpackHigh32(const T& a, const T& b)  // interleaves the upper 32-bit lanes of a and b
+{
+    const uint32x2_t x(vget_high_u32((uint32x4_t)a));  // x = [a2 a3], the upper two 32-bit lanes of a
+    const uint32x2_t y(vget_high_u32((uint32x4_t)b));  // y = [b2 b3], the upper two 32-bit lanes of b
+    const uint32x2x2_t r = vzip_u32(x, y);             // r.val[0] = [a2 b2], r.val[1] = [a3 b3]
+    return (T)vcombine_u32(r.val[0], r.val[1]);        // result = [a2 b2 a3 b3], cast back to T
+}
+
+template <class T>
+inline T UnpackLow32(const T& a, const T& b)  // interleaves the lower 32-bit lanes of a and b
+{
+    const uint32x2_t x(vget_low_u32((uint32x4_t)a));  // x = [a0 a1], the lower two 32-bit lanes of a
+    const uint32x2_t y(vget_low_u32((uint32x4_t)b));  // y = [b0 b1], the lower two 32-bit lanes of b
+    const uint32x2x2_t r = vzip_u32(x, y);            // r.val[0] = [a0 b0], r.val[1] = [a1 b1]
+    return (T)vcombine_u32(r.val[0], r.val[1]);       // result = [a0 b0 a1 b1], cast back to T
+}
+
 template <unsigned int R>
 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 {
@ -111,27 +129,15 @@ inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
 }
 #endif  // Aarch32 or Aarch64

-inline uint32x4_t Shuffle32(const uint32x4_t& val)
-{
-#if defined(CRYPTOPP_LITTLE_ENDIAN)
-    return vreinterpretq_u32_u8(
-        vrev32q_u8(vreinterpretq_u8_u32(val)));
-#else
-    return val;
-#endif
-}
-
 inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
@ -144,24 +150,20 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
        y1 = veorq_u32(y1, x1);
    }

-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
 }

 inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
@ -174,32 +176,26 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
        x1 = RotateLeft32<8>(x1);
    }

-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
 }

 inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. If only a single block is available then
    // a Zero block is provided to promote vectorizations.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
@ -222,38 +218,30 @@ inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
        y3 = veorq_u32(y3, x3);
    }

-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
-
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
-    block2 = vzipq_u32(x2, y2).val[0];
-    block3 = vzipq_u32(x2, y2).val[1];
-    block4 = vzipq_u32(x3, y3).val[0];
-    block5 = vzipq_u32(x3, y3).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
+    block2 = UnpackLow32(y2, x2);
+    block3 = UnpackHigh32(y2, x2);
+    block4 = UnpackLow32(y3, x3);
+    block5 = UnpackHigh32(y3, x3);
 }

 inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. If only a single block is available then
    // a Zero block is provided to promote vectorizations.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
@ -276,17 +264,13 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
        x3 = RotateLeft32<8>(x3);
    }

-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
-
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
-    block2 = vzipq_u32(x2, y2).val[0];
-    block3 = vzipq_u32(x2, y2).val[1];
-    block4 = vzipq_u32(x3, y3).val[0];
-    block5 = vzipq_u32(x3, y3).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
+    block2 = UnpackLow32(y2, x2);
+    block3 = UnpackHigh32(y2, x2);
+    block4 = UnpackLow32(y3, x3);
+    block5 = UnpackHigh32(y3, x3);
 }

 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
@ -359,27 +343,15 @@ inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
 }
 #endif

-inline uint64x2_t Shuffle64(const uint64x2_t& val)
-{
-#if defined(CRYPTOPP_LITTLE_ENDIAN)
-    return vreinterpretq_u64_u8(
-        vrev64q_u8(vreinterpretq_u8_u64(val)));
-#else
-    return val;
-#endif
-}
-
 inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
@ -392,31 +364,25 @@ inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
        y1 = veorq_u64(y1, x1);
    }

-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-
    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
 }

 inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-    uint64x2_t x2 = UnpackLow64(block2, block3);
-    uint64x2_t y2 = UnpackHigh64(block2, block3);
-    uint64x2_t x3 = UnpackLow64(block4, block5);
-    uint64x2_t y3 = UnpackHigh64(block4, block5);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
+    uint64x2_t x2 = UnpackHigh64(block2, block3);
+    uint64x2_t y2 = UnpackLow64(block2, block3);
+    uint64x2_t x3 = UnpackHigh64(block4, block5);
+    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
@ -439,30 +405,24 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
        y3 = veorq_u64(y3, x3);
    }

-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
-
    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
-    block2 = UnpackLow64(x2, y2);
-    block3 = UnpackHigh64(x2, y2);
-    block4 = UnpackLow64(x3, y3);
-    block5 = UnpackHigh64(x3, y3);
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
+    block2 = UnpackLow64(y2, x2);
+    block3 = UnpackHigh64(y2, x2);
+    block4 = UnpackLow64(y3, x3);
+    block5 = UnpackHigh64(y3, x3);
 }

 inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
@ -475,31 +435,25 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
        x1 = RotateLeft64<8>(x1);
    }

-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-
    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
 }

 inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-    uint64x2_t x2 = UnpackLow64(block2, block3);
-    uint64x2_t y2 = UnpackHigh64(block2, block3);
-    uint64x2_t x3 = UnpackLow64(block4, block5);
-    uint64x2_t y3 = UnpackHigh64(block4, block5);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
+    uint64x2_t x2 = UnpackHigh64(block2, block3);
+    uint64x2_t y2 = UnpackLow64(block2, block3);
+    uint64x2_t x3 = UnpackHigh64(block4, block5);
+    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
@ -522,17 +476,13 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
        x3 = RotateLeft64<8>(x3);
    }

-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
-
    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
-    block2 = UnpackLow64(x2, y2);
-    block3 = UnpackHigh64(x2, y2);
-    block4 = UnpackLow64(x3, y3);
-    block5 = UnpackHigh64(x3, y3);
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
+    block2 = UnpackLow64(y2, x2);
+    block3 = UnpackHigh64(y2, x2);
+    block4 = UnpackLow64(y3, x3);
+    block5 = UnpackHigh64(y3, x3);
 }

 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
@ -605,8 +555,8 @@ inline __m128i RotateRight64<8>(const __m128i& val)
 inline void GCC_NO_UBSAN SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@ -633,8 +583,8 @@ inline void GCC_NO_UBSAN SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@ -678,8 +628,8 @@ inline void GCC_NO_UBSAN SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
 inline void GCC_NO_UBSAN SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@ -706,8 +656,8 @@ inline void GCC_NO_UBSAN SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following.
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@ -785,8 +735,8 @@ inline __m128i RotateRight32<8>(const __m128i& val)
 inline void GCC_NO_UBSAN SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. Thanks to Peter Cordes for help with the
    // SSE permutes below.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@ -815,8 +765,8 @@ inline void GCC_NO_UBSAN SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
 inline void GCC_NO_UBSAN SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. Thanks to Peter Cordes for help with the
    // SSE permutes below.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@ -846,8 +796,8 @@ inline void GCC_NO_UBSAN SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. Thanks to Peter Cordes for help with the
    // SSE permutes below.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@ -901,8 +851,8 @@ inline void GCC_NO_UBSAN SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
    // be permuted to the following. Thanks to Peter Cordes for help with the
    // SSE permutes below.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
--- a/speck.h
+++ b/speck.h
@ -17,11 +17,11 @@
 #include "seckey.h"
 #include "secblock.h"

-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
 # define CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS 1
 #endif

-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
 # define CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS 1
 #endif