mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-23 01:49:41 +00:00
Remove 64-bit AdvancedProcessBlocks (GH #945)
This commit is contained in:
parent
84ab419029
commit
dd7598e638
@ -334,10 +334,8 @@ simple.cpp
|
||||
simple.h
|
||||
siphash.h
|
||||
simeck.cpp
|
||||
simeck_simd.cpp
|
||||
simeck.h
|
||||
simon.cpp
|
||||
simon64_simd.cpp
|
||||
simon128_simd.cpp
|
||||
simon.h
|
||||
skipjack.cpp
|
||||
@ -351,7 +349,6 @@ smartptr.h
|
||||
sosemanuk.cpp
|
||||
sosemanuk.h
|
||||
speck.cpp
|
||||
speck64_simd.cpp
|
||||
speck128_simd.cpp
|
||||
speck.h
|
||||
square.cpp
|
||||
|
24
GNUmakefile
24
GNUmakefile
@ -292,7 +292,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
CHAM_FLAG = $(SSSE3_FLAG)
|
||||
KECCAK_FLAG = $(SSSE3_FLAG)
|
||||
LEA_FLAG = $(SSSE3_FLAG)
|
||||
SIMECK_FLAG = $(SSSE3_FLAG)
|
||||
SIMON128_FLAG = $(SSSE3_FLAG)
|
||||
SPECK128_FLAG = $(SSSE3_FLAG)
|
||||
SUN_LDFLAGS += $(SSSE3_FLAG)
|
||||
@ -306,8 +305,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
ifeq ($(strip $(HAVE_OPT)),0)
|
||||
BLAKE2B_FLAG = $(SSE41_FLAG)
|
||||
BLAKE2S_FLAG = $(SSE41_FLAG)
|
||||
SIMON64_FLAG = $(SSE41_FLAG)
|
||||
SPECK64_FLAG = $(SSE41_FLAG)
|
||||
SUN_LDFLAGS += $(SSE41_FLAG)
|
||||
else
|
||||
SSE41_FLAG =
|
||||
@ -478,10 +475,7 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
CHAM_FLAG = -march=armv7-a -mfpu=neon
|
||||
LEA_FLAG = -march=armv7-a -mfpu=neon
|
||||
SHA_FLAG = -march=armv7-a -mfpu=neon
|
||||
SIMECK_FLAG = -march=armv7-a -mfpu=neon
|
||||
SIMON64_FLAG = -march=armv7-a -mfpu=neon
|
||||
SIMON128_FLAG = -march=armv7-a -mfpu=neon
|
||||
SPECK64_FLAG = -march=armv7-a -mfpu=neon
|
||||
SPECK128_FLAG = -march=armv7-a -mfpu=neon
|
||||
SM4_FLAG = -march=armv7-a -mfpu=neon
|
||||
else
|
||||
@ -521,10 +515,7 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
CHAM_FLAG = -march=armv8-a
|
||||
LEA_FLAG = -march=armv8-a
|
||||
NEON_FLAG = -march=armv8-a
|
||||
SIMECK_FLAG = -march=armv8-a
|
||||
SIMON64_FLAG = -march=armv8-a
|
||||
SIMON128_FLAG = -march=armv8-a
|
||||
SPECK64_FLAG = -march=armv8-a
|
||||
SPECK128_FLAG = -march=armv8-a
|
||||
SM4_FLAG = -march=armv8-a
|
||||
else
|
||||
@ -658,7 +649,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
LEA_FLAG = $(POWER8_FLAG)
|
||||
SHA_FLAG = $(POWER8_FLAG)
|
||||
SHACAL2_FLAG = $(POWER8_FLAG)
|
||||
SIMECK_FLAG = $(POWER8_FLAG)
|
||||
else
|
||||
POWER8_FLAG =
|
||||
endif
|
||||
@ -724,8 +714,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
ifneq ($(ALTIVEC_FLAG),)
|
||||
BLAKE2S_FLAG = $(ALTIVEC_FLAG)
|
||||
CHACHA_FLAG = $(ALTIVEC_FLAG)
|
||||
SIMON64_FLAG = $(ALTIVEC_FLAG)
|
||||
SPECK64_FLAG = $(ALTIVEC_FLAG)
|
||||
SPECK128_FLAG = $(ALTIVEC_FLAG)
|
||||
SIMON128_FLAG = $(ALTIVEC_FLAG)
|
||||
endif
|
||||
@ -1612,22 +1600,10 @@ sha3_simd.o : sha3_simd.cpp
|
||||
shacal2_simd.o : shacal2_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SHA_FLAG) -c) $<
|
||||
|
||||
# SSSE3 or NEON available
|
||||
simeck_simd.o : simeck_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMECK_FLAG) -c) $<
|
||||
|
||||
# SSE4.1, NEON or POWER7 available
|
||||
simon64_simd.o : simon64_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMON64_FLAG) -c) $<
|
||||
|
||||
# SSSE3, NEON or POWER8 available
|
||||
simon128_simd.o : simon128_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMON128_FLAG) -c) $<
|
||||
|
||||
# SSE4.1, NEON or POWER7 available
|
||||
speck64_simd.o : speck64_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SPECK64_FLAG) -c) $<
|
||||
|
||||
# SSSE3, NEON or POWER8 available
|
||||
speck128_simd.o : speck128_simd.cpp
|
||||
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SPECK128_FLAG) -c) $<
|
||||
|
@ -241,7 +241,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
ARIA_FLAG = $(SSSE3_FLAG)
|
||||
CHAM_FLAG = $(SSSE3_FLAG)
|
||||
LEA_FLAG = $(SSSE3_FLAG)
|
||||
SIMECK_FLAG = $(SSSE3_FLAG)
|
||||
SIMON128_FLAG = $(SSSE3_FLAG)
|
||||
SPECK128_FLAG = $(SSSE3_FLAG)
|
||||
else
|
||||
@ -254,8 +253,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
ifeq ($(strip $(HAVE_OPT)),0)
|
||||
BLAKE2B_FLAG = $(SSE41_FLAG)
|
||||
BLAKE2S_FLAG = $(SSE41_FLAG)
|
||||
SIMON64_FLAG = $(SSE41_FLAG)
|
||||
SPECK64_FLAG = $(SSE41_FLAG)
|
||||
else
|
||||
SSE41_FLAG =
|
||||
endif
|
||||
@ -400,10 +397,7 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
CHAM_FLAG = $(NEON_FLAG)
|
||||
LEA_FLAG = $(NEON_FLAG)
|
||||
SHA_FLAG = $(NEON_FLAG)
|
||||
SIMECK_FLAG = $(NEON_FLAG)
|
||||
SIMON64_FLAG = $(NEON_FLAG)
|
||||
SIMON128_FLAG = $(NEON_FLAG)
|
||||
SPECK64_FLAG = $(NEON_FLAG)
|
||||
SPECK128_FLAG = $(NEON_FLAG)
|
||||
SM4_FLAG = $(NEON_FLAG)
|
||||
else
|
||||
@ -457,10 +451,7 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
CHAM_FLAG = $(ASIMD_FLAG)
|
||||
LEA_FLAG = $(ASIMD_FLAG)
|
||||
NEON_FLAG = $(ASIMD_FLAG)
|
||||
SIMECK_FLAG = $(ASIMD_FLAG)
|
||||
SIMON64_FLAG = $(ASIMD_FLAG)
|
||||
SIMON128_FLAG = $(ASIMD_FLAG)
|
||||
SPECK64_FLAG = $(ASIMD_FLAG)
|
||||
SPECK128_FLAG = $(ASIMD_FLAG)
|
||||
SM4_FLAG = $(ASIMD_FLAG)
|
||||
else
|
||||
@ -933,22 +924,10 @@ sha512_armv4.o : sha512_armv4.S
|
||||
shacal2_simd.o : shacal2_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<
|
||||
|
||||
# SSSE3 or NEON available
|
||||
simeck_simd.o : simeck_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SIMECK_FLAG) -c) $<
|
||||
|
||||
# SSE4.1, NEON or POWER7 available
|
||||
simon64_simd.o : simon64_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SIMON64_FLAG) -c) $<
|
||||
|
||||
# SSSE3, NEON or POWER8 available
|
||||
simon128_simd.o : simon128_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SIMON128_FLAG) -c) $<
|
||||
|
||||
# SSE4.1, NEON or POWER7 available
|
||||
speck64_simd.o : speck64_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SPECK64_FLAG) -c) $<
|
||||
|
||||
# SSSE3, NEON or POWER8 available
|
||||
speck128_simd.o : speck128_simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(SPECK128_FLAG) -c) $<
|
||||
|
1116
adv_simd.h
1116
adv_simd.h
File diff suppressed because it is too large
Load Diff
34
cham.cpp
34
cham.cpp
@ -96,7 +96,7 @@ ANONYMOUS_NAMESPACE_END
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
extern size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
@ -110,11 +110,11 @@ extern size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, siz
|
||||
extern size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
#endif // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
std::string CHAM64::Base::AlgorithmProvider() const
|
||||
{
|
||||
#if (CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS)
|
||||
#if (CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS)
|
||||
# if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3())
|
||||
return "SSSE3";
|
||||
@ -336,31 +336,7 @@ void CHAM128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock,
|
||||
oblock(m_x[0])(m_x[1])(m_x[2])(m_x[3]);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
size_t CHAM64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3()) {
|
||||
return CHAM64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, 80,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t CHAM64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3()) {
|
||||
return CHAM64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, 80,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
size_t CHAM128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
@ -386,6 +362,6 @@ size_t CHAM128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
NAMESPACE_END
|
||||
|
19
cham.h
19
cham.h
@ -16,18 +16,15 @@
|
||||
#include "algparam.h"
|
||||
|
||||
#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
|
||||
# define CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS 1
|
||||
# define CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS 1
|
||||
#endif
|
||||
|
||||
// Yet another SunStudio/SunCC workaround. Failed self tests
|
||||
// in SSE code paths on i386 for SunStudio 12.3 and below.
|
||||
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
|
||||
# undef CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
# undef CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
#endif
|
||||
|
||||
// https://github.com/weidai11/cryptopp/issues/945
|
||||
#undef CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
/// \brief CHAM block cipher information
|
||||
@ -92,10 +89,6 @@ public:
|
||||
{
|
||||
public:
|
||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||
#endif
|
||||
};
|
||||
|
||||
/// \brief Decryption transformation
|
||||
@ -106,10 +99,6 @@ public:
|
||||
{
|
||||
public:
|
||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||
#endif
|
||||
};
|
||||
|
||||
/// \brief CHAM64 encryption
|
||||
@ -156,7 +145,7 @@ public:
|
||||
public:
|
||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||
#endif
|
||||
};
|
||||
@ -170,7 +159,7 @@ public:
|
||||
public:
|
||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||
|
||||
#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
|
||||
#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
|
||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||
#endif
|
||||
};
|
||||
|
608
cham_simd.cpp
608
cham_simd.cpp
@ -45,600 +45,6 @@ using CryptoPP::word32;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
NAMESPACE_BEGIN(W16) // CHAM64, 16-bit word size
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateLeft16(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateRight16(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, 16-R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i RotateLeft16<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, 8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i RotateRight16<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, 16-8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// Should not be instantiated
|
||||
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
|
||||
CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
|
||||
CRYPTOPP_UNUSED(e); CRYPTOPP_UNUSED(f);
|
||||
CRYPTOPP_UNUSED(g); CRYPTOPP_UNUSED(h);
|
||||
CRYPTOPP_ASSERT(0);
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpacklo_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpacklo_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpacklo_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpacklo_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpacklo_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpacklo_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpacklo_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpacklo_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpacklo_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpacklo_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpacklo_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpacklo_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpackhi_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpackhi_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpackhi_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpackhi_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpackhi_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpackhi_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpackhi_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpackhi_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpackhi_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
// The shuffle converts to and from little-endian for SSE. A specialized
|
||||
// CHAM implementation can avoid the shuffle by framing the data for
|
||||
// encryption, decryption and benchmarks. The library cannot take the
|
||||
// speed-up because of the byte oriented API.
|
||||
const __m128i r1 = _mm_unpackhi_epi16(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi16(c, d);
|
||||
const __m128i r3 = _mm_unpackhi_epi16(e, f);
|
||||
const __m128i r4 = _mm_unpackhi_epi16(g, h);
|
||||
|
||||
const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
|
||||
const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
|
||||
_mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i UnpackXMM(const __m128i& v)
|
||||
{
|
||||
// Should not be instantiated
|
||||
CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
|
||||
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<0>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<1>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<2>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<3>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<4>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<5>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<6>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<7>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b)
|
||||
{
|
||||
const __m128i& z = _mm_setzero_si128();
|
||||
return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
|
||||
const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
|
||||
{
|
||||
return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i RepackXMM(const __m128i& v)
|
||||
{
|
||||
return UnpackXMM<IDX>(v);
|
||||
}
|
||||
|
||||
inline void CHAM64_Enc_Block(__m128i &block0,
|
||||
const word16 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// Rearrange the data for vectorization. UnpackXMM includes a
|
||||
// little-endian swap for SSE. Thanks to Peter Cordes for help
|
||||
// with packing and unpacking.
|
||||
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
|
||||
__m128i a = UnpackXMM<0>(block0);
|
||||
__m128i b = UnpackXMM<1>(block0);
|
||||
__m128i c = UnpackXMM<2>(block0);
|
||||
__m128i d = UnpackXMM<3>(block0);
|
||||
__m128i e = UnpackXMM<4>(block0);
|
||||
__m128i f = UnpackXMM<5>(block0);
|
||||
__m128i g = UnpackXMM<6>(block0);
|
||||
__m128i h = UnpackXMM<7>(block0);
|
||||
|
||||
const unsigned int rounds = 80;
|
||||
__m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
|
||||
__m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
|
||||
|
||||
const unsigned int MASK = 15;
|
||||
for (int i=0; i<static_cast<int>(rounds); i+=4)
|
||||
{
|
||||
__m128i k, kr, t1, t2, t3, t4;
|
||||
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
|
||||
|
||||
// Shuffle out key
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
|
||||
|
||||
t1 = _mm_xor_si128(a, counter);
|
||||
t3 = _mm_xor_si128(e, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
|
||||
a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
|
||||
e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
|
||||
|
||||
t1 = _mm_xor_si128(b, counter);
|
||||
t3 = _mm_xor_si128(f, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
|
||||
b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
|
||||
f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
|
||||
|
||||
t1 = _mm_xor_si128(c, counter);
|
||||
t3 = _mm_xor_si128(g, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
|
||||
c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
|
||||
g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
|
||||
|
||||
t1 = _mm_xor_si128(d, counter);
|
||||
t3 = _mm_xor_si128(h, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
|
||||
d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
|
||||
h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
}
|
||||
|
||||
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
|
||||
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
|
||||
}
|
||||
|
||||
inline void CHAM64_Dec_Block(__m128i &block0,
|
||||
const word16 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// Rearrange the data for vectorization. UnpackXMM includes a
|
||||
// little-endian swap for SSE. Thanks to Peter Cordes for help
|
||||
// with packing and unpacking.
|
||||
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
|
||||
__m128i a = UnpackXMM<0>(block0);
|
||||
__m128i b = UnpackXMM<1>(block0);
|
||||
__m128i c = UnpackXMM<2>(block0);
|
||||
__m128i d = UnpackXMM<3>(block0);
|
||||
__m128i e = UnpackXMM<4>(block0);
|
||||
__m128i f = UnpackXMM<5>(block0);
|
||||
__m128i g = UnpackXMM<6>(block0);
|
||||
__m128i h = UnpackXMM<7>(block0);
|
||||
|
||||
const unsigned int rounds = 80;
|
||||
__m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
|
||||
__m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
|
||||
|
||||
const unsigned int MASK = 15;
|
||||
for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
|
||||
{
|
||||
__m128i k, kr, t1, t2, t3, t4;
|
||||
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
|
||||
|
||||
// Shuffle out key
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
|
||||
|
||||
// Odd round
|
||||
t1 = RotateRight16<1>(d);
|
||||
t3 = RotateRight16<1>(h);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
|
||||
d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
|
||||
|
||||
// Even round
|
||||
t1 = RotateRight16<8>(c);
|
||||
t3 = RotateRight16<8>(g);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
|
||||
c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
|
||||
|
||||
// Odd round
|
||||
t1 = RotateRight16<1>(b);
|
||||
t3 = RotateRight16<1>(f);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
|
||||
b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
|
||||
|
||||
// Even round
|
||||
t1 = RotateRight16<8>(a);
|
||||
t3 = RotateRight16<8>(e);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
|
||||
a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
}
|
||||
|
||||
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
|
||||
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
|
||||
}
|
||||
|
||||
inline void CHAM64_Enc_2_Blocks(__m128i &block0,
|
||||
__m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// Rearrange the data for vectorization. UnpackXMM includes a
|
||||
// little-endian swap for SSE. Thanks to Peter Cordes for help
|
||||
// with packing and unpacking.
|
||||
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
|
||||
__m128i a = UnpackXMM<0>(block0, block1);
|
||||
__m128i b = UnpackXMM<1>(block0, block1);
|
||||
__m128i c = UnpackXMM<2>(block0, block1);
|
||||
__m128i d = UnpackXMM<3>(block0, block1);
|
||||
__m128i e = UnpackXMM<4>(block0, block1);
|
||||
__m128i f = UnpackXMM<5>(block0, block1);
|
||||
__m128i g = UnpackXMM<6>(block0, block1);
|
||||
__m128i h = UnpackXMM<7>(block0, block1);
|
||||
|
||||
const unsigned int rounds = 80;
|
||||
__m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
|
||||
__m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
|
||||
|
||||
const unsigned int MASK = 15;
|
||||
for (int i=0; i<static_cast<int>(rounds); i+=4)
|
||||
{
|
||||
__m128i k, kr, t1, t2, t3, t4;
|
||||
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
|
||||
|
||||
// Shuffle out key
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
|
||||
|
||||
t1 = _mm_xor_si128(a, counter);
|
||||
t3 = _mm_xor_si128(e, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
|
||||
a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
|
||||
e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
|
||||
|
||||
t1 = _mm_xor_si128(b, counter);
|
||||
t3 = _mm_xor_si128(f, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
|
||||
b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
|
||||
f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
|
||||
|
||||
t1 = _mm_xor_si128(c, counter);
|
||||
t3 = _mm_xor_si128(g, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
|
||||
c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
|
||||
g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
|
||||
|
||||
t1 = _mm_xor_si128(d, counter);
|
||||
t3 = _mm_xor_si128(h, counter);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
|
||||
d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
|
||||
h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
|
||||
|
||||
counter = _mm_add_epi16(counter, increment);
|
||||
}
|
||||
|
||||
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
|
||||
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
|
||||
block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
|
||||
}
|
||||
|
||||
inline void CHAM64_Dec_2_Blocks(__m128i &block0,
|
||||
__m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// Rearrange the data for vectorization. UnpackXMM includes a
|
||||
// little-endian swap for SSE. Thanks to Peter Cordes for help
|
||||
// with packing and unpacking.
|
||||
// [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
|
||||
__m128i a = UnpackXMM<0>(block0, block1);
|
||||
__m128i b = UnpackXMM<1>(block0, block1);
|
||||
__m128i c = UnpackXMM<2>(block0, block1);
|
||||
__m128i d = UnpackXMM<3>(block0, block1);
|
||||
__m128i e = UnpackXMM<4>(block0, block1);
|
||||
__m128i f = UnpackXMM<5>(block0, block1);
|
||||
__m128i g = UnpackXMM<6>(block0, block1);
|
||||
__m128i h = UnpackXMM<7>(block0, block1);
|
||||
|
||||
const unsigned int rounds = 80;
|
||||
__m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
|
||||
__m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
|
||||
|
||||
const unsigned int MASK = 15;
|
||||
for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
|
||||
{
|
||||
__m128i k, kr, t1, t2, t3, t4;
|
||||
k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
|
||||
|
||||
// Shuffle out key
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
|
||||
|
||||
// Odd round
|
||||
t1 = RotateRight16<1>(d);
|
||||
t3 = RotateRight16<1>(h);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
|
||||
d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
|
||||
|
||||
// Even round
|
||||
t1 = RotateRight16<8>(c);
|
||||
t3 = RotateRight16<8>(g);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
|
||||
c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
|
||||
|
||||
// Odd round
|
||||
t1 = RotateRight16<1>(b);
|
||||
t3 = RotateRight16<1>(f);
|
||||
t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
|
||||
b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
|
||||
|
||||
// Even round
|
||||
t1 = RotateRight16<8>(a);
|
||||
t3 = RotateRight16<8>(e);
|
||||
t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
|
||||
t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
|
||||
a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
|
||||
e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
|
||||
|
||||
counter = _mm_sub_epi16(counter, decrement);
|
||||
}
|
||||
|
||||
// [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
|
||||
block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
|
||||
block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
|
||||
}
|
||||
|
||||
NAMESPACE_END // W16
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size
|
||||
|
||||
template <unsigned int R>
|
||||
@ -1054,20 +460,6 @@ ANONYMOUS_NAMESPACE_END
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||
size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
|
@ -78,9 +78,9 @@ LIB_SRCS = \
|
||||
rdtables.cpp rijndael.cpp rijndael_simd.cpp ripemd.cpp rng.cpp rsa.cpp \
|
||||
rw.cpp safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp \
|
||||
sha.cpp sha3.cpp sha_simd.cpp shacal2.cpp shacal2_simd.cpp shake.cpp \
|
||||
shark.cpp sharkbox.cpp simeck.cpp simeck_simd.cpp simon.cpp \
|
||||
simon128_simd.cpp simon64_simd.cpp skipjack.cpp sm3.cpp sm4.cpp \
|
||||
sm4_simd.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp speck64_simd.cpp \
|
||||
shark.cpp sharkbox.cpp simeck.cpp simon.cpp \
|
||||
simon128_simd.cpp skipjack.cpp sm3.cpp sm4.cpp \
|
||||
sm4_simd.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp \
|
||||
square.cpp squaretb.cpp sse_simd.cpp strciphr.cpp tea.cpp tftables.cpp \
|
||||
threefish.cpp tiger.cpp tigertab.cpp ttmac.cpp tweetnacl.cpp twofish.cpp \
|
||||
vmac.cpp wake.cpp whrlpool.cpp xed25519.cpp xtr.cpp xtrcrypt.cpp xts.cpp \
|
||||
@ -109,9 +109,9 @@ LIB_OBJS = \
|
||||
rdtables.obj rijndael.obj rijndael_simd.obj ripemd.obj rng.obj rsa.obj \
|
||||
rw.obj safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj \
|
||||
sha.obj sha3.obj sha_simd.obj shacal2.obj shacal2_simd.obj shake.obj \
|
||||
shark.obj sharkbox.obj simeck.obj simeck_simd.obj simon.obj \
|
||||
simon128_simd.obj simon64_simd.obj skipjack.obj sm3.obj sm4.obj \
|
||||
sm4_simd.obj sosemanuk.obj speck.obj speck128_simd.obj speck64_simd.obj \
|
||||
shark.obj sharkbox.obj simeck.obj simon.obj \
|
||||
simon128_simd.obj skipjack.obj sm3.obj sm4.obj \
|
||||
sm4_simd.obj sosemanuk.obj speck.obj speck128_simd.obj \
|
||||
square.obj squaretb.obj sse_simd.obj strciphr.obj tea.obj tftables.obj \
|
||||
threefish.obj tiger.obj tigertab.obj ttmac.obj tweetnacl.obj twofish.obj \
|
||||
vmac.obj wake.obj whrlpool.obj xed25519.obj xtr.obj xtrcrypt.obj xts.obj \
|
||||
|
@ -315,9 +315,7 @@
|
||||
<ClCompile Include="shark.cpp" />
|
||||
<ClCompile Include="sharkbox.cpp" />
|
||||
<ClCompile Include="simeck.cpp" />
|
||||
<ClCompile Include="simeck_simd.cpp" />
|
||||
<ClCompile Include="simon.cpp" />
|
||||
<ClCompile Include="simon64_simd.cpp" />
|
||||
<ClCompile Include="simon128_simd.cpp" />
|
||||
<ClCompile Include="simple.cpp" />
|
||||
<ClCompile Include="skipjack.cpp" />
|
||||
@ -326,7 +324,6 @@
|
||||
<ClCompile Include="sm4_simd.cpp" />
|
||||
<ClCompile Include="sosemanuk.cpp" />
|
||||
<ClCompile Include="speck.cpp" />
|
||||
<ClCompile Include="speck64_simd.cpp" />
|
||||
<ClCompile Include="speck128_simd.cpp" />
|
||||
<ClCompile Include="square.cpp" />
|
||||
<ClCompile Include="squaretb.cpp" />
|
||||
|
@ -425,15 +425,9 @@
|
||||
<ClCompile Include="simeck.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="simeck_simd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="simon.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="simon64_simd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="simon128_simd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
@ -455,9 +449,6 @@
|
||||
<ClCompile Include="speck.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="speck64_simd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="speck128_simd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
40
simeck.cpp
40
simeck.cpp
@ -33,16 +33,6 @@ ANONYMOUS_NAMESPACE_END
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
extern size_t SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
#endif // CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
std::string SIMECK32::Base::AlgorithmProvider() const
|
||||
{
|
||||
return "C++";
|
||||
@ -104,10 +94,6 @@ void SIMECK32::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||
|
||||
std::string SIMECK64::Base::AlgorithmProvider() const
|
||||
{
|
||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3())
|
||||
return "SSSE3";
|
||||
#endif
|
||||
return "C++";
|
||||
}
|
||||
|
||||
@ -165,30 +151,4 @@ void SIMECK64::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||
oblock(m_t[0])(m_t[1]);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
size_t SIMECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3()) {
|
||||
return SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMECK64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
# if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3()) {
|
||||
return SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
# endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
NAMESPACE_END
|
||||
|
13
simeck.h
13
simeck.h
@ -17,19 +17,6 @@
|
||||
#include "secblock.h"
|
||||
#include "algparam.h"
|
||||
|
||||
#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
|
||||
# define CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS 1
|
||||
#endif
|
||||
|
||||
// Yet another SunStudio/SunCC workaround. Failed self tests
|
||||
// in SSE code paths on i386 for SunStudio 12.3 and below.
|
||||
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
|
||||
# undef CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
#endif
|
||||
|
||||
// https://github.com/weidai11/cryptopp/issues/945
|
||||
#undef CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
/// \brief SIMECK block cipher information
|
||||
|
342
simeck_simd.cpp
342
simeck_simd.cpp
@ -1,342 +0,0 @@
|
||||
// simeck_simd.cpp - written and placed in the public domain by Gangqiang Yang and Jeffrey Walton.
|
||||
//
|
||||
// This source file uses intrinsics and built-ins to gain access to
|
||||
// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
|
||||
// source file is needed because additional CXXFLAGS are required to enable
|
||||
// the appropriate instructions sets in some build configurations.
|
||||
|
||||
#include "pch.h"
|
||||
#include "config.h"
|
||||
|
||||
#include "simeck.h"
|
||||
#include "misc.h"
|
||||
|
||||
// Uncomment for benchmarking C++ against SSE or NEON.
|
||||
// Do so in both simon.cpp and simon_simd.cpp.
|
||||
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
# include "adv_simd.h"
|
||||
# include <pmmintrin.h>
|
||||
# include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__XOP__)
|
||||
# include <ammintrin.h>
|
||||
# if defined(__GNUC__)
|
||||
# include <x86intrin.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// Squash MS LNK4221 and libtool warnings
|
||||
extern const char SIMECK_SIMD_FNAME[] = __FILE__;
|
||||
|
||||
// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
|
||||
#define M128_CAST(x) ((__m128i *)(void *)(x))
|
||||
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
|
||||
|
||||
ANONYMOUS_NAMESPACE_BEGIN
|
||||
|
||||
using CryptoPP::word16;
|
||||
using CryptoPP::word32;
|
||||
|
||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateLeft32(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateRight32(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline __m128i RotateLeft32<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline __m128i RotateRight32<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief Unpack XMM words
|
||||
/// \tparam IDX the element from each XMM word
|
||||
/// \param a the first XMM word
|
||||
/// \param b the second XMM word
|
||||
/// \param c the third XMM word
|
||||
/// \param d the fourth XMM word
|
||||
/// \details UnpackXMM selects the IDX element from a, b, c, d and returns a concatenation
|
||||
/// equivalent to <tt>a[IDX] || b[IDX] || c[IDX] || d[IDX]</tt>.
|
||||
template <unsigned int IDX>
|
||||
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
// Should not be instantiated
|
||||
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
|
||||
CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
|
||||
CRYPTOPP_ASSERT(0);
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
const __m128i r1 = _mm_unpacklo_epi32(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi32(c, d);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
|
||||
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
const __m128i r1 = _mm_unpacklo_epi32(a, b);
|
||||
const __m128i r2 = _mm_unpacklo_epi32(c, d);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
|
||||
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
const __m128i r1 = _mm_unpackhi_epi32(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi32(c, d);
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
|
||||
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
const __m128i r1 = _mm_unpackhi_epi32(a, b);
|
||||
const __m128i r2 = _mm_unpackhi_epi32(c, d);
|
||||
return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
|
||||
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
|
||||
}
|
||||
|
||||
/// \brief Unpack a XMM word
|
||||
/// \tparam IDX the element from each XMM word
|
||||
/// \param v the first XMM word
|
||||
/// \details UnpackXMM selects the IDX element from v and returns a concatenation
|
||||
/// equivalent to <tt>v[IDX] || v[IDX] || v[IDX] || v[IDX]</tt>.
|
||||
template <unsigned int IDX>
|
||||
inline __m128i UnpackXMM(const __m128i& v)
|
||||
{
|
||||
// Should not be instantiated
|
||||
CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<0>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<1>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<2>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i UnpackXMM<3>(const __m128i& v)
|
||||
{
|
||||
return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
|
||||
{
|
||||
return UnpackXMM<IDX>(a, b, c, d);
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline __m128i RepackXMM(const __m128i& v)
|
||||
{
|
||||
return UnpackXMM<IDX>(v);
|
||||
}
|
||||
|
||||
inline void SIMECK64_Encrypt(__m128i &a, __m128i &b, __m128i &c, __m128i &d, const __m128i key)
|
||||
{
|
||||
// SunStudio 12.3 workaround
|
||||
__m128i s, t; s = a; t = c;
|
||||
a = _mm_xor_si128(_mm_and_si128(a, RotateLeft32<5>(a)), RotateLeft32<1>(a));
|
||||
c = _mm_xor_si128(_mm_and_si128(c, RotateLeft32<5>(c)), RotateLeft32<1>(c));
|
||||
a = _mm_xor_si128(a, _mm_xor_si128(b, key));
|
||||
c = _mm_xor_si128(c, _mm_xor_si128(d, key));
|
||||
b = s; d = t;
|
||||
}
|
||||
|
||||
inline void SIMECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
|
||||
__m128i a = UnpackXMM<0>(block0);
|
||||
__m128i b = UnpackXMM<1>(block0);
|
||||
__m128i c = UnpackXMM<2>(block0);
|
||||
__m128i d = UnpackXMM<3>(block0);
|
||||
|
||||
const unsigned int rounds = 44;
|
||||
for (int i = 0; i < static_cast<int>(rounds); i += 4)
|
||||
{
|
||||
const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
}
|
||||
|
||||
// [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
|
||||
block0 = RepackXMM<0>(a,b,c,d);
|
||||
}
|
||||
|
||||
inline void SIMECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// SIMECK requires a word swap for the decryption transform
|
||||
__m128i w = _mm_shuffle_epi32(block0, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
|
||||
__m128i a = UnpackXMM<0>(w);
|
||||
__m128i b = UnpackXMM<1>(w);
|
||||
__m128i c = UnpackXMM<2>(w);
|
||||
__m128i d = UnpackXMM<3>(w);
|
||||
|
||||
const unsigned int rounds = 44;
|
||||
for (int i = static_cast<int>(rounds)-1; i >= 0; i -= 4)
|
||||
{
|
||||
const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i - 3));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
|
||||
}
|
||||
|
||||
// [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
|
||||
w = RepackXMM<0>(a,b,c,d);
|
||||
|
||||
block0 = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
}
|
||||
|
||||
inline void SIMECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
|
||||
__m128i a = UnpackXMM<0>(block0, block1, block2, block3);
|
||||
__m128i b = UnpackXMM<1>(block0, block1, block2, block3);
|
||||
__m128i c = UnpackXMM<2>(block0, block1, block2, block3);
|
||||
__m128i d = UnpackXMM<3>(block0, block1, block2, block3);
|
||||
|
||||
const unsigned int rounds = 44;
|
||||
for (int i = 0; i < static_cast<int>(rounds); i += 4)
|
||||
{
|
||||
const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
}
|
||||
|
||||
// [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
|
||||
block0 = RepackXMM<0>(a, b, c, d);
|
||||
block1 = RepackXMM<1>(a, b, c, d);
|
||||
block2 = RepackXMM<2>(a, b, c, d);
|
||||
block3 = RepackXMM<3>(a, b, c, d);
|
||||
}

inline void SIMECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
{
    // SIMECK requires a word swap for the decryption transform
    __m128i w = _mm_shuffle_epi32(block0, _MM_SHUFFLE(2, 3, 0, 1));
    __m128i x = _mm_shuffle_epi32(block1, _MM_SHUFFLE(2, 3, 0, 1));
    __m128i y = _mm_shuffle_epi32(block2, _MM_SHUFFLE(2, 3, 0, 1));
    __m128i z = _mm_shuffle_epi32(block3, _MM_SHUFFLE(2, 3, 0, 1));

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(w, x, y, z);
    __m128i b = UnpackXMM<1>(w, x, y, z);
    __m128i c = UnpackXMM<2>(w, x, y, z);
    __m128i d = UnpackXMM<3>(w, x, y, z);

    const unsigned int rounds = 44;
    for (int i = static_cast<int>(rounds)-1; i >= 0; i -= 4)
    {
        const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i - 3));
        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    w = RepackXMM<0>(a, b, c, d);
    x = RepackXMM<1>(a, b, c, d);
    y = RepackXMM<2>(a, b, c, d);
    z = RepackXMM<3>(a, b, c, d);

    block0 = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
    block1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
    block2 = _mm_shuffle_epi32(y, _MM_SHUFFLE(2, 3, 0, 1));
    block3 = _mm_shuffle_epi32(z, _MM_SHUFFLE(2, 3, 0, 1));
}

#endif // CRYPTOPP_SSSE3_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_4x1_SSE(SIMECK64_Enc_Block, SIMECK64_Enc_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_4x1_SSE(SIMECK64_Dec_Block, SIMECK64_Dec_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

NAMESPACE_END
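
The two SSSE3 entry points above are reached through the cipher's AdvancedProcessBlocks override; the simon.cpp and speck.cpp hunks later in this commit show the same dispatch pattern for their SSE4.1/NEON/Altivec kernels. A minimal sketch of that pattern for SIMECK64 (the guard macro and the m_rkeys/m_rounds members follow the SIMON/SPECK code in this commit and are assumptions here, not a copy of simeck.cpp):

#if (CRYPTOPP_SIMECK64_ADVANCED_PROCESS_BLOCKS)  // guard macro name illustrative
size_t SIMECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
    byte *outBlocks, size_t length, word32 flags) const
{
#if (CRYPTOPP_SSSE3_AVAILABLE)
    // Hand whole groups of blocks to the SIMD kernel when the CPU supports it.
    if (HasSSSE3())
        return SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
            inBlocks, xorBlocks, outBlocks, length, flags);
#endif
    // Otherwise fall back to the generic block-at-a-time implementation.
    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif
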

simon.cpp
@ -196,14 +196,6 @@ ANONYMOUS_NAMESPACE_END
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
extern size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
extern size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
@ -212,14 +204,6 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, siz
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
extern size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
extern size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
@ -228,14 +212,6 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
extern size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
extern size_t SIMON128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
@ -246,39 +222,11 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys,
|
||||
|
||||
std::string SIMON64::Base::AlgorithmProvider() const
|
||||
{
|
||||
#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
|
||||
# if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return "SSE4.1";
|
||||
# endif
|
||||
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return "NEON";
|
||||
# endif
|
||||
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return "Altivec";
|
||||
# endif
|
||||
#endif
|
||||
return "C++";
|
||||
}
|
||||
|
||||
unsigned int SIMON64::Base::OptimalDataAlignment() const
|
||||
{
|
||||
#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
|
||||
# if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return 16; // load __m128i
|
||||
# endif
|
||||
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return 4; // load uint32x4_t
|
||||
# endif
|
||||
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return 16; // load uint32x4_p
|
||||
# endif
|
||||
#endif
|
||||
return GetAlignmentOf<word32>();
|
||||
}
|
||||
|
||||
@ -311,29 +259,6 @@ void SIMON64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
|
||||
default:
|
||||
CRYPTOPP_ASSERT(0);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
// Pre-splat the round keys for Altivec forward transformation
|
||||
#if CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
if (IsForwardTransformation() && HasAltivec())
|
||||
{
|
||||
AlignedSecBlock presplat(m_rkeys.size()*4);
|
||||
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
|
||||
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
|
||||
m_rkeys.swap(presplat);
|
||||
}
|
||||
#elif CRYPTOPP_SSE41_AVAILABLE
|
||||
if (IsForwardTransformation() && HasSSE41())
|
||||
{
|
||||
AlignedSecBlock presplat(m_rkeys.size()*4);
|
||||
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
|
||||
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
|
||||
m_rkeys.swap(presplat);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
|
||||
}
|
||||
|
||||
void SIMON64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||
@ -478,7 +403,7 @@ void SIMON128::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
|
||||
#endif // CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
|
||||
}
|
||||
|
||||
void SIMON128::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||
@ -533,50 +458,6 @@ void SIMON128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
|
||||
size_t SIMON64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return SIMON64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return SIMON64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMON64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return SIMON64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return SIMON64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
#if (CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS)
|
||||
size_t SIMON128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
|

simon.h
@ -17,14 +17,6 @@
#include "seckey.h"
#include "secblock.h"

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
    CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
    CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
# ifndef CRYPTOPP_DISABLE_SIMON_SIMD
#  define CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS 1
# endif
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
    CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
    CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
@ -36,13 +28,9 @@
// Yet another SunStudio/SunCC workaround. Failed self tests
// in SSE code paths on i386 for SunStudio 12.3 and below.
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
# undef CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
# undef CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
#endif

// https://github.com/weidai11/cryptopp/issues/945
#undef CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS

NAMESPACE_BEGIN(CryptoPP)

/// \brief SIMON block cipher information
@ -129,9 +117,6 @@ public:
    {
    public:
        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
    };

    /// \brief SIMON64 decryption transformation
@ -142,9 +127,6 @@ public:
    {
    public:
        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
    };

    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;

simon64_simd.cpp
@ -1,864 +0,0 @@
|
||||
// simon_simd.cpp - written and placed in the public domain by Jeffrey Walton
|
||||
//
|
||||
// This source file uses intrinsics and built-ins to gain access to
|
||||
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
|
||||
// source file is needed because additional CXXFLAGS are required to enable
|
||||
// the appropriate instructions sets in some build configurations.
|
||||
|
||||
#include "pch.h"
|
||||
#include "config.h"
|
||||
|
||||
#include "simon.h"
|
||||
#include "misc.h"
|
||||
|
||||
// Uncomment for benchmarking C++ against SSE or NEON.
|
||||
// Do so in both simon.cpp and simon_simd.cpp.
|
||||
// #undef CRYPTOPP_SSE41_AVAILABLE
|
||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
# include "adv_simd.h"
|
||||
# include <pmmintrin.h>
|
||||
# include <tmmintrin.h>
|
||||
# include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__XOP__)
|
||||
# include <ammintrin.h>
|
||||
# if defined(__GNUC__)
|
||||
# include <x86intrin.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_HEADER)
|
||||
# include "adv_simd.h"
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_ACLE_HEADER)
|
||||
# include <stdint.h>
|
||||
# include <arm_acle.h>
|
||||
#endif
|
||||
|
||||
#if defined(_M_ARM64)
|
||||
# include "adv_simd.h"
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
# include "adv_simd.h"
|
||||
# include "ppc_simd.h"
|
||||
#endif
|
||||
|
||||
// Squash MS LNK4221 and libtool warnings
|
||||
extern const char SIMON64_SIMD_FNAME[] = __FILE__;
|
||||
|
||||
ANONYMOUS_NAMESPACE_BEGIN
|
||||
|
||||
using CryptoPP::byte;
|
||||
using CryptoPP::word32;
|
||||
using CryptoPP::word64;
|
||||
using CryptoPP::vec_swap; // SunCC
|
||||
|
||||
// *************************** ARM NEON ************************** //
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
|
||||
template <class T>
|
||||
inline T UnpackHigh32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_high_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_high_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline T UnpackLow32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_low_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_low_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
|
||||
{
|
||||
const uint32x4_t a(vshlq_n_u32(val, R));
|
||||
const uint32x4_t b(vshrq_n_u32(val, 32 - R));
|
||||
return vorrq_u32(a, b);
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline uint32x4_t RotateRight32(const uint32x4_t& val)
|
||||
{
|
||||
const uint32x4_t a(vshlq_n_u32(val, 32 - R));
|
||||
const uint32x4_t b(vshrq_n_u32(val, R));
|
||||
return vorrq_u32(a, b);
|
||||
}
|
||||
|
||||
#if defined(__aarch32__) || defined(__aarch64__)
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
|
||||
{
|
||||
const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
|
||||
const uint8x16_t mask = vld1q_u8(maskb);
|
||||
|
||||
return vreinterpretq_u32_u8(
|
||||
vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
|
||||
{
|
||||
const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
|
||||
const uint8x16_t mask = vld1q_u8(maskb);
|
||||
|
||||
return vreinterpretq_u32_u8(
|
||||
vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
|
||||
}
|
||||
#endif
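
The two specializations above replace a shift-and-or rotate with a single byte-table lookup. A standalone scalar check of the byte mapping the masks encode, for one little-endian 32-bit lane (illustrative only, not library code):

#include <cstdint>
#include <cstdio>
#include <cstring>

static uint32_t rotl8(uint32_t x) { return (x << 8) | (x >> 24); }

int main()
{
    // One 32-bit lane stored little-endian as bytes b0..b3 (value 0x44332211).
    const uint8_t lane[4] = { 0x11, 0x22, 0x33, 0x44 };
    const uint8_t mask[4] = { 3, 0, 1, 2 };   // per-lane slice of maskb above

    // vqtbl1q_u8 picks out[i] = lane[mask[i]] for every byte.
    uint8_t out[4];
    for (int i = 0; i < 4; ++i)
        out[i] = lane[mask[i]];

    uint32_t shuffled = 0;
    std::memcpy(&shuffled, out, 4);

    // Both values are 0x33221144 on a little-endian machine.
    std::printf("%08x %08x\n", shuffled, rotl8(0x44332211u));
    return 0;
}
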
|
||||
|
||||
inline uint32x4_t SIMON64_f(const uint32x4_t& val)
|
||||
{
|
||||
return veorq_u32(RotateLeft32<2>(val),
|
||||
vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
|
||||
|
||||
const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
|
||||
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);
|
||||
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
|
||||
std::swap(x1, y1);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
std::swap(x1, y1);
|
||||
const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
|
||||
|
||||
y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
|
||||
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
|
||||
|
||||
const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
|
||||
{
|
||||
const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
|
||||
y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
|
||||
y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);
|
||||
|
||||
const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
|
||||
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
|
||||
x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
|
||||
x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
|
||||
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
|
||||
y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
|
||||
y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
block2 = UnpackLow32(y2, x2);
|
||||
block3 = UnpackHigh32(y2, x2);
|
||||
block4 = UnpackLow32(y3, x3);
|
||||
block5 = UnpackHigh32(y3, x3);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
|
||||
uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
|
||||
uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
|
||||
uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
|
||||
uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
|
||||
uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
|
||||
|
||||
y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
|
||||
y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
|
||||
y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
|
||||
x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
|
||||
x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
|
||||
x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);
|
||||
|
||||
const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
|
||||
y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
|
||||
y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
|
||||
y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = UnpackLow32(y1, x1);
|
||||
block1 = UnpackHigh32(y1, x1);
|
||||
block2 = UnpackLow32(y2, x2);
|
||||
block3 = UnpackHigh32(y2, x2);
|
||||
block4 = UnpackLow32(y3, x3);
|
||||
block5 = UnpackHigh32(y3, x3);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
// ***************************** IA-32 ***************************** //
|
||||
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
|
||||
// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
|
||||
#ifndef M128_CAST
|
||||
# define M128_CAST(x) ((__m128i *)(void *)(x))
|
||||
#endif
|
||||
#ifndef CONST_M128_CAST
|
||||
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
|
||||
#endif
|
||||
|
||||
inline void Swap128(__m128i& a,__m128i& b)
|
||||
{
|
||||
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
|
||||
// __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
|
||||
// SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
|
||||
vec_swap(a, b);
|
||||
#else
|
||||
std::swap(a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateLeft32(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateRight32(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
__m128i RotateLeft32<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
__m128i RotateRight32<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __m128i SIMON64_f(const __m128i& v)
|
||||
{
|
||||
return _mm_xor_si128(RotateLeft32<2>(v),
|
||||
_mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
const __m128 t0 = _mm_castsi128_ps(block0);
|
||||
const __m128 t1 = _mm_castsi128_ps(block1);
|
||||
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
|
||||
|
||||
const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*4));
|
||||
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*4));
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
|
||||
Swap128(x1, y1);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = _mm_unpacklo_epi32(y1, x1);
|
||||
block1 = _mm_unpackhi_epi32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
const __m128 t0 = _mm_castsi128_ps(block0);
|
||||
const __m128 t1 = _mm_castsi128_ps(block1);
|
||||
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
Swap128(x1, y1);
|
||||
const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
|
||||
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
|
||||
|
||||
const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = _mm_unpacklo_epi32(y1, x1);
|
||||
block1 = _mm_unpackhi_epi32(y1, x1);
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
const __m128 t0 = _mm_castsi128_ps(block0);
|
||||
const __m128 t1 = _mm_castsi128_ps(block1);
|
||||
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
const __m128 t2 = _mm_castsi128_ps(block2);
|
||||
const __m128 t3 = _mm_castsi128_ps(block3);
|
||||
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
const __m128 t4 = _mm_castsi128_ps(block4);
|
||||
const __m128 t5 = _mm_castsi128_ps(block5);
|
||||
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
|
||||
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
|
||||
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);
|
||||
|
||||
const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*4));
|
||||
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
|
||||
x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
|
||||
x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*4));
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
|
||||
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
|
||||
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
|
||||
Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = _mm_unpacklo_epi32(y1, x1);
|
||||
block1 = _mm_unpackhi_epi32(y1, x1);
|
||||
block2 = _mm_unpacklo_epi32(y2, x2);
|
||||
block3 = _mm_unpackhi_epi32(y2, x2);
|
||||
block4 = _mm_unpacklo_epi32(y3, x3);
|
||||
block5 = _mm_unpackhi_epi32(y3, x3);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
|
||||
__m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
const __m128 t0 = _mm_castsi128_ps(block0);
|
||||
const __m128 t1 = _mm_castsi128_ps(block1);
|
||||
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
const __m128 t2 = _mm_castsi128_ps(block2);
|
||||
const __m128 t3 = _mm_castsi128_ps(block3);
|
||||
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
const __m128 t4 = _mm_castsi128_ps(block4);
|
||||
const __m128 t5 = _mm_castsi128_ps(block5);
|
||||
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
|
||||
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
|
||||
const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
|
||||
y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
|
||||
y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
|
||||
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
|
||||
x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
|
||||
x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);
|
||||
|
||||
const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
|
||||
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
|
||||
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
|
||||
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
|
||||
}
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = _mm_unpacklo_epi32(y1, x1);
|
||||
block1 = _mm_unpackhi_epi32(y1, x1);
|
||||
block2 = _mm_unpacklo_epi32(y2, x2);
|
||||
block3 = _mm_unpackhi_epi32(y2, x2);
|
||||
block4 = _mm_unpacklo_epi32(y3, x3);
|
||||
block5 = _mm_unpackhi_epi32(y3, x3);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_SSE41_AVAILABLE
|
||||
|
||||
// ***************************** Altivec ***************************** //
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
|
||||
using CryptoPP::uint8x16_p;
|
||||
using CryptoPP::uint32x4_p;
|
||||
|
||||
using CryptoPP::VecAnd;
|
||||
using CryptoPP::VecXor;
|
||||
using CryptoPP::VecLoad;
|
||||
using CryptoPP::VecLoadAligned;
|
||||
using CryptoPP::VecPermute;
|
||||
|
||||
// Rotate left by bit count
|
||||
template<unsigned int C>
|
||||
inline uint32x4_p RotateLeft32(const uint32x4_p val)
|
||||
{
|
||||
const uint32x4_p m = {C, C, C, C};
|
||||
return vec_rl(val, m);
|
||||
}
|
||||
|
||||
// Rotate right by bit count
|
||||
template<unsigned int C>
|
||||
inline uint32x4_p RotateRight32(const uint32x4_p val)
|
||||
{
|
||||
const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
|
||||
return vec_rl(val, m);
|
||||
}
|
||||
|
||||
inline uint32x4_p SIMON64_f(const uint32x4_p val)
|
||||
{
|
||||
return VecXor(RotateLeft32<2>(val),
|
||||
VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
#else
|
||||
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_p x1 = VecPermute(block0, block1, m1);
|
||||
uint32x4_p y1 = VecPermute(block0, block1, m2);
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const uint32x4_p rk1 = VecLoadAligned(subkeys+i*4);
|
||||
const uint32x4_p rk2 = VecLoadAligned(subkeys+(i+1)*4);
|
||||
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
|
||||
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const uint32x4_p rk = VecLoadAligned(subkeys+(rounds-1)*4);
|
||||
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
|
||||
std::swap(x1, y1);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
|
||||
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
|
||||
#else
|
||||
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
|
||||
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
#else
|
||||
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||
uint32x4_p x1 = VecPermute(block0, block1, m1);
|
||||
uint32x4_p y1 = VecPermute(block0, block1, m2);
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
std::swap(x1, y1);
|
||||
#if defined(_ARCH_PWR7)
|
||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk = VecLoad(subkeys+rounds-1);
|
||||
rk = VecPermute(rk, rk, m);
|
||||
#endif
|
||||
y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
#if defined(_ARCH_PWR7)
|
||||
const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
|
||||
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk1 = VecLoad(subkeys+i+1);
|
||||
uint32x4_p rk2 = VecLoad(subkeys+i);
|
||||
rk1 = VecPermute(rk1, rk1, m);
|
||||
rk2 = VecPermute(rk2, rk2, m);
|
||||
#endif
|
||||
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
|
||||
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
|
||||
#else
|
||||
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
|
||||
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||
}
|
||||
|
||||
inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
|
||||
uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
#else
|
||||
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
|
||||
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
|
||||
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
|
||||
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
|
||||
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
|
||||
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
|
||||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const uint32x4_p rk1 = VecLoadAligned(subkeys+i*4);
|
||||
const uint32x4_p rk2 = VecLoadAligned(subkeys+(i+1)*4);
|
||||
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
|
||||
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
|
||||
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);
|
||||
|
||||
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
|
||||
x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
|
||||
x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
// Round keys are pre-splated in forward direction
|
||||
const uint32x4_p rk = VecLoadAligned(subkeys+(rounds-1)*4);
|
||||
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
|
||||
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
|
||||
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
|
||||
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
|
||||
#else
|
||||
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
|
||||
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
|
||||
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
|
||||
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
|
||||
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
|
||||
}
|
||||
|
||||
inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||
uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
|
||||
uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
#else
|
||||
const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
|
||||
const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
|
||||
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
|
||||
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
|
||||
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
|
||||
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
|
||||
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
#if defined(_ARCH_PWR7)
|
||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk = VecLoad(subkeys+rounds-1);
|
||||
rk = VecPermute(rk, rk, m);
|
||||
#endif
|
||||
y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
|
||||
y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
|
||||
y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
#if defined(_ARCH_PWR7)
|
||||
const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
|
||||
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk1 = VecLoad(subkeys+i+1);
|
||||
uint32x4_p rk2 = VecLoad(subkeys+i);
|
||||
rk1 = VecPermute(rk1, rk1, m);
|
||||
rk2 = VecPermute(rk2, rk2, m);
|
||||
#endif
|
||||
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
|
||||
x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
|
||||
x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);
|
||||
|
||||
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
|
||||
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
|
||||
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
|
||||
const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
|
||||
#else
|
||||
const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
|
||||
const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
|
||||
#endif
|
||||
|
||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
|
||||
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
|
||||
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
|
||||
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
|
||||
ANONYMOUS_NAMESPACE_END
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
// *************************** ARM NEON **************************** //
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
// ***************************** IA-32 ***************************** //
|
||||
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif
|
||||
|
||||
// ***************************** Altivec ***************************** //
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif
|
||||
|
||||
NAMESPACE_END
|

speck.cpp
@ -171,12 +171,6 @@ ANONYMOUS_NAMESPACE_END
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
extern size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
@ -200,14 +194,6 @@ extern size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
extern size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
extern size_t SPECK128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
@ -218,39 +204,11 @@ extern size_t SPECK128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys,
|
||||
|
||||
std::string SPECK64::Base::AlgorithmProvider() const
|
||||
{
|
||||
#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
|
||||
# if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return "SSE4.1";
|
||||
# endif
|
||||
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return "NEON";
|
||||
# endif
|
||||
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return "Altivec";
|
||||
# endif
|
||||
#endif
|
||||
return "C++";
|
||||
}
|
||||
|
||||
unsigned int SPECK64::Base::OptimalDataAlignment() const
|
||||
{
|
||||
#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
|
||||
# if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return 16; // load __m128i
|
||||
# endif
|
||||
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return 4; // load uint32x4_t
|
||||
# endif
|
||||
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return 16; // load uint32x4_p
|
||||
# endif
|
||||
#endif
|
||||
return GetAlignmentOf<word32>();
|
||||
}
|
||||
|
||||
@ -283,29 +241,6 @@ void SPECK64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
|
||||
default:
|
||||
CRYPTOPP_ASSERT(0);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
// Pre-splat the round keys for Altivec forward transformation
|
||||
#if CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
if (IsForwardTransformation() && HasAltivec())
|
||||
{
|
||||
AlignedSecBlock presplat(m_rkeys.size()*4);
|
||||
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
|
||||
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
|
||||
m_rkeys.swap(presplat);
|
||||
}
|
||||
#elif CRYPTOPP_SSE41_AVAILABLE
|
||||
if (IsForwardTransformation() && HasSSE41())
|
||||
{
|
||||
AlignedSecBlock presplat(m_rkeys.size()*4);
|
||||
for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
|
||||
presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
|
||||
m_rkeys.swap(presplat);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
|
||||
}
|
||||
|
||||
void SPECK64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||
@ -505,50 +440,6 @@ void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
|
||||
size_t SPECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return SPECK64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return SPECK64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SPECK64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
if (HasSSE41())
|
||||
return SPECK64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return SPECK64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
|
||||
|
||||
#if (CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS)
|
||||
size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
|

speck.h
@ -17,14 +17,6 @@
#include "seckey.h"
#include "secblock.h"

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
    CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
    CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
# ifndef CRYPTOPP_DISABLE_SPECK_SIMD
#  define CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS 1
# endif
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || \
    CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8 || \
    CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
@ -36,13 +28,9 @@
// Yet another SunStudio/SunCC workaround. Failed self tests
// in SSE code paths on i386 for SunStudio 12.3 and below.
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
# undef CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
# undef CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS
#endif

// https://github.com/weidai11/cryptopp/issues/945
#undef CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS

NAMESPACE_BEGIN(CryptoPP)

/// \brief SPECK block cipher information
@ -129,9 +117,6 @@ public:
    {
    public:
        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
    };

    /// \brief SPECK64 decryption transformation
@ -142,9 +127,6 @@ public:
    {
    public:
        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
    };

    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
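
Although this commit removes the SPECK64 SIMD path, the interface declared in speck.h is unchanged, so existing callers keep invoking AdvancedProcessBlocks exactly as before and now land in the generic BlockTransformation fallback. A minimal usage sketch (standard Crypto++ API; key and data values are arbitrary placeholders):

#include "speck.h"
#include <vector>

int main()
{
    using namespace CryptoPP;

    // SPECK64/128: 8-byte block, 16-byte key (all-zero values used only for illustration).
    const byte key[16] = {0};
    SPECK64::Encryption enc;
    enc.SetKey(key, sizeof(key));

    // Four consecutive 8-byte blocks processed in one call. With the SIMD path
    // removed, this resolves to the block-at-a-time loop in BlockTransformation.
    std::vector<byte> in(4 * SPECK64::BLOCKSIZE, 0), out(in.size());
    enc.AdvancedProcessBlocks(in.data(), NULLPTR, out.data(), in.size(), 0);
    return 0;
}
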

speck64_simd.cpp
@ -1,781 +0,0 @@
|
||||
// speck64_simd.cpp - written and placed in the public domain by Jeffrey Walton
|
||||
//
|
||||
// This source file uses intrinsics and built-ins to gain access to
|
||||
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
|
||||
// source file is needed because additional CXXFLAGS are required to enable
|
||||
// the appropriate instructions sets in some build configurations.
|
||||
|
||||
#include "pch.h"
|
||||
#include "config.h"
|
||||
|
||||
#include "speck.h"
|
||||
#include "misc.h"
|
||||
|
||||
// Uncomment for benchmarking C++ against SSE or NEON.
|
||||
// Do so in both speck.cpp and speck_simd.cpp.
|
||||
// #undef CRYPTOPP_SSE41_AVAILABLE
|
||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_SSE41_AVAILABLE)
|
||||
# include "adv_simd.h"
|
||||
# include <pmmintrin.h>
|
||||
# include <tmmintrin.h>
|
||||
# include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__XOP__)
|
||||
# include <ammintrin.h>
|
||||
# if defined(__GNUC__)
|
||||
# include <x86intrin.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_HEADER)
|
||||
# include "adv_simd.h"
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_ACLE_HEADER)
|
||||
# include <stdint.h>
|
||||
# include <arm_acle.h>
|
||||
#endif
|
||||
|
||||
#if defined(_M_ARM64)
|
||||
# include "adv_simd.h"
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
# include "adv_simd.h"
|
||||
# include "ppc_simd.h"
|
||||
#endif
|
||||
|
||||
// Squash MS LNK4221 and libtool warnings
|
||||
extern const char SPECK64_SIMD_FNAME[] = __FILE__;
|
||||
|
||||
ANONYMOUS_NAMESPACE_BEGIN
|
||||
|
||||
using CryptoPP::byte;
|
||||
using CryptoPP::word32;
|
||||
using CryptoPP::word64;
|
||||
|
||||
// *************************** ARM NEON ************************** //
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
|
||||
template <class T>
|
||||
inline T UnpackHigh32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_high_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_high_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline T UnpackLow32(const T& a, const T& b)
|
||||
{
|
||||
const uint32x2_t x(vget_low_u32((uint32x4_t)a));
|
||||
const uint32x2_t y(vget_low_u32((uint32x4_t)b));
|
||||
const uint32x2x2_t r = vzip_u32(x, y);
|
||||
return (T)vcombine_u32(r.val[0], r.val[1]);
|
||||
}
|
||||
|
||||
template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}

template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
    const uint32x4_t b(vshrq_n_u32(val, R));
    return vorrq_u32(a, b);
}

#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif // Aarch32 or Aarch64

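// Note: a rotation by 8 bits moves whole bytes, so it can be expressed as a
// byte permutation. The specializations above therefore use a single table
// lookup (vqtbl1q_u8, available on Aarch32/Aarch64) instead of the
// shift/shift/or sequence of the generic templates.
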
inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x1 = vaddq_u32(x1, y1);
        x1 = veorq_u32(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = veorq_u32(y1, x1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

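// For reference, the loop above is the usual SPECK64 encryption round on
// scalar 32-bit words, applied to several blocks at once (x holds the high
// words, y the low words):
//
//   x = (ROR32(x, 8) + y) ^ k;
//   y = ROL32(y, 3) ^ x;
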
inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = veorq_u32(x1, rk);
        x1 = vsubq_u32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

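// Decryption undoes one encryption round by applying the inverse operations
// in reverse order, consuming the subkeys from last to first:
//
//   y = ROR32(y ^ x, 3);
//   x = ROL32((x ^ k) - y, 8);
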
inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = vaddq_u32(x1, y1);
        x2 = vaddq_u32(x2, y2);
        x3 = vaddq_u32(x3, y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

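// The 6-block variants run the same round function over three independent
// (x, y) register pairs. Interleaving the three instruction streams this way
// helps hide instruction latency on wide out-of-order cores.
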
inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        x1 = vsubq_u32(x1, y1);
        x2 = vsubq_u32(x2, y2);
        x3 = vsubq_u32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if (CRYPTOPP_SSE41_AVAILABLE)

#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

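// Note: M128_CAST/CONST_M128_CAST funnel pointers through void* before the
// __m128i* conversion, so byte and word pointers can be handed to the SSE
// load/store intrinsics with the pointer conversion kept explicit and quiet
// across compilers.
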
template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}

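// Note: as in the NEON code, an 8-bit rotation is a pure byte permutation, so
// the specializations use a single SSSE3 _mm_shuffle_epi8. When AMD XOP is
// available, _mm_roti_epi32 rotates each 32-bit lane directly and is used for
// every rotation amount.
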
inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));

        x1 = RotateRight32<8>(x1);
        x1 = _mm_add_epi32(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

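// Note: _mm_shuffle_ps with _MM_SHUFFLE(3,1,3,1) and (2,0,2,0) gathers the odd
// and even 32-bit words of the two input blocks, mirroring the NEON vuzp
// above. Per the comment in the loop, the forward key schedule pre-splats each
// subkey across a full 128-bit word (hence the subkeys+i*4 stride and the
// aligned load), while decryption splats on the fly with _mm_set1_epi32.
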
inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = _mm_add_epi32(x1, y1);
        x2 = _mm_add_epi32(x2, y2);
        x3 = _mm_add_epi32(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x2 = _mm_sub_epi32(x2, y2);
        x3 = _mm_sub_epi32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

#endif // CRYPTOPP_SSE41_AVAILABLE

// ***************************** Altivec ***************************** //

#if (CRYPTOPP_ALTIVEC_AVAILABLE)
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;

using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecLoadAligned;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(val, m);
}

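// Note: Altivec only provides a rotate-left (vec_rl), so a right rotation by
// C bits is issued as a left rotation by 32-C bits.
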
void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const uint32x4_p rk = VecLoadAligned(subkeys+i*4);

        x1 = RotateRight32<8>(x1);
        x1 = VecAdd(x1, y1);
        x1 = VecXor(x1, rk);

        y1 = RotateLeft32<3>(y1);
        y1 = VecXor(y1, x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

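// Note: on Altivec the deinterleave and reinterleave are done with explicit
// byte permutes. The m1/m2 masks gather the odd and even 32-bit words of the
// two input vectors, and m3/m4 restore the original word order; separate mask
// values are needed for big- and little-endian targets.
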
void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
#if defined(_ARCH_PWR7)
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(y1, x1);
        y1 = RotateRight32<3>(y1);

        x1 = VecXor(x1, rk);
        x1 = VecSub(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
    {
        // Round keys are pre-splated in forward direction
        const uint32x4_p rk = VecLoadAligned(subkeys+i*4);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);

        x1 = VecAdd(x1, y1);
        x2 = VecAdd(x2, y2);
        x3 = VecAdd(x3, y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
#if defined(_ARCH_PWR7)
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);

        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        x1 = VecSub(x1, y1);
        x2 = VecSub(x2, y2);
        x3 = VecSub(x3, y3);

        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** IA-32 ***************************** //

#if (CRYPTOPP_SSE41_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** Altivec ***************************** //

#if (CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

NAMESPACE_END
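
All of the SIMD paths above implement the same two-word SPECK64 round; only the vector plumbing differs. The following scalar sketch shows that round and its inverse for a single block. It is illustrative only and not part of Crypto++: the key schedule is omitted, the round keys are arbitrary stand-ins supplied by the caller, and the names speck64_encrypt/speck64_decrypt are invented for this example.

// speck64_scalar_sketch.cpp - illustrative only, not part of the library.
#include <cstdint>
#include <cstdio>

// Rotations on 32-bit words.
static inline uint32_t rol32(uint32_t v, unsigned r) { return (v << r) | (v >> (32 - r)); }
static inline uint32_t ror32(uint32_t v, unsigned r) { return (v >> r) | (v << (32 - r)); }

// One SPECK64 block is the word pair (x, y). Encrypt/decrypt with
// caller-supplied round keys rk[0..rounds-1] (key schedule not shown).
static void speck64_encrypt(uint32_t& x, uint32_t& y, const uint32_t* rk, unsigned rounds)
{
    for (unsigned i = 0; i < rounds; ++i) {
        x = (ror32(x, 8) + y) ^ rk[i];   // x = (x >>> 8) + y, then key mix
        y = rol32(y, 3) ^ x;             // y = (y <<< 3) ^ x
    }
}

static void speck64_decrypt(uint32_t& x, uint32_t& y, const uint32_t* rk, unsigned rounds)
{
    for (unsigned i = rounds; i-- > 0; ) {
        y = ror32(y ^ x, 3);             // undo y = (y <<< 3) ^ x
        x = rol32((x ^ rk[i]) - y, 8);   // undo x = ((x >>> 8) + y) ^ k
    }
}

int main()
{
    // Arbitrary stand-in round keys; a real key schedule would derive them
    // from the cipher key. 27 rounds matches SPECK64 with a 128-bit key.
    uint32_t rk[27];
    for (unsigned i = 0; i < 27; ++i)
        rk[i] = 0x9e3779b9u * (i + 1);

    uint32_t x = 0x74614620, y = 0x736e6165;
    speck64_encrypt(x, y, rk, 27);
    std::printf("ciphertext: %08x %08x\n", (unsigned)x, (unsigned)y);

    speck64_decrypt(x, y, rk, 27);
    std::printf("round trip: %08x %08x\n", (unsigned)x, (unsigned)y);  // original words again
    return 0;
}

Encrypting and then decrypting with the same stand-in keys round-trips the original words, which makes the sketch a convenient smoke test when porting or verifying one of the vector paths.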