From dd7598e638bba536117de716bae3f738312d4c5a Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Tue, 7 Jul 2020 15:22:09 -0400 Subject: [PATCH] Remove 64-bit AdvancedProcessBlocks (GH #945) --- Filelist.txt | 3 - GNUmakefile | 24 - GNUmakefile-cross | 21 - adv_simd.h | 1116 +------------------------------------- cham.cpp | 34 +- cham.h | 19 +- cham_simd.cpp | 608 --------------------- cryptest.nmake | 12 +- cryptlib.vcxproj | 3 - cryptlib.vcxproj.filters | 9 - simeck.cpp | 40 -- simeck.h | 13 - simeck_simd.cpp | 342 ------------ simon.cpp | 121 +---- simon.h | 18 - simon64_simd.cpp | 864 ----------------------------- speck.cpp | 109 ---- speck.h | 18 - speck64_simd.cpp | 781 -------------------------- 19 files changed, 25 insertions(+), 4130 deletions(-) delete mode 100644 simeck_simd.cpp delete mode 100644 simon64_simd.cpp delete mode 100644 speck64_simd.cpp diff --git a/Filelist.txt b/Filelist.txt index 06cefca7..8f10b2a8 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -334,10 +334,8 @@ simple.cpp simple.h siphash.h simeck.cpp -simeck_simd.cpp simeck.h simon.cpp -simon64_simd.cpp simon128_simd.cpp simon.h skipjack.cpp @@ -351,7 +349,6 @@ smartptr.h sosemanuk.cpp sosemanuk.h speck.cpp -speck64_simd.cpp speck128_simd.cpp speck.h square.cpp diff --git a/GNUmakefile b/GNUmakefile index bd636c57..e52d3417 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -292,7 +292,6 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = $(SSSE3_FLAG) KECCAK_FLAG = $(SSSE3_FLAG) LEA_FLAG = $(SSSE3_FLAG) - SIMECK_FLAG = $(SSSE3_FLAG) SIMON128_FLAG = $(SSSE3_FLAG) SPECK128_FLAG = $(SSSE3_FLAG) SUN_LDFLAGS += $(SSSE3_FLAG) @@ -306,8 +305,6 @@ ifeq ($(DETECT_FEATURES),1) ifeq ($(strip $(HAVE_OPT)),0) BLAKE2B_FLAG = $(SSE41_FLAG) BLAKE2S_FLAG = $(SSE41_FLAG) - SIMON64_FLAG = $(SSE41_FLAG) - SPECK64_FLAG = $(SSE41_FLAG) SUN_LDFLAGS += $(SSE41_FLAG) else SSE41_FLAG = @@ -478,10 +475,7 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = -march=armv7-a -mfpu=neon LEA_FLAG = -march=armv7-a -mfpu=neon SHA_FLAG = -march=armv7-a -mfpu=neon - SIMECK_FLAG = -march=armv7-a -mfpu=neon - SIMON64_FLAG = -march=armv7-a -mfpu=neon SIMON128_FLAG = -march=armv7-a -mfpu=neon - SPECK64_FLAG = -march=armv7-a -mfpu=neon SPECK128_FLAG = -march=armv7-a -mfpu=neon SM4_FLAG = -march=armv7-a -mfpu=neon else @@ -521,10 +515,7 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = -march=armv8-a LEA_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a - SIMECK_FLAG = -march=armv8-a - SIMON64_FLAG = -march=armv8-a SIMON128_FLAG = -march=armv8-a - SPECK64_FLAG = -march=armv8-a SPECK128_FLAG = -march=armv8-a SM4_FLAG = -march=armv8-a else @@ -658,7 +649,6 @@ ifeq ($(DETECT_FEATURES),1) LEA_FLAG = $(POWER8_FLAG) SHA_FLAG = $(POWER8_FLAG) SHACAL2_FLAG = $(POWER8_FLAG) - SIMECK_FLAG = $(POWER8_FLAG) else POWER8_FLAG = endif @@ -724,8 +714,6 @@ ifeq ($(DETECT_FEATURES),1) ifneq ($(ALTIVEC_FLAG),) BLAKE2S_FLAG = $(ALTIVEC_FLAG) CHACHA_FLAG = $(ALTIVEC_FLAG) - SIMON64_FLAG = $(ALTIVEC_FLAG) - SPECK64_FLAG = $(ALTIVEC_FLAG) SPECK128_FLAG = $(ALTIVEC_FLAG) SIMON128_FLAG = $(ALTIVEC_FLAG) endif @@ -1612,22 +1600,10 @@ sha3_simd.o : sha3_simd.cpp shacal2_simd.o : shacal2_simd.cpp $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SHA_FLAG) -c) $< -# SSSE3 or NEON available -simeck_simd.o : simeck_simd.cpp - $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMECK_FLAG) -c) $< - -# SSE4.1, NEON or POWER7 available -simon64_simd.o : simon64_simd.cpp - $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SIMON64_FLAG) -c) $< - # SSSE3, NEON or POWER8 available simon128_simd.o : simon128_simd.cpp $(CXX) $(strip $(CPPFLAGS) 
$(CXXFLAGS) $(SIMON128_FLAG) -c) $< -# SSE4.1, NEON or POWER7 available -speck64_simd.o : speck64_simd.cpp - $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SPECK64_FLAG) -c) $< - # SSSE3, NEON or POWER8 available speck128_simd.o : speck128_simd.cpp $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(SPECK128_FLAG) -c) $< diff --git a/GNUmakefile-cross b/GNUmakefile-cross index 97f92f41..13b8970c 100755 --- a/GNUmakefile-cross +++ b/GNUmakefile-cross @@ -241,7 +241,6 @@ ifeq ($(DETECT_FEATURES),1) ARIA_FLAG = $(SSSE3_FLAG) CHAM_FLAG = $(SSSE3_FLAG) LEA_FLAG = $(SSSE3_FLAG) - SIMECK_FLAG = $(SSSE3_FLAG) SIMON128_FLAG = $(SSSE3_FLAG) SPECK128_FLAG = $(SSSE3_FLAG) else @@ -254,8 +253,6 @@ ifeq ($(DETECT_FEATURES),1) ifeq ($(strip $(HAVE_OPT)),0) BLAKE2B_FLAG = $(SSE41_FLAG) BLAKE2S_FLAG = $(SSE41_FLAG) - SIMON64_FLAG = $(SSE41_FLAG) - SPECK64_FLAG = $(SSE41_FLAG) else SSE41_FLAG = endif @@ -400,10 +397,7 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = $(NEON_FLAG) LEA_FLAG = $(NEON_FLAG) SHA_FLAG = $(NEON_FLAG) - SIMECK_FLAG = $(NEON_FLAG) - SIMON64_FLAG = $(NEON_FLAG) SIMON128_FLAG = $(NEON_FLAG) - SPECK64_FLAG = $(NEON_FLAG) SPECK128_FLAG = $(NEON_FLAG) SM4_FLAG = $(NEON_FLAG) else @@ -457,10 +451,7 @@ ifeq ($(DETECT_FEATURES),1) CHAM_FLAG = $(ASIMD_FLAG) LEA_FLAG = $(ASIMD_FLAG) NEON_FLAG = $(ASIMD_FLAG) - SIMECK_FLAG = $(ASIMD_FLAG) - SIMON64_FLAG = $(ASIMD_FLAG) SIMON128_FLAG = $(ASIMD_FLAG) - SPECK64_FLAG = $(ASIMD_FLAG) SPECK128_FLAG = $(ASIMD_FLAG) SM4_FLAG = $(ASIMD_FLAG) else @@ -933,22 +924,10 @@ sha512_armv4.o : sha512_armv4.S shacal2_simd.o : shacal2_simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $< -# SSSE3 or NEON available -simeck_simd.o : simeck_simd.cpp - $(CXX) $(strip $(CXXFLAGS) $(SIMECK_FLAG) -c) $< - -# SSE4.1, NEON or POWER7 available -simon64_simd.o : simon64_simd.cpp - $(CXX) $(strip $(CXXFLAGS) $(SIMON64_FLAG) -c) $< - # SSSE3, NEON or POWER8 available simon128_simd.o : simon128_simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SIMON128_FLAG) -c) $< -# SSE4.1, NEON or POWER7 available -speck64_simd.o : speck64_simd.cpp - $(CXX) $(strip $(CXXFLAGS) $(SPECK64_FLAG) -c) $< - # SSSE3, NEON or POWER8 available speck128_simd.o : speck128_simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SPECK128_FLAG) -c) $< diff --git a/adv_simd.h b/adv_simd.h index 27e4cd43..134a8210 100644 --- a/adv_simd.h +++ b/adv_simd.h @@ -9,27 +9,16 @@ // acceleration. After several implementations we noticed a lot of copy and // paste occurring. adv_simd.h provides a template to avoid the copy and paste. // -// There are 11 templates provided in this file. The number following the -// function name, 64 or 128, is the block size. The name following the block -// size is the arrangement and acceleration. For example 4x1_SSE means Intel -// SSE using two encrypt (or decrypt) functions: one that operates on 4 SIMD -// words, and one that operates on 1 SIMD words. +// There are 6 templates provided in this file. The number following the +// function name, 128, is the block size in bits. The name following the +// block size is the arrangement and acceleration. For example 4x1_SSE means +// Intel SSE using two encrypt (or decrypt) functions: one that operates on +// 4 SIMD words, and one that operates on 1 SIMD word. // -// The distinction between SIMD words versus cipher blocks is important -// because 64-bit ciphers use one SIMD word for two cipher blocks. For -// example, AdvancedProcessBlocks64_6x2_ALTIVEC operates on 6 and 2 SIMD -// words, which is 12 and 4 cipher blocks.
The function will do the right // thing even if there is only one 64-bit block to encrypt. -// -// * AdvancedProcessBlocks64_2x1_SSE -// * AdvancedProcessBlocks64_4x1_SSE // * AdvancedProcessBlocks128_4x1_SSE -// * AdvancedProcessBlocks64_6x2_SSE // * AdvancedProcessBlocks128_6x2_SSE -// * AdvancedProcessBlocks64_6x2_NEON // * AdvancedProcessBlocks128_4x1_NEON // * AdvancedProcessBlocks128_6x2_NEON -// * AdvancedProcessBlocks64_6x2_ALTIVEC // * AdvancedProcessBlocks128_4x1_ALTIVEC // * AdvancedProcessBlocks128_6x1_ALTIVEC // @@ -41,6 +30,10 @@ // The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions // of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually // results in a failed link due to the const/non-const mismatch. +// +// In July 2020 the library stopped using the 64-bit block versions of +// AdvancedProcessBlocks. Testing showed unreliable results and failed +// self tests on occasion. #ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES #define CRYPTOPP_ADVANCED_SIMD_TEMPLATES @@ -94,247 +87,6 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) -/// \brief AdvancedProcessBlocks for 2 and 6 blocks -/// \tparam F2 function to process 2 64-bit blocks -/// \tparam F6 function to process 6 64-bit blocks -/// \tparam W word type of the subkey table -/// \details AdvancedProcessBlocks64_6x2_NEON processes 6 and 2 NEON SIMD words -/// at a time. For a single block the template uses F2 with a zero block. -/// \details The subkey type is usually word32 or word64. F2 and F6 must use the -/// same word type. -template <typename F2, typename F6, typename W> -inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, - const W *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - - const unsigned int w_one[] = {0, 0<<24, 0, 1<<24}; - const unsigned int w_two[] = {0, 2<<24, 0, 2<<24}; - const uint32x4_t s_one = vld1q_u32(w_one); - const uint32x4_t s_two = vld1q_u32(w_two); - - const size_t blockSize = 8; - const size_t neonBlockSize = 16; - - size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize; - size_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0; - size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize; - - // Clang and Coverity are generating findings using xorBlocks as a flag. - const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); - const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); - - if (flags & BT_ReverseDirection) - { - inBlocks = PtrAdd(inBlocks, length - neonBlockSize); - xorBlocks = PtrAdd(xorBlocks, length - neonBlockSize); - outBlocks = PtrAdd(outBlocks, length - neonBlockSize); - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BT_AllowParallel) - { - while (length >= 6*neonBlockSize) - { - uint32x4_t block0, block1, block2, block3, block4, block5; - if (flags & BT_InBlockIsCounter) - { - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the NEON word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. - const uint8x8_t ctr = vld1_u8(inBlocks); - block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr))); - - // After initial increment of {0,1} remaining counters increment by {2,2}.
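// Aside (editor's sketch, not part of the patch): the staggered adds below
// implement, per 128-bit word, the scalar counter sequence
//
//   low half:  ctr+0, ctr+2, ctr+4, ctr+6, ctr+8,  ctr+10
//   high half: ctr+1, ctr+3, ctr+5, ctr+7, ctr+9,  ctr+11
//
// so one pass of the six-word loop consumes 12 counter values, matching the
// "12 and 4 cipher blocks" noted in the header comment.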
- block1 = vaddq_u32(s_two, block0); - block2 = vaddq_u32(s_two, block1); - block3 = vaddq_u32(s_two, block2); - block4 = vaddq_u32(s_two, block3); - block5 = vaddq_u32(s_two, block4); - - vst1_u8(const_cast(inBlocks), vget_low_u8( - vreinterpretq_u8_u32(vaddq_u32(s_two, block5)))); - } - else - { - block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5)); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 6*neonBlockSize; - } - - while (length >= 2*neonBlockSize) - { - uint32x4_t block0, block1; - if (flags & BT_InBlockIsCounter) - { - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the NEON word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. 
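// Aside (editor's sketch, not part of the patch): the "dup load" performed
// below, in isolation. vld1_u8 reads the 8-byte counter once and
// vcombine_u8(ctr,ctr) splats it into both halves of a 16-byte NEON word:
//
//   uint8x8_t  ctr  = vld1_u8(inBlocks);     // one 64-bit counter block
//   uint8x16_t pair = vcombine_u8(ctr, ctr); // {ctr, ctr} in one SIMD word
//   // s_one then adds {0,1} so the two copies become consecutive counters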
- const uint8x8_t ctr = vld1_u8(inBlocks); - block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr))); - - // After initial increment of {0,1} remaining counters increment by {2,2}. - block1 = vaddq_u32(s_two, block0); - - vst1_u8(const_cast(inBlocks), vget_low_u8( - vreinterpretq_u8_u32(vaddq_u32(s_two, block1)))); - } - else - { - block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func2(block0, block1, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0)); - outBlocks = PtrAdd(outBlocks, outIncrement); - vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1)); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 2*neonBlockSize; - } - } - - if (length) - { - // Adjust to real block size - if (flags & BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? blockSize : 0; - inBlocks = PtrSub(inBlocks, inIncrement); - xorBlocks = PtrSub(xorBlocks, xorIncrement); - outBlocks = PtrSub(outBlocks, outIncrement); - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? blockSize : 0; - } - - while (length >= blockSize) - { - uint32x4_t block, zero = {0}; - - const uint8x8_t v = vld1_u8(inBlocks); - block = vreinterpretq_u32_u8(vcombine_u8(v,v)); - - if (xorInput) - { - const uint8x8_t x = vld1_u8(xorBlocks); - block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x))); - } - - if (flags & BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func2(block, zero, subKeys, static_cast(rounds)); - - if (xorOutput) - { - const uint8x8_t x = vld1_u8(xorBlocks); - block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x))); - } - - vst1_u8(const_cast(outBlocks), - vget_low_u8(vreinterpretq_u8_u32(block))); - - inBlocks = PtrAdd(inBlocks, inIncrement); - outBlocks = PtrAdd(outBlocks, outIncrement); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - length -= blockSize; - } - } - - return length; -} - /// \brief AdvancedProcessBlocks for 1 and 6 blocks /// \tparam F1 function to process 1 128-bit block /// \tparam F6 function to process 6 128-bit blocks @@ -870,412 +622,6 @@ NAMESPACE_END // CryptoPP NAMESPACE_BEGIN(CryptoPP) -/// \brief AdvancedProcessBlocks for 1 and 2 blocks -/// \tparam F1 function to process 1 64-bit block -/// \tparam F2 function to process 2 64-bit blocks -/// \tparam W word type of the subkey table -/// \details AdvancedProcessBlocks64_2x1_SSE processes 2 and 1 SSE SIMD words -/// at a time. -/// \details The subkey type is usually word32 or word64. F1 and F2 must use the -/// same word type. 
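// Aside (editor's sketch, not part of the patch): the shape of a conforming
// F1/F2 pair for the template below, with hypothetical names; each __m128i
// carries two 8-byte cipher blocks:
//
//   inline void MyCipher64_Enc_Block(__m128i &block,
//       const word32 *subkeys, unsigned int rounds);
//   inline void MyCipher64_Enc_2_Blocks(__m128i &block0, __m128i &block1,
//       const word32 *subkeys, unsigned int rounds);
//
//   // dispatch (template arguments are deduced):
//   // AdvancedProcessBlocks64_2x1_SSE(MyCipher64_Enc_Block,
//   //     MyCipher64_Enc_2_Blocks, subkeys, rounds,
//   //     inBlocks, xorBlocks, outBlocks, length, flags);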
-template -inline size_t AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2, - MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - - const size_t blockSize = 8; - const size_t xmmBlockSize = 16; - - size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize; - size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0; - size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize; - - // Clang and Coverity are generating findings using xorBlocks as a flag. - const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); - const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); - - if (flags & BT_ReverseDirection) - { - inBlocks = PtrAdd(inBlocks, length - xmmBlockSize); - xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize); - outBlocks = PtrAdd(outBlocks, length - xmmBlockSize); - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BT_AllowParallel) - { - double temp[2]; - while (length >= 2*xmmBlockSize) - { - __m128i block0, block1; - if (flags & BT_InBlockIsCounter) - { - // Increment of 1 and 2 in big-endian compatible with the ctr byte array. - const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0); - const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0); - - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the XMM word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. - std::memcpy(temp, inBlocks, blockSize); - block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp))); - - // After initial increment of {0,1} remaining counters increment by {2,2}. - block1 = _mm_add_epi32(s_two, block0); - - // Store the next counter. When BT_InBlockIsCounter is set then - // inBlocks is backed by m_counterArray which is non-const. - _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1))); - std::memcpy(const_cast(inBlocks), temp, blockSize); - } - else - { - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func2(block0, block1, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 2*xmmBlockSize; - } - } - - if (length) - { - // Adjust to real block size - if (flags & BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? 
blockSize : 0; - inBlocks = PtrSub(inBlocks, inIncrement); - xorBlocks = PtrSub(xorBlocks, xorIncrement); - outBlocks = PtrSub(outBlocks, outIncrement); - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? blockSize : 0; - } - - while (length >= blockSize) - { - double temp[2]; - std::memcpy(temp, inBlocks, blockSize); - __m128i block = _mm_castpd_si128(_mm_load_sd(temp)); - - if (xorInput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp))); - } - - if (flags & BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func1(block, subKeys, static_cast(rounds)); - - if (xorOutput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp))); - } - - _mm_store_sd(temp, _mm_castsi128_pd(block)); - std::memcpy(outBlocks, temp, blockSize); - - inBlocks = PtrAdd(inBlocks, inIncrement); - outBlocks = PtrAdd(outBlocks, outIncrement); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - length -= blockSize; - } - } - - return length; -} - -/// \brief AdvancedProcessBlocks for 2 and 6 blocks -/// \tparam F2 function to process 2 64-bit blocks -/// \tparam F6 function to process 6 64-bit blocks -/// \tparam W word type of the subkey table -/// \details AdvancedProcessBlocks64_6x2_SSE processes 6 and 2 SSE SIMD words -/// at a time. For a single block the template uses F2 with a zero block. -/// \details The subkey type is usually word32 or word64. F2 and F6 must use the -/// same word type. -template -inline size_t AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, - MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - - const size_t blockSize = 8; - const size_t xmmBlockSize = 16; - - size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize; - size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0; - size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize; - - // Clang and Coverity are generating findings using xorBlocks as a flag. - const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); - const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); - - if (flags & BT_ReverseDirection) - { - inBlocks = PtrAdd(inBlocks, length - xmmBlockSize); - xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize); - outBlocks = PtrAdd(outBlocks, length - xmmBlockSize); - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BT_AllowParallel) - { - double temp[2]; - while (length >= 6*xmmBlockSize) - { - __m128i block0, block1, block2, block3, block4, block5; - if (flags & BT_InBlockIsCounter) - { - // Increment of 1 and 2 in big-endian compatible with the ctr byte array. - const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0); - const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0); - - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the XMM word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. 
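// Aside (editor's note, not part of the patch): the constants above encode
// byte-wise big-endian increments. Each 8-byte counter sits in one 64-bit
// lane with its bytes in big-endian order, so the counter's least
// significant byte is the top byte of the lane's high 32-bit element, and
// adding 1<<24 there bumps the counter by one, assuming the low byte does
// not carry. The memcpy/_mm_loaddup_pd pair below then splats the 8-byte
// counter into both lanes:
//
//   double tmp[2];
//   std::memcpy(tmp, inBlocks, 8);                        // 8-byte CTR block
//   __m128i two = _mm_castpd_si128(_mm_loaddup_pd(tmp));  // {ctr, ctr}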
- std::memcpy(temp, inBlocks, blockSize); - block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp))); - - // After initial increment of {0,1} remaining counters increment by {2,2}. - block1 = _mm_add_epi32(s_two, block0); - block2 = _mm_add_epi32(s_two, block1); - block3 = _mm_add_epi32(s_two, block2); - block4 = _mm_add_epi32(s_two, block3); - block5 = _mm_add_epi32(s_two, block4); - - // Store the next counter. When BT_InBlockIsCounter is set then - // inBlocks is backed by m_counterArray which is non-const. - _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi32(s_two, block5))); - std::memcpy(const_cast(inBlocks), temp, blockSize); - } - else - { - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block2); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block3); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block4); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block5); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 6*xmmBlockSize; - } - - while 
(length >= 2*xmmBlockSize) - { - __m128i block0, block1; - if (flags & BT_InBlockIsCounter) - { - // Increment of 1 and 2 in big-endian compatible with the ctr byte array. - const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0); - const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0); - - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the XMM word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. - std::memcpy(temp, inBlocks, blockSize); - block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp))); - - // After initial increment of {0,1} remaining counters increment by {2,2}. - block1 = _mm_add_epi32(s_two, block0); - - // Store the next counter. When BT_InBlockIsCounter is set then - // inBlocks is backed by m_counterArray which is non-const. - _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1))); - std::memcpy(const_cast(inBlocks), temp, blockSize); - } - else - { - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func2(block0, block1, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 2*xmmBlockSize; - } - } - - if (length) - { - // Adjust to real block size - if (flags & BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? blockSize : 0; - inBlocks = PtrSub(inBlocks, inIncrement); - xorBlocks = PtrSub(xorBlocks, xorIncrement); - outBlocks = PtrSub(outBlocks, outIncrement); - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? 
blockSize : 0; - } - - while (length >= blockSize) - { - double temp[2]; - __m128i block, zero = _mm_setzero_si128(); - std::memcpy(temp, inBlocks, blockSize); - block = _mm_castpd_si128(_mm_load_sd(temp)); - - if (xorInput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, - _mm_castpd_si128(_mm_load_sd(temp))); - } - - if (flags & BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func2(block, zero, subKeys, static_cast(rounds)); - - if (xorOutput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, - _mm_castpd_si128(_mm_load_sd(temp))); - } - - _mm_store_sd(temp, _mm_castsi128_pd(block)); - std::memcpy(outBlocks, temp, blockSize); - - inBlocks = PtrAdd(inBlocks, inIncrement); - outBlocks = PtrAdd(outBlocks, outIncrement); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - length -= blockSize; - } - } - - return length; -} - /// \brief AdvancedProcessBlocks for 2 and 6 blocks /// \tparam F2 function to process 2 128-bit blocks /// \tparam F6 function to process 6 128-bit blocks @@ -1602,179 +948,6 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, return length; } -/// \brief AdvancedProcessBlocks for 1 and 4 blocks -/// \tparam F1 function to process 1 64-bit block -/// \tparam F4 function to process 6 64-bit blocks -/// \tparam W word type of the subkey table -/// \details AdvancedProcessBlocks64_4x1_SSE processes 4 and 1 SSE SIMD words -/// at a time. -/// \details The subkey type is usually word32 or word64. F1 and F4 must use the -/// same word type. -template -inline size_t AdvancedProcessBlocks64_4x1_SSE(F1 func1, F4 func4, - MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - - const size_t blockSize = 8; - const size_t xmmBlockSize = 16; - - size_t inIncrement = (flags & (BT_InBlockIsCounter | BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize; - size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0; - size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize; - - // Clang and Coverity are generating findings using xorBlocks as a flag. - const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); - const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); - - if (flags & BT_ReverseDirection) - { - inBlocks = PtrAdd(inBlocks, length - xmmBlockSize); - xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize); - outBlocks = PtrAdd(outBlocks, length - xmmBlockSize); - inIncrement = 0 - inIncrement; - xorIncrement = 0 - xorIncrement; - outIncrement = 0 - outIncrement; - } - - if (flags & BT_AllowParallel) - { - while (length >= 4*xmmBlockSize) - { - __m128i block0, block1, block2, block3; - if (flags & BT_InBlockIsCounter) - { - // Increment of 1 and 2 in big-endian compatible with the ctr byte array. - const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0); - const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0); - double temp[2]; - - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the XMM word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. - std::memcpy(temp, inBlocks, blockSize); - block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp))); - - // After initial increment of {0,1} remaining counters increment by {2,2}. 
- block1 = _mm_add_epi32(s_two, block0); - block2 = _mm_add_epi32(s_two, block1); - block3 = _mm_add_epi32(s_two, block2); - - // Store the next counter. When BT_InBlockIsCounter is set then - // inBlocks is backed by m_counterArray which is non-const. - _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block3))); - std::memcpy(const_cast(inBlocks), temp, blockSize); - } - else - { - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func4(block0, block1, block2, block3, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block2); - outBlocks = PtrAdd(outBlocks, outIncrement); - _mm_storeu_si128(M128_CAST(outBlocks), block3); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 4*xmmBlockSize; - } - } - - if (length) - { - // Adjust to real block size - if (flags & BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? blockSize : 0; - inBlocks = PtrSub(inBlocks, inIncrement); - xorBlocks = PtrSub(xorBlocks, xorIncrement); - outBlocks = PtrSub(outBlocks, outIncrement); - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? 
blockSize : 0; - } - - while (length >= blockSize) - { - double temp[2]; - std::memcpy(temp, inBlocks, blockSize); - __m128i block = _mm_castpd_si128(_mm_load_sd(temp)); - - if (xorInput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp))); - } - - if (flags & BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func1(block, subKeys, static_cast(rounds)); - - if (xorOutput) - { - std::memcpy(temp, xorBlocks, blockSize); - block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp))); - } - - _mm_store_sd(temp, _mm_castsi128_pd(block)); - std::memcpy(outBlocks, temp, blockSize); - - inBlocks = PtrAdd(inBlocks, inIncrement); - outBlocks = PtrAdd(outBlocks, outIncrement); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - length -= blockSize; - } - } - - return length; -} - NAMESPACE_END // CryptoPP #endif // CRYPTOPP_SSSE3_AVAILABLE @@ -1785,277 +958,6 @@ NAMESPACE_END // CryptoPP NAMESPACE_BEGIN(CryptoPP) -/// \brief AdvancedProcessBlocks for 2 and 6 blocks -/// \tparam F2 function to process 2 128-bit blocks -/// \tparam F6 function to process 6 128-bit blocks -/// \tparam W word type of the subkey table -/// \details AdvancedProcessBlocks64_6x2_Altivec processes 6 and 2 Altivec SIMD words -/// at a time. For a single block the template uses F2 with a zero block. -/// \details The subkey type is usually word32 or word64. F2 and F6 must use the -/// same word type. -template -inline size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6, - const W *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - -#if (CRYPTOPP_LITTLE_ENDIAN) - enum {LowOffset=8, HighOffset=0}; - const uint32x4_p s_one = {1,0,0,0}; - const uint32x4_p s_two = {2,0,2,0}; -#else - enum {LowOffset=8, HighOffset=0}; - const uint32x4_p s_one = {0,0,0,1}; - const uint32x4_p s_two = {0,2,0,2}; -#endif - - const size_t blockSize = 8; - const size_t simdBlockSize = 16; - CRYPTOPP_ALIGN_DATA(16) uint8_t temp[16]; - - size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : simdBlockSize; - size_t xorIncrement = (xorBlocks != NULLPTR) ? simdBlockSize : 0; - size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : simdBlockSize; - - // Clang and Coverity are generating findings using xorBlocks as a flag. - const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); - const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); - - if (flags & BT_ReverseDirection) - { - inBlocks = PtrAdd(inBlocks, length - simdBlockSize); - xorBlocks = PtrAdd(xorBlocks, length - simdBlockSize); - outBlocks = PtrAdd(outBlocks, length - simdBlockSize); - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BT_AllowParallel) - { - while (length >= 6*simdBlockSize) - { - uint32x4_p block0, block1, block2, block3, block4, block5; - if (flags & BT_InBlockIsCounter) - { - // There is no easy way to load 8-bytes into a vector. It is - // even harder without POWER8 due to lack of 64-bit elements. - std::memcpy(temp+LowOffset, inBlocks, 8); - std::memcpy(temp+HighOffset, inBlocks, 8); - uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp); - - // For 64-bit block ciphers we need to load the CTR block, - // which is 8 bytes. 
After the dup load we have two counters - // in the Altivec word. Then we need to increment the low ctr - // by 0 and the high ctr by 1. - block0 = VecAdd(s_one, ctr); - - // After initial increment of {0,1} remaining counters - // increment by {2,2}. - block1 = VecAdd(s_two, block0); - block2 = VecAdd(s_two, block1); - block3 = VecAdd(s_two, block2); - block4 = VecAdd(s_two, block3); - block5 = VecAdd(s_two, block4); - - // Update the counter in the caller. - const_cast(inBlocks)[7] += 12; - } - else - { - block0 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block4 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block5 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = VecXor(block0, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VecXor(block1, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VecXor(block2, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VecXor(block3, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = VecXor(block4, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = VecXor(block5, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = VecXor(block0, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VecXor(block1, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VecXor(block2, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VecXor(block3, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = VecXor(block4, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = VecXor(block5, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - VecStoreBE(block0, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block1, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block2, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block3, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block4, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block5, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 6*simdBlockSize; - } - - while (length >= 2*simdBlockSize) - { - uint32x4_p block0, block1; - if (flags & BT_InBlockIsCounter) - { - // There is no easy way to load 8-bytes into a vector. It is - // even harder without POWER8 due to lack of 64-bit elements. - std::memcpy(temp+LowOffset, inBlocks, 8); - std::memcpy(temp+HighOffset, inBlocks, 8); - uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp); - - // For 64-bit block ciphers we need to load the CTR block, - // which is 8 bytes. After the dup load we have two counters - // in the Altivec word. Then we need to increment the low ctr - // by 0 and the high ctr by 1. - block0 = VecAdd(s_one, ctr); - - // After initial increment of {0,1} remaining counters - // increment by {2,2}. 
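// Aside (editor's note, not part of the patch): each Altivec word holds two
// 8-byte blocks, so the caller's byte-wise counter advances by two per SIMD
// word: "+= 12" above for the six-word loop, "+= 4" below for this two-word
// loop. Updating only byte [7] assumes the low counter byte does not wrap
// within a pass.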
- block1 = VecAdd(s_two, block0); - - // Update the counter in the caller. - const_cast(inBlocks)[7] += 4; - } - else - { - block0 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = VecLoadBE(inBlocks); - inBlocks = PtrAdd(inBlocks, inIncrement); - } - - if (xorInput) - { - block0 = VecXor(block0, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VecXor(block1, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - func2(block0, block1, subKeys, static_cast(rounds)); - - if (xorOutput) - { - block0 = VecXor(block0, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VecXor(block1, VecLoadBE(xorBlocks)); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - } - - VecStoreBE(block0, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - VecStoreBE(block1, outBlocks); - outBlocks = PtrAdd(outBlocks, outIncrement); - - length -= 2*simdBlockSize; - } - } - - if (length) - { - // Adjust to real block size - if (flags & BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? blockSize : 0; - inBlocks = PtrSub(inBlocks, inIncrement); - xorBlocks = PtrSub(xorBlocks, xorIncrement); - outBlocks = PtrSub(outBlocks, outIncrement); - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? blockSize : 0; - } - - while (length >= blockSize) - { - uint32x4_p block, zero = {0}; - - // There is no easy way to load 8-bytes into a vector. It is - // even harder without POWER8 due to lack of 64-bit elements. - // The high 8 bytes are "don't care" but it if we don't - // initialize the block then it generates warnings. - std::memcpy(temp+LowOffset, inBlocks, 8); - std::memcpy(temp+HighOffset, inBlocks, 8); // don't care - block = (uint32x4_p)VecLoadBE(temp); - - if (xorInput) - { - std::memcpy(temp+LowOffset, xorBlocks, 8); - std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care - uint32x4_p x = (uint32x4_p)VecLoadBE(temp); - block = VecXor(block, x); - } - - // Update the counter in the caller. 
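// Aside (editor's note, not part of the patch): in this tail the remaining
// 8-byte block was splatted into both halves of one SIMD word above, func2
// then runs with a zero companion word, and only the low 8 bytes of the
// result are stored. The byte increment below advances the big-endian
// counter by one block, again assuming byte [7] does not wrap.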
- if (flags & BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func2(block, zero, subKeys, static_cast(rounds)); - - if (xorOutput) - { - std::memcpy(temp+LowOffset, xorBlocks, 8); - std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care - uint32x4_p x = (uint32x4_p)VecLoadBE(temp); - block = VecXor(block, x); - } - - VecStoreBE(block, temp); - std::memcpy(outBlocks, temp+LowOffset, 8); - - inBlocks = PtrAdd(inBlocks, inIncrement); - outBlocks = PtrAdd(outBlocks, outIncrement); - xorBlocks = PtrAdd(xorBlocks, xorIncrement); - length -= blockSize; - } - } - - return length; -} - /// \brief AdvancedProcessBlocks for 1 and 4 blocks /// \tparam F1 function to process 1 128-bit block /// \tparam F4 function to process 4 128-bit blocks diff --git a/cham.cpp b/cham.cpp index 620b7953..1a05721d 100644 --- a/cham.cpp +++ b/cham.cpp @@ -96,7 +96,7 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS # if (CRYPTOPP_SSSE3_AVAILABLE) extern size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); @@ -110,11 +110,11 @@ extern size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, siz extern size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); # endif // CRYPTOPP_SSSE3_AVAILABLE -#endif // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS std::string CHAM64::Base::AlgorithmProvider() const { -#if (CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS) +#if (CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS) # if defined(CRYPTOPP_SSSE3_AVAILABLE) if (HasSSSE3()) return "SSSE3"; @@ -336,31 +336,7 @@ void CHAM128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, oblock(m_x[0])(m_x[1])(m_x[2])(m_x[3]); } -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS -size_t CHAM64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, - byte *outBlocks, size_t length, word32 flags) const -{ -# if (CRYPTOPP_SSSE3_AVAILABLE) - if (HasSSSE3()) { - return CHAM64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, 80, - inBlocks, xorBlocks, outBlocks, length, flags); - } -# endif // CRYPTOPP_SSSE3_AVAILABLE - return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); -} - -size_t CHAM64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, - byte *outBlocks, size_t length, word32 flags) const -{ -# if (CRYPTOPP_SSSE3_AVAILABLE) - if (HasSSSE3()) { - return CHAM64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, 80, - inBlocks, xorBlocks, outBlocks, length, flags); - } -# endif // CRYPTOPP_SSSE3_AVAILABLE - return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); -} - +#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS size_t CHAM128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { @@ -386,6 +362,6 @@ size_t CHAM128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor # endif // CRYPTOPP_SSSE3_AVAILABLE return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } -#endif // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS NAMESPACE_END diff --git a/cham.h b/cham.h index 9408e407..30a84b4e 100644 --- 
a/cham.h +++ b/cham.h @@ -16,18 +16,15 @@ #include "algparam.h" #if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86) -# define CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS 1 +# define CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS 1 #endif // Yet another SunStudio/SunCC workaround. Failed self tests // in SSE code paths on i386 for SunStudio 12.3 and below. #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120) -# undef CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +# undef CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS #endif -// https://github.com/weidai11/cryptopp/issues/945 -#undef CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS - NAMESPACE_BEGIN(CryptoPP) /// \brief CHAM block cipher information @@ -92,10 +89,6 @@ public: { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; - -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS - size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; -#endif }; /// \brief Decryption transformation @@ -106,10 +99,6 @@ public: { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; - -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS - size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; -#endif }; /// \brief CHAM64 encryption @@ -156,7 +145,7 @@ public: public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif }; @@ -170,7 +159,7 @@ public: public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS +#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif }; diff --git a/cham_simd.cpp b/cham_simd.cpp index 42a76c6e..b848ba10 100644 --- a/cham_simd.cpp +++ b/cham_simd.cpp @@ -45,600 +45,6 @@ using CryptoPP::word32; ////////////////////////////////////////////////////////////////////////// -NAMESPACE_BEGIN(W16) // CHAM64, 16-bit word size - -template -inline __m128i RotateLeft16(const __m128i& val) -{ -#if defined(__XOP__) - return _mm_roti_epi16(val, R); -#else - return _mm_or_si128( - _mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R)); -#endif -} - -template -inline __m128i RotateRight16(const __m128i& val) -{ -#if defined(__XOP__) - return _mm_roti_epi16(val, 16-R); -#else - return _mm_or_si128( - _mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R)); -#endif -} - -template <> -inline __m128i RotateLeft16<8>(const __m128i& val) -{ -#if defined(__XOP__) - return _mm_roti_epi16(val, 8); -#else - const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1); - return _mm_shuffle_epi8(val, mask); -#endif -} - -template <> -inline __m128i RotateRight16<8>(const __m128i& val) -{ -#if defined(__XOP__) - return _mm_roti_epi16(val, 16-8); -#else - const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1); - return _mm_shuffle_epi8(val, mask); -#endif -} - -template -inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // Should not be instantiated - 
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b); - CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d); - CRYPTOPP_UNUSED(e); CRYPTOPP_UNUSED(f); - CRYPTOPP_UNUSED(g); CRYPTOPP_UNUSED(h); - CRYPTOPP_ASSERT(0); - return _mm_setzero_si128(); -} - -template <> -inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpacklo_epi16(a, b); - const __m128i r2 = _mm_unpacklo_epi16(c, d); - const __m128i r3 = _mm_unpacklo_epi16(e, f); - const __m128i r4 = _mm_unpacklo_epi16(g, h); - - const __m128i r5 = _mm_unpacklo_epi32(r1, r2); - const __m128i r6 = _mm_unpacklo_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpacklo_epi16(a, b); - const __m128i r2 = _mm_unpacklo_epi16(c, d); - const __m128i r3 = _mm_unpacklo_epi16(e, f); - const __m128i r4 = _mm_unpacklo_epi16(g, h); - - const __m128i r5 = _mm_unpacklo_epi32(r1, r2); - const __m128i r6 = _mm_unpacklo_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpacklo_epi16(a, b); - const __m128i r2 = _mm_unpacklo_epi16(c, d); - const __m128i r3 = _mm_unpacklo_epi16(e, f); - const __m128i r4 = _mm_unpacklo_epi16(g, h); - - const __m128i r5 = _mm_unpackhi_epi32(r1, r2); - const __m128i r6 = _mm_unpackhi_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. 
- const __m128i r1 = _mm_unpacklo_epi16(a, b); - const __m128i r2 = _mm_unpacklo_epi16(c, d); - const __m128i r3 = _mm_unpacklo_epi16(e, f); - const __m128i r4 = _mm_unpacklo_epi16(g, h); - - const __m128i r5 = _mm_unpackhi_epi32(r1, r2); - const __m128i r6 = _mm_unpackhi_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpackhi_epi16(a, b); - const __m128i r2 = _mm_unpackhi_epi16(c, d); - const __m128i r3 = _mm_unpackhi_epi16(e, f); - const __m128i r4 = _mm_unpackhi_epi16(g, h); - - const __m128i r5 = _mm_unpacklo_epi32(r1, r2); - const __m128i r6 = _mm_unpacklo_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpackhi_epi16(a, b); - const __m128i r2 = _mm_unpackhi_epi16(c, d); - const __m128i r3 = _mm_unpackhi_epi16(e, f); - const __m128i r4 = _mm_unpackhi_epi16(g, h); - - const __m128i r5 = _mm_unpacklo_epi32(r1, r2); - const __m128i r6 = _mm_unpacklo_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. - const __m128i r1 = _mm_unpackhi_epi16(a, b); - const __m128i r2 = _mm_unpackhi_epi16(c, d); - const __m128i r3 = _mm_unpackhi_epi16(e, f); - const __m128i r4 = _mm_unpackhi_epi16(g, h); - - const __m128i r5 = _mm_unpackhi_epi32(r1, r2); - const __m128i r6 = _mm_unpackhi_epi32(r3, r4); - return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6), - _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1)); -} - -template <> -inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d, - const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h) -{ - // The shuffle converts to and from little-endian for SSE. A specialized - // CHAM implementation can avoid the shuffle by framing the data for - // encryption, decryption and benchmarks. The library cannot take the - // speed-up because of the byte oriented API. 
-    const __m128i r1 = _mm_unpackhi_epi16(a, b);
-    const __m128i r2 = _mm_unpackhi_epi16(c, d);
-    const __m128i r3 = _mm_unpackhi_epi16(e, f);
-    const __m128i r4 = _mm_unpackhi_epi16(g, h);
-
-    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
-    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
-    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
-        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
-}
-
-template <unsigned int IDX>
-inline __m128i UnpackXMM(const __m128i& v)
-{
-    // Should not be instantiated
-    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
-
-    return _mm_setzero_si128();
-}
-
-template <>
-inline __m128i UnpackXMM<0>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
-}
-
-template <>
-inline __m128i UnpackXMM<1>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
-}
-
-template <>
-inline __m128i UnpackXMM<2>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
-}
-
-template <>
-inline __m128i UnpackXMM<3>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
-}
-
-template <>
-inline __m128i UnpackXMM<4>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
-}
-
-template <>
-inline __m128i UnpackXMM<5>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
-}
-
-template <>
-inline __m128i UnpackXMM<6>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
-}
-
-template <>
-inline __m128i UnpackXMM<7>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
-}
-
-template <unsigned int IDX>
-inline __m128i UnpackXMM(const __m128i& a, const __m128i& b)
-{
-    const __m128i& z = _mm_setzero_si128();
-    return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
-}
-
-template <unsigned int IDX>
-inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
-    const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
-{
-    return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
-}
-
-template <unsigned int IDX>
-inline __m128i RepackXMM(const __m128i& v)
-{
-    return UnpackXMM<IDX>(v);
-}
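For readers auditing the deleted transpose, the following standalone sketch (not part of the patch; all names are illustrative) models what the UnpackXMM specializations above compute: 16-bit lane IDX is gathered from each of the eight inputs, with the little-endian byte swap folded in, just as the _mm_shuffle_epi8 mask does.

#include <cstdint>
#include <cstddef>

// Scalar model of the deleted UnpackXMM<IDX>: gather 16-bit lane 'idx' of
// each of the eight inputs, then byte-swap each word (the little-endian
// fixup performed by _mm_shuffle_epi8 in the SSE code above).
static void unpack_lane16(const uint16_t in[8][8], unsigned int idx, uint16_t out[8])
{
    for (size_t r = 0; r < 8; ++r) {
        const uint16_t w = in[r][idx];
        out[r] = static_cast<uint16_t>((w << 8) | (w >> 8));  // bswap16
    }
}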
-
-inline void CHAM64_Enc_Block(__m128i &block0,
-    const word16 *subkeys, unsigned int /*rounds*/)
-{
-    // Rearrange the data for vectorization. UnpackXMM includes a
-    // little-endian swap for SSE. Thanks to Peter Cordes for help
-    // with packing and unpacking.
-    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
-    __m128i a = UnpackXMM<0>(block0);
-    __m128i b = UnpackXMM<1>(block0);
-    __m128i c = UnpackXMM<2>(block0);
-    __m128i d = UnpackXMM<3>(block0);
-    __m128i e = UnpackXMM<4>(block0);
-    __m128i f = UnpackXMM<5>(block0);
-    __m128i g = UnpackXMM<6>(block0);
-    __m128i h = UnpackXMM<7>(block0);
-
-    const unsigned int rounds = 80;
-    __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
-    __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
-
-    const unsigned int MASK = 15;
-    for (int i=0; i < static_cast<int>(rounds); i+=4)
-    {
-        __m128i k, kr, t1, t2, t3, t4;
-        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
-
-        // Shuffle out key
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
-
-        t1 = _mm_xor_si128(a, counter);
-        t3 = _mm_xor_si128(e, counter);
-        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
-        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
-        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
-
-        t1 = _mm_xor_si128(b, counter);
-        t3 = _mm_xor_si128(f, counter);
-        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
-        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
-        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
-
-        t1 = _mm_xor_si128(c, counter);
-        t3 = _mm_xor_si128(g, counter);
-        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
-        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
-        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
-
-        t1 = _mm_xor_si128(d, counter);
-        t3 = _mm_xor_si128(h, counter);
-        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
-        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
-        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-    }
-
-    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
-    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
-}
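The four sub-rounds in the loop above implement the CHAM round function with alternating rotation amounts. As a cross-check of the vector arithmetic, here is a scalar sketch of one round (illustrative only, not library code):

#include <cstdint>

static inline uint16_t rol16(uint16_t v, unsigned int r)
{
    return static_cast<uint16_t>((v << r) | (v >> (16 - r)));
}

// Round i (even): x0' = ROL8((x0 ^ i) + (ROL1(x1) ^ rk))
// Round i (odd):  x0' = ROL1((x0 ^ i) + (ROL8(x1) ^ rk))
static uint16_t cham64_round(uint16_t x0, uint16_t x1, uint16_t rk, uint16_t ctr, bool even)
{
    return even ? rol16(static_cast<uint16_t>((x0 ^ ctr) + (rol16(x1, 1) ^ rk)), 8)
                : rol16(static_cast<uint16_t>((x0 ^ ctr) + (rol16(x1, 8) ^ rk)), 1);
}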
-
-inline void CHAM64_Dec_Block(__m128i &block0,
-    const word16 *subkeys, unsigned int /*rounds*/)
-{
-    // Rearrange the data for vectorization. UnpackXMM includes a
-    // little-endian swap for SSE. Thanks to Peter Cordes for help
-    // with packing and unpacking.
-    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
-    __m128i a = UnpackXMM<0>(block0);
-    __m128i b = UnpackXMM<1>(block0);
-    __m128i c = UnpackXMM<2>(block0);
-    __m128i d = UnpackXMM<3>(block0);
-    __m128i e = UnpackXMM<4>(block0);
-    __m128i f = UnpackXMM<5>(block0);
-    __m128i g = UnpackXMM<6>(block0);
-    __m128i h = UnpackXMM<7>(block0);
-
-    const unsigned int rounds = 80;
-    __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
-    __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
-
-    const unsigned int MASK = 15;
-    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
-    {
-        __m128i k, kr, t1, t2, t3, t4;
-        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
-
-        // Shuffle out key
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
-
-        // Odd round
-        t1 = RotateRight16<1>(d);
-        t3 = RotateRight16<1>(h);
-        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
-        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
-
-        // Even round
-        t1 = RotateRight16<8>(c);
-        t3 = RotateRight16<8>(g);
-        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
-        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
-
-        // Odd round
-        t1 = RotateRight16<1>(b);
-        t3 = RotateRight16<1>(f);
-        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
-        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
-
-        // Even round
-        t1 = RotateRight16<8>(a);
-        t3 = RotateRight16<8>(e);
-        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
-        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-    }
-
-    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
-    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
-}
-
-inline void CHAM64_Enc_2_Blocks(__m128i &block0,
-    __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
-{
-    // Rearrange the data for vectorization. UnpackXMM includes a
-    // little-endian swap for SSE. Thanks to Peter Cordes for help
-    // with packing and unpacking.
-    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
-    __m128i a = UnpackXMM<0>(block0, block1);
-    __m128i b = UnpackXMM<1>(block0, block1);
-    __m128i c = UnpackXMM<2>(block0, block1);
-    __m128i d = UnpackXMM<3>(block0, block1);
-    __m128i e = UnpackXMM<4>(block0, block1);
-    __m128i f = UnpackXMM<5>(block0, block1);
-    __m128i g = UnpackXMM<6>(block0, block1);
-    __m128i h = UnpackXMM<7>(block0, block1);
-
-    const unsigned int rounds = 80;
-    __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
-    __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
-
-    const unsigned int MASK = 15;
-    for (int i=0; i < static_cast<int>(rounds); i+=4)
-    {
-        __m128i k, kr, t1, t2, t3, t4;
-        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));
-
-        // Shuffle out key
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
-
-        t1 = _mm_xor_si128(a, counter);
-        t3 = _mm_xor_si128(e, counter);
-        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
-        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
-        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
-
-        t1 = _mm_xor_si128(b, counter);
-        t3 = _mm_xor_si128(f, counter);
-        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
-        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
-        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
-
-        t1 = _mm_xor_si128(c, counter);
-        t3 = _mm_xor_si128(g, counter);
-        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
-        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
-        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
-
-        t1 = _mm_xor_si128(d, counter);
-        t3 = _mm_xor_si128(h, counter);
-        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
-        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
-        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
-
-        counter = _mm_add_epi16(counter, increment);
-    }
-
-    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
-    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
-    block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
-}
-
-inline void CHAM64_Dec_2_Blocks(__m128i &block0,
-    __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
-{
-    // Rearrange the data for vectorization. UnpackXMM includes a
-    // little-endian swap for SSE. Thanks to Peter Cordes for help
-    // with packing and unpacking.
-    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
-    __m128i a = UnpackXMM<0>(block0, block1);
-    __m128i b = UnpackXMM<1>(block0, block1);
-    __m128i c = UnpackXMM<2>(block0, block1);
-    __m128i d = UnpackXMM<3>(block0, block1);
-    __m128i e = UnpackXMM<4>(block0, block1);
-    __m128i f = UnpackXMM<5>(block0, block1);
-    __m128i g = UnpackXMM<6>(block0, block1);
-    __m128i h = UnpackXMM<7>(block0, block1);
-
-    const unsigned int rounds = 80;
-    __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
-    __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
-
-    const unsigned int MASK = 15;
-    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
-    {
-        __m128i k, kr, t1, t2, t3, t4;
-        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));
-
-        // Shuffle out key
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
-
-        // Odd round
-        t1 = RotateRight16<1>(d);
-        t3 = RotateRight16<1>(h);
-        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
-        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
-
-        // Even round
-        t1 = RotateRight16<8>(c);
-        t3 = RotateRight16<8>(g);
-        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
-        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
-
-        // Odd round
-        t1 = RotateRight16<1>(b);
-        t3 = RotateRight16<1>(f);
-        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
-        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
-        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
-
-        // Even round
-        t1 = RotateRight16<8>(a);
-        t3 = RotateRight16<8>(e);
-        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
-        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
-        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
-        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
-
-        counter = _mm_sub_epi16(counter, decrement);
-    }
-
-    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
-    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
-    block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
-}
-
-NAMESPACE_END // W16
-
-//////////////////////////////////////////////////////////////////////////
-
 NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size
 
 template <unsigned int R>
@@ -1054,20 +460,6 @@ ANONYMOUS_NAMESPACE_END
 NAMESPACE_BEGIN(CryptoPP)
 
 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
-size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
-    return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
-        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
-    return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
-        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
 size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
diff --git a/simeck.cpp b/simeck.cpp
index 982f5f4a..a1a5316e 100644
--- a/simeck.cpp
+++ b/simeck.cpp
@@ -33,16 +33,6 @@ ANONYMOUS_NAMESPACE_END
 
 NAMESPACE_BEGIN(CryptoPP)
 
-#if CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-# if (CRYPTOPP_SSSE3_AVAILABLE)
-extern size_t SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-
-extern size_t SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-# endif // CRYPTOPP_SSSE3_AVAILABLE
-#endif // CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-
 std::string SIMECK32::Base::AlgorithmProvider() const
 {
     return "C++";
@@ -104,10 +94,6 @@ void SIMECK32::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 
 std::string SIMECK64::Base::AlgorithmProvider() const
 {
-#if (CRYPTOPP_SSSE3_AVAILABLE)
-    if (HasSSSE3())
-        return "SSSE3";
-#endif
     return "C++";
 }
 
@@ -165,30 +151,4 @@ void SIMECK64::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
     oblock(m_t[0])(m_t[1]);
 }
 
-#if CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-size_t SIMECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
-    byte *outBlocks, size_t length, word32 flags) const
-{
-# if (CRYPTOPP_SSSE3_AVAILABLE)
-    if (HasSSSE3()) {
-        return SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
-            inBlocks, xorBlocks, outBlocks, length, flags);
-    }
-# endif // CRYPTOPP_SSSE3_AVAILABLE
-    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t SIMECK64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
-    byte *outBlocks, size_t length, word32 flags) const
-{
-# if (CRYPTOPP_SSSE3_AVAILABLE)
-    if (HasSSSE3()) {
-        return SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, ROUNDS,
-            inBlocks, xorBlocks, outBlocks, length, flags);
-    }
-# endif // CRYPTOPP_SSSE3_AVAILABLE
-    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
-}
-#endif // CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-
 NAMESPACE_END
diff --git a/simeck.h b/simeck.h
index 918400e2..398198d7 100644
--- a/simeck.h
+++ b/simeck.h
@@ -17,19 +17,6 @@
 #include "secblock.h"
 #include "algparam.h"
 
-#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
-# define CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS 1
-#endif
-
-// Yet another SunStudio/SunCC workaround. Failed self tests
-// in SSE code paths on i386 for SunStudio 12.3 and below.
-#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
-# undef CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-#endif
-
-// https://github.com/weidai11/cryptopp/issues/945
-#undef CRYPTOPP_SIMECK_ADVANCED_PROCESS_BLOCKS
-
 NAMESPACE_BEGIN(CryptoPP)
 
 /// \brief SIMECK block cipher information
diff --git a/simeck_simd.cpp b/simeck_simd.cpp
deleted file mode 100644
index 03263b1d..00000000
--- a/simeck_simd.cpp
+++ /dev/null
@@ -1,342 +0,0 @@
-// simeck_simd.cpp - written and placed in the public domain by Gangqiang Yang and Jeffrey Walton.
-//
-// This source file uses intrinsics and built-ins to gain access to
-// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
-// source file is needed because additional CXXFLAGS are required to enable
-// the appropriate instructions sets in some build configurations.
-
-#include "pch.h"
-#include "config.h"
-
-#include "simeck.h"
-#include "misc.h"
-
-// Uncomment for benchmarking C++ against SSE or NEON.
-// Do so in both simon.cpp and simon_simd.cpp.
-// #undef CRYPTOPP_SSSE3_AVAILABLE
-// #undef CRYPTOPP_ARM_NEON_AVAILABLE
-
-#if (CRYPTOPP_SSSE3_AVAILABLE)
-# include "adv_simd.h"
-# include <pmmintrin.h>
-# include <tmmintrin.h>
-#endif
-
-#if defined(__XOP__)
-# include <ammintrin.h>
-# if defined(__GNUC__)
-#  include <x86intrin.h>
-# endif
-#endif
-
-// Squash MS LNK4221 and libtool warnings
-extern const char SIMECK_SIMD_FNAME[] = __FILE__;
-
-// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
-#define M128_CAST(x) ((__m128i *)(void *)(x))
-#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
-
-ANONYMOUS_NAMESPACE_BEGIN
-
-using CryptoPP::word16;
-using CryptoPP::word32;
-
-#if (CRYPTOPP_SSSE3_AVAILABLE)
-
-//////////////////////////////////////////////////////////////////////////
-
-template <unsigned int R>
-inline __m128i RotateLeft32(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, R);
-#else
-    return _mm_or_si128(
-        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
-#endif
-}
-
-template <unsigned int R>
-inline __m128i RotateRight32(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 32-R);
-#else
-    return _mm_or_si128(
-        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline __m128i RotateLeft32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 8);
-#else
-    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
-    return _mm_shuffle_epi8(val, mask);
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline __m128i RotateRight32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 32-8);
-#else
-    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
-    return _mm_shuffle_epi8(val, mask);
-#endif
-}
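The two specializations above replace the generic two-shifts-and-an-OR rotate with a single byte shuffle. A self-contained sketch of that equivalence (illustrative, not part of the patch):

#include <tmmintrin.h>  // _mm_shuffle_epi8 (SSSE3); pulls in SSE2 headers

// Rotate each 32-bit lane left by 8 via a byte permutation: within every
// little-endian 4-byte lane, result byte k is source byte (k + 3) % 4,
// which is exactly the mask used above.
static __m128i rotl8_by_shuffle(__m128i v)
{
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(v, mask);
}

// Reference implementation: two shifts and an OR per lane.
static __m128i rotl8_by_shifts(__m128i v)
{
    return _mm_or_si128(_mm_slli_epi32(v, 8), _mm_srli_epi32(v, 24));
}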
-
-/// \brief Unpack XMM words
-/// \tparam IDX the element from each XMM word
-/// \param a the first XMM word
-/// \param b the second XMM word
-/// \param c the third XMM word
-/// \param d the fourth XMM word
-/// \details UnpackXMM selects the IDX element from a, b, c, d and returns a concatenation
-/// equivalent to a[IDX] || b[IDX] || c[IDX] || d[IDX].
-template <unsigned int IDX>
-inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    // Should not be instantiated
-    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
-    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
-    CRYPTOPP_ASSERT(0);
-    return _mm_setzero_si128();
-}
-
-template <>
-inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    const __m128i r1 = _mm_unpacklo_epi32(a, b);
-    const __m128i r2 = _mm_unpacklo_epi32(c, d);
-    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
-        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
-}
-
-template <>
-inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    const __m128i r1 = _mm_unpacklo_epi32(a, b);
-    const __m128i r2 = _mm_unpacklo_epi32(c, d);
-    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
-        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
-}
-
-template <>
-inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    const __m128i r1 = _mm_unpackhi_epi32(a, b);
-    const __m128i r2 = _mm_unpackhi_epi32(c, d);
-    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
-        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
-}
-
-template <>
-inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    const __m128i r1 = _mm_unpackhi_epi32(a, b);
-    const __m128i r2 = _mm_unpackhi_epi32(c, d);
-    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
-        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
-}
-
-/// \brief Unpack a XMM word
-/// \tparam IDX the element from each XMM word
-/// \param v the first XMM word
-/// \details UnpackXMM selects the IDX element from v and returns a concatenation
-/// equivalent to v[IDX] || v[IDX] || v[IDX] || v[IDX].
-template <unsigned int IDX>
-inline __m128i UnpackXMM(const __m128i& v)
-{
-    // Should not be instantiated
-    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
-    return _mm_setzero_si128();
-}
-
-template <>
-inline __m128i UnpackXMM<0>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
-}
-
-template <>
-inline __m128i UnpackXMM<1>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
-}
-
-template <>
-inline __m128i UnpackXMM<2>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
-}
-
-template <>
-inline __m128i UnpackXMM<3>(const __m128i& v)
-{
-    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
-}
-
-template <unsigned int IDX>
-inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
-    return UnpackXMM<IDX>(a, b, c, d);
-}
-
-template <unsigned int IDX>
-inline __m128i RepackXMM(const __m128i& v)
-{
-    return UnpackXMM<IDX>(v);
-}
-
-inline void SIMECK64_Encrypt(__m128i &a, __m128i &b, __m128i &c, __m128i &d, const __m128i key)
-{
-    // SunStudio 12.3 workaround
-    __m128i s, t; s = a; t = c;
-    a = _mm_xor_si128(_mm_and_si128(a, RotateLeft32<5>(a)), RotateLeft32<1>(a));
-    c = _mm_xor_si128(_mm_and_si128(c, RotateLeft32<5>(c)), RotateLeft32<1>(c));
-    a = _mm_xor_si128(a, _mm_xor_si128(b, key));
-    c = _mm_xor_si128(c, _mm_xor_si128(d, key));
-    b = s; d = t;
-}
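Each call to SIMECK64_Encrypt above advances two interleaved lane groups by one round. In scalar form the round is as follows (a sketch with illustrative names, mirroring the vector code above):

#include <cstdint>

static inline uint32_t rol32(uint32_t v, unsigned int r)
{
    return (v << r) | (v >> (32 - r));
}

// One SIMECK-64 round on the state (x, y) with round key k:
//   x' = (x & ROL5(x)) ^ ROL1(x) ^ y ^ k,  y' = x
static void simeck64_round(uint32_t &x, uint32_t &y, uint32_t k)
{
    const uint32_t t = x;
    x = (x & rol32(x, 5)) ^ rol32(x, 1) ^ y ^ k;
    y = t;
}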
-
-inline void SIMECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned int /*rounds*/)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
-    __m128i a = UnpackXMM<0>(block0);
-    __m128i b = UnpackXMM<1>(block0);
-    __m128i c = UnpackXMM<2>(block0);
-    __m128i d = UnpackXMM<3>(block0);
-
-    const unsigned int rounds = 44;
-    for (int i = 0; i < static_cast<int>(rounds); i += 4)
-    {
-        const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
-    }
-
-    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
-    block0 = RepackXMM<0>(a,b,c,d);
-}
-
-inline void SIMECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned int /*rounds*/)
-{
-    // SIMECK requires a word swap for the decryption transform
-    __m128i w = _mm_shuffle_epi32(block0, _MM_SHUFFLE(2, 3, 0, 1));
-
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
-    __m128i a = UnpackXMM<0>(w);
-    __m128i b = UnpackXMM<1>(w);
-    __m128i c = UnpackXMM<2>(w);
-    __m128i d = UnpackXMM<3>(w);
-
-    const unsigned int rounds = 44;
-    for (int i = static_cast<int>(rounds)-1; i >= 0; i -= 4)
-    {
-        const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i - 3));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
-    }
-
-    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
-    w = RepackXMM<0>(a,b,c,d);
-
-    block0 = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
-}
-
-inline void SIMECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1,
-    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
-    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
-    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
-    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
-    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
-
-    const unsigned int rounds = 44;
-    for (int i = 0; i < static_cast<int>(rounds); i += 4)
-    {
-        const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
-    }
-
-    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
-    block0 = RepackXMM<0>(a, b, c, d);
-    block1 = RepackXMM<1>(a, b, c, d);
-    block2 = RepackXMM<2>(a, b, c, d);
-    block3 = RepackXMM<3>(a, b, c, d);
-}
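A note on the _MM_SHUFFLE(2, 3, 0, 1) shuffles in the decryption paths above: SIMECK decryption is the encryption round applied with the two 32-bit halves of each 64-bit block exchanged, so the code swaps halves once before and once after the rounds. A minimal illustration:

#include <emmintrin.h>  // _mm_shuffle_epi32 (SSE2)

// Exchange the two 32-bit halves of each 64-bit block held in an XMM word:
// lanes (0,1,2,3) -> (1,0,3,2), i.e. _MM_SHUFFLE(2, 3, 0, 1).
static __m128i swap_block_halves(__m128i v)
{
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1));
}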
-
-inline void SIMECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1,
-    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
-{
-    // SIMECK requires a word swap for the decryption transform
-    __m128i w = _mm_shuffle_epi32(block0, _MM_SHUFFLE(2, 3, 0, 1));
-    __m128i x = _mm_shuffle_epi32(block1, _MM_SHUFFLE(2, 3, 0, 1));
-    __m128i y = _mm_shuffle_epi32(block2, _MM_SHUFFLE(2, 3, 0, 1));
-    __m128i z = _mm_shuffle_epi32(block3, _MM_SHUFFLE(2, 3, 0, 1));
-
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
-    __m128i a = UnpackXMM<0>(w, x, y, z);
-    __m128i b = UnpackXMM<1>(w, x, y, z);
-    __m128i c = UnpackXMM<2>(w, x, y, z);
-    __m128i d = UnpackXMM<3>(w, x, y, z);
-
-    const unsigned int rounds = 44;
-    for (int i = static_cast<int>(rounds)-1; i >= 0; i -= 4)
-    {
-        const __m128i key = _mm_loadu_si128(CONST_M128_CAST(subkeys + i - 3));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(3, 3, 3, 3)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(2, 2, 2, 2)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(1, 1, 1, 1)));
-        SIMECK64_Encrypt(a, b, c, d, _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0)));
-    }
-
-    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
-    w = RepackXMM<0>(a, b, c, d);
-    x = RepackXMM<1>(a, b, c, d);
-    y = RepackXMM<2>(a, b, c, d);
-    z = RepackXMM<3>(a, b, c, d);
-
-    block0 = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
-    block1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
-    block2 = _mm_shuffle_epi32(y, _MM_SHUFFLE(2, 3, 0, 1));
-    block3 = _mm_shuffle_epi32(z, _MM_SHUFFLE(2, 3, 0, 1));
-}
-
-#endif // CRYPTOPP_SSSE3_AVAILABLE
-
-ANONYMOUS_NAMESPACE_END
-
-NAMESPACE_BEGIN(CryptoPP)
-
-#if defined(CRYPTOPP_SSSE3_AVAILABLE)
-size_t SIMECK64_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
-    return AdvancedProcessBlocks64_4x1_SSE(SIMECK64_Enc_Block, SIMECK64_Enc_4_Blocks,
-        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t SIMECK64_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
-    return AdvancedProcessBlocks64_4x1_SSE(SIMECK64_Dec_Block, SIMECK64_Dec_4_Blocks,
-        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-#endif // CRYPTOPP_SSSE3_AVAILABLE
-
-NAMESPACE_END
diff --git a/simon.cpp b/simon.cpp
index f00ba3a0..a0eacea4 100644
--- a/simon.cpp
+++ b/simon.cpp
@@ -196,14 +196,6 @@ ANONYMOUS_NAMESPACE_END
 
 NAMESPACE_BEGIN(CryptoPP)
 
-#if (CRYPTOPP_ARM_NEON_AVAILABLE)
-extern size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-
-extern size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-#endif
-
 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
 extern size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -212,14 +204,6 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, siz
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 #endif
 
-#if (CRYPTOPP_SSE41_AVAILABLE)
-extern size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-
-extern size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-#endif
-
 #if (CRYPTOPP_SSSE3_AVAILABLE)
 extern size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -228,14 +212,6 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 #endif
 
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-extern size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-
-extern size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
-    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-#endif
-
 #if (CRYPTOPP_ALTIVEC_AVAILABLE)
 extern size_t SIMON128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -246,39 +222,11 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys,
 
 std::string SIMON64::Base::AlgorithmProvider() const
 {
-#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
-# if (CRYPTOPP_SSE41_AVAILABLE)
-    if (HasSSE41())
-        return "SSE4.1";
-# endif
-# if (CRYPTOPP_ARM_NEON_AVAILABLE)
-    if (HasNEON())
-        return "NEON";
-# endif
-# if (CRYPTOPP_ALTIVEC_AVAILABLE)
-    if (HasAltivec())
-        return "Altivec";
-# endif
-#endif
     return "C++";
 }
 
 unsigned int SIMON64::Base::OptimalDataAlignment() const
 {
-#if (CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
-# if (CRYPTOPP_SSE41_AVAILABLE)
-    if (HasSSE41())
-        return 16;  // load __m128i
-# endif
-# if (CRYPTOPP_ARM_NEON_AVAILABLE)
-    if (HasNEON())
-        return 4;  // load uint32x4_t
-# endif
-# if (CRYPTOPP_ALTIVEC_AVAILABLE)
-    if (HasAltivec())
-        return 16;  // load uint32x4_p
-# endif
-#endif
     return GetAlignmentOf<word32>();
 }
@@ -311,29 +259,6 @@ void SIMON64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
     default:
         CRYPTOPP_ASSERT(0);
     }
-
-#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
-
-    // Pre-splat the round keys for Altivec forward transformation
-#if CRYPTOPP_ALTIVEC_AVAILABLE
-    if (IsForwardTransformation() && HasAltivec())
-    {
-        AlignedSecBlock presplat(m_rkeys.size()*4);
-        for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
-            presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
-        m_rkeys.swap(presplat);
-    }
-#endif
-#endif  // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
 typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
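The deleted pre-splat step in UncheckedSetKey above traded key-schedule memory for speed: each 32-bit round key was stored four times so the forward SIMD paths could fetch it with one aligned 128-bit load rather than a per-round scalar broadcast. A hedged sketch of the idea (illustrative types, not library code):

#include <cstdint>
#include <cstddef>
#include <vector>

// Replicate each round key into a 4-lane group so vector code can read
// key i with a single aligned 128-bit load instead of splatting a scalar.
static std::vector<uint32_t> presplat_keys(const std::vector<uint32_t> &rk)
{
    std::vector<uint32_t> out(rk.size() * 4);
    for (size_t i = 0, j = 0; i < rk.size(); ++i, j += 4)
        out[j+0] = out[j+1] = out[j+2] = out[j+3] = rk[i];
    return out;
}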
diff --git a/simon64_simd.cpp b/simon64_simd.cpp
deleted file mode 100644
index 0702d782..00000000
--- a/simon64_simd.cpp
+++ /dev/null
@@ -1,864 +0,0 @@
-// simon_simd.cpp - written and placed in the public domain by Jeffrey Walton
-//
-// This source file uses intrinsics and built-ins to gain access to
-// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
-// source file is needed because additional CXXFLAGS are required to enable
-// the appropriate instructions sets in some build configurations.
-
-#include "pch.h"
-#include "config.h"
-
-#include "simon.h"
-#include "misc.h"
-
-// Uncomment for benchmarking C++ against SSE or NEON.
-// Do so in both simon.cpp and simon_simd.cpp.
-// #undef CRYPTOPP_SSE41_AVAILABLE
-// #undef CRYPTOPP_ARM_NEON_AVAILABLE
-
-#if (CRYPTOPP_SSE41_AVAILABLE)
-# include "adv_simd.h"
-# include <emmintrin.h>
-# include <smmintrin.h>
-# include <tmmintrin.h>
-#endif
-
-#if defined(__XOP__)
-# include <ammintrin.h>
-# if defined(__GNUC__)
-#  include <x86intrin.h>
-# endif
-#endif
-
-#if (CRYPTOPP_ARM_NEON_HEADER)
-# include "adv_simd.h"
-# include <arm_neon.h>
-#endif
-
-#if (CRYPTOPP_ARM_ACLE_HEADER)
-# include <stdint.h>
-# include <arm_acle.h>
-#endif
-
-#if defined(_M_ARM64)
-# include "adv_simd.h"
-#endif
-
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-# include "adv_simd.h"
-# include "ppc_simd.h"
-#endif
-
-// Squash MS LNK4221 and libtool warnings
-extern const char SIMON64_SIMD_FNAME[] = __FILE__;
-
-ANONYMOUS_NAMESPACE_BEGIN
-
-using CryptoPP::byte;
-using CryptoPP::word32;
-using CryptoPP::word64;
-using CryptoPP::vec_swap;  // SunCC
-
-// *************************** ARM NEON ************************** //
-
-#if (CRYPTOPP_ARM_NEON_AVAILABLE)
-
-template <class T>
-inline T UnpackHigh32(const T& a, const T& b)
-{
-    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
-    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
-    const uint32x2x2_t r = vzip_u32(x, y);
-    return (T)vcombine_u32(r.val[0], r.val[1]);
-}
-
-template <class T>
-inline T UnpackLow32(const T& a, const T& b)
-{
-    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
-    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
-    const uint32x2x2_t r = vzip_u32(x, y);
-    return (T)vcombine_u32(r.val[0], r.val[1]);
-}
-
-template <unsigned int R>
-inline uint32x4_t RotateLeft32(const uint32x4_t& val)
-{
-    const uint32x4_t a(vshlq_n_u32(val, R));
-    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
-    return vorrq_u32(a, b);
-}
-
-template <unsigned int R>
-inline uint32x4_t RotateRight32(const uint32x4_t& val)
-{
-    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
-    const uint32x4_t b(vshrq_n_u32(val, R));
-    return vorrq_u32(a, b);
-}
-
-#if defined(__aarch32__) || defined(__aarch64__)
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
-{
-    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
-    const uint8x16_t mask = vld1q_u8(maskb);
-
-    return vreinterpretq_u32_u8(
-        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
-{
-    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
-    const uint8x16_t mask = vld1q_u8(maskb);
-
-    return vreinterpretq_u32_u8(
-        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
-}
-#endif
-
-inline uint32x4_t SIMON64_f(const uint32x4_t& val)
-{
-    return veorq_u32(RotateLeft32<2>(val),
-        vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
-}
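SIMON64_f above is the SIMON mixing function f(x) = ROL2(x) XOR (ROL1(x) AND ROL8(x)), and the loops below apply it in two-round pairs. A scalar sketch (illustrative names):

#include <cstdint>

static inline uint32_t rol32(uint32_t v, unsigned int r)
{
    return (v << r) | (v >> (32 - r));
}

static inline uint32_t simon64_f(uint32_t x)
{
    return (rol32(x, 1) & rol32(x, 8)) ^ rol32(x, 2);
}

// One two-round SIMON-64 step, matching the vector loop bodies below:
//   y ^= f(x) ^ rk1;  x ^= f(y) ^ rk2;
static void simon64_round_pair(uint32_t &x, uint32_t &y, uint32_t rk1, uint32_t rk2)
{
    y ^= simon64_f(x) ^ rk1;
    x ^= simon64_f(y) ^ rk2;
}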
-
-inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-
-    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
-    {
-        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
-
-        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
-        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
-    }
-
-    if (rounds & 1)
-    {
-        const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);
-
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
-        std::swap(x1, y1);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = UnpackLow32(y1, x1);
-    block1 = UnpackHigh32(y1, x1);
-}
-
-inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-
-    if (rounds & 1)
-    {
-        std::swap(x1, y1);
-        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
-
-        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
-        rounds--;
-    }
-
-    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
-    {
-        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
-        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
-
-        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = UnpackLow32(y1, x1);
-    block1 = UnpackHigh32(y1, x1);
-}
-
-inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
-    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
-
-    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
-    {
-        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
-        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
-        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);
-
-        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
-        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
-        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
-        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
-    }
-
-    if (rounds & 1)
-    {
-        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
-
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
-        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
-        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
-        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = UnpackLow32(y1, x1);
-    block1 = UnpackHigh32(y1, x1);
-    block2 = UnpackLow32(y2, x2);
-    block3 = UnpackHigh32(y2, x2);
-    block4 = UnpackLow32(y3, x3);
-    block5 = UnpackHigh32(y3, x3);
-}
-
-inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
-    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
-
-    if (rounds & 1)
-    {
-        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
-        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
-
-        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
-        y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
-        y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
-        rounds--;
-    }
-
-    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
-    {
-        const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
-        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
-        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
-        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);
-
-        const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
-        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
-        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
-        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = UnpackLow32(y1, x1);
-    block1 = UnpackHigh32(y1, x1);
-    block2 = UnpackLow32(y2, x2);
-    block3 = UnpackHigh32(y2, x2);
-    block4 = UnpackLow32(y3, x3);
-    block5 = UnpackHigh32(y3, x3);
-}
-
-#endif // CRYPTOPP_ARM_NEON_AVAILABLE
-
-// ***************************** IA-32 ***************************** //
-
-#if (CRYPTOPP_SSE41_AVAILABLE)
-
-// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
-#ifndef M128_CAST
-# define M128_CAST(x) ((__m128i *)(void *)(x))
-#endif
-#ifndef CONST_M128_CAST
-# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
-#endif
-
-inline void Swap128(__m128i& a,__m128i& b)
-{
-#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
-    // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
-    // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
-    vec_swap(a, b);
-#else
-    std::swap(a, b);
-#endif
-}
-
-template <unsigned int R>
-inline __m128i RotateLeft32(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, R);
-#else
-    return _mm_or_si128(
-        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
-#endif
-}
-
-template <unsigned int R>
-inline __m128i RotateRight32(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 32-R);
-#else
-    return _mm_or_si128(
-        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-__m128i RotateLeft32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 8);
-#else
-    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
-    return _mm_shuffle_epi8(val, mask);
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-__m128i RotateRight32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
-    return _mm_roti_epi32(val, 32-8);
-#else
-    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
-    return _mm_shuffle_epi8(val, mask);
-#endif
-}
-
-inline __m128i SIMON64_f(const __m128i& v)
-{
-    return _mm_xor_si128(RotateLeft32<2>(v),
-        _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
-}
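The SSE4.1 routines below deinterleave two blocks into x (odd 32-bit lanes) and y (even lanes) using float shuffles, since _mm_shuffle_ps can gather lanes from two different source registers where the integer unpacks cannot. A minimal illustration of that step (names are mine):

#include <emmintrin.h>
#include <xmmintrin.h>

// Deinterleave two XMM words into odd lanes (x) and even lanes (y):
// x = [a1 a3 b1 b3], y = [a0 a2 b0 b2].
static void deinterleave_odd_even(__m128i a, __m128i b, __m128i &x, __m128i &y)
{
    const __m128 fa = _mm_castsi128_ps(a);
    const __m128 fb = _mm_castsi128_ps(b);
    x = _mm_castps_si128(_mm_shuffle_ps(fa, fb, _MM_SHUFFLE(3, 1, 3, 1)));
    y = _mm_castps_si128(_mm_shuffle_ps(fa, fb, _MM_SHUFFLE(2, 0, 2, 0)));
}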
-
-inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const __m128 t0 = _mm_castsi128_ps(block0);
-    const __m128 t1 = _mm_castsi128_ps(block1);
-    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
-    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
-    {
-        // Round keys are pre-splated in forward direction
-        const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
-
-        const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*4));
-        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
-    }
-
-    if (rounds & 1)
-    {
-        // Round keys are pre-splated in forward direction
-        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*4));
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
-        Swap128(x1, y1);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = _mm_unpacklo_epi32(y1, x1);
-    block1 = _mm_unpackhi_epi32(y1, x1);
-}
-
-inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const __m128 t0 = _mm_castsi128_ps(block0);
-    const __m128 t1 = _mm_castsi128_ps(block1);
-    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
-    if (rounds & 1)
-    {
-        Swap128(x1, y1);
-        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
-        rounds--;
-    }
-
-    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
-    {
-        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
-        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
-
-        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = _mm_unpacklo_epi32(y1, x1);
-    block1 = _mm_unpackhi_epi32(y1, x1);
-}
-
-inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
-    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const __m128 t0 = _mm_castsi128_ps(block0);
-    const __m128 t1 = _mm_castsi128_ps(block1);
-    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
-    const __m128 t2 = _mm_castsi128_ps(block2);
-    const __m128 t3 = _mm_castsi128_ps(block3);
-    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
-
-    const __m128 t4 = _mm_castsi128_ps(block4);
-    const __m128 t5 = _mm_castsi128_ps(block5);
-    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
-
-    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
-    {
-        // Round keys are pre-splated in forward direction
-        const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
-        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
-        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);
-
-        const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*4));
-        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
-        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
-        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
-    }
-
-    if (rounds & 1)
-    {
-        // Round keys are pre-splated in forward direction
-        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*4));
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
-        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
-        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
-        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = _mm_unpacklo_epi32(y1, x1);
-    block1 = _mm_unpackhi_epi32(y1, x1);
-    block2 = _mm_unpacklo_epi32(y2, x2);
-    block3 = _mm_unpackhi_epi32(y2, x2);
-    block4 = _mm_unpacklo_epi32(y3, x3);
-    block5 = _mm_unpackhi_epi32(y3, x3);
-}
-
-inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
-    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
-    const word32 *subkeys, unsigned int rounds)
-{
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const __m128 t0 = _mm_castsi128_ps(block0);
-    const __m128 t1 = _mm_castsi128_ps(block1);
-    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
-    const __m128 t2 = _mm_castsi128_ps(block2);
-    const __m128 t3 = _mm_castsi128_ps(block3);
-    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
-
-    const __m128 t4 = _mm_castsi128_ps(block4);
-    const __m128 t5 = _mm_castsi128_ps(block5);
-    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
-    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
-
-    if (rounds & 1)
-    {
-        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
-        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
-        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
-        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
-        rounds--;
-    }
-
-    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
-    {
-        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
-        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
-        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
-        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);
-
-        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
-        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
-        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
-        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
-    }
-
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = _mm_unpacklo_epi32(y1, x1);
-    block1 = _mm_unpackhi_epi32(y1, x1);
-    block2 = _mm_unpacklo_epi32(y2, x2);
-    block3 = _mm_unpackhi_epi32(y2, x2);
-    block4 = _mm_unpacklo_epi32(y3, x3);
-    block5 = _mm_unpackhi_epi32(y3, x3);
-}
-
-#endif // CRYPTOPP_SSE41_AVAILABLE
-
-// ***************************** Altivec ***************************** //
-
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-
-using CryptoPP::uint8x16_p;
-using CryptoPP::uint32x4_p;
-
-using CryptoPP::VecAnd;
-using CryptoPP::VecXor;
-using CryptoPP::VecLoad;
-using CryptoPP::VecLoadAligned;
-using CryptoPP::VecPermute;
-
-// Rotate left by bit count
-template <unsigned int C>
-inline uint32x4_p RotateLeft32(const uint32x4_p val)
-{
-    const uint32x4_p m = {C, C, C, C};
-    return vec_rl(val, m);
-}
-
-// Rotate right by bit count
-template <unsigned int C>
-inline uint32x4_p RotateRight32(const uint32x4_p val)
-{
-    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
-    return vec_rl(val, m);
-}
-
-inline uint32x4_p SIMON64_f(const uint32x4_p val)
-{
-    return VecXor(RotateLeft32<2>(val),
-        VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
-}
-
-inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
-    const word32 *subkeys, unsigned int rounds)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
-    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
-    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
-#else
- uint32x4_p x1 = VecPermute(block0, block1, m1); - uint32x4_p y1 = VecPermute(block0, block1, m2); - - for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) - { - // Round keys are pre-splated in forward direction - const uint32x4_p rk1 = VecLoadAligned(subkeys+i*4); - const uint32x4_p rk2 = VecLoadAligned(subkeys+(i+1)*4); - - y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1); - x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2); - } - - if (rounds & 1) - { - // Round keys are pre-splated in forward direction - const uint32x4_p rk = VecLoadAligned(subkeys+(rounds-1)*4); - - y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk); - std::swap(x1, y1); - } - -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4}; - const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12}; -#else - const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20}; - const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28}; -#endif - - // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] - block0 = (uint32x4_p)VecPermute(x1, y1, m3); - block1 = (uint32x4_p)VecPermute(x1, y1, m4); -} - -inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, - const word32 *subkeys, unsigned int rounds) -{ -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; - const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24}; -#else - const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24}; - const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; -#endif - - // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... - uint32x4_p x1 = VecPermute(block0, block1, m1); - uint32x4_p y1 = VecPermute(block0, block1, m2); - - if (rounds & 1) - { - std::swap(x1, y1); -#if defined(_ARCH_PWR7) - const uint32x4_p rk = vec_splats(subkeys[rounds-1]); -#else - const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk = VecLoad(subkeys+rounds-1); - rk = VecPermute(rk, rk, m); -#endif - y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1)); - rounds--; - } - - for (int i = static_cast(rounds-2); i >= 0; i -= 2) - { -#if defined(_ARCH_PWR7) - const uint32x4_p rk1 = vec_splats(subkeys[i+1]); - const uint32x4_p rk2 = vec_splats(subkeys[i]); -#else - const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk1 = VecLoad(subkeys+i+1); - uint32x4_p rk2 = VecLoad(subkeys+i); - rk1 = VecPermute(rk1, rk1, m); - rk2 = VecPermute(rk2, rk2, m); -#endif - x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1); - y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2); - } - -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4}; - const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12}; -#else - const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20}; - const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28}; -#endif - - // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] - block0 = (uint32x4_p)VecPermute(x1, y1, m3); - block1 = (uint32x4_p)VecPermute(x1, y1, m4); -} - -inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, - uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4, - uint32x4_p &block5, const word32 *subkeys, unsigned int rounds) -{ -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; - const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24}; -#else - const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 
27,26,25,24}; - const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; -#endif - - // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1); - uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2); - uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1); - uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2); - uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1); - uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2); - - for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) - { - // Round keys are pre-splated in forward direction - const uint32x4_p rk1 = VecLoadAligned(subkeys+i*4); - const uint32x4_p rk2 = VecLoadAligned(subkeys+(i+1)*4); - - y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1); - y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1); - y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1); - - x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2); - x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2); - x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2); - } - - if (rounds & 1) - { - // Round keys are pre-splated in forward direction - const uint32x4_p rk = VecLoadAligned(subkeys+(rounds-1)*4); - - y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk); - y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk); - y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk); - std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); - } - -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4}; - const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12}; -#else - const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20}; - const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28}; -#endif - - // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... - block0 = (uint32x4_p)VecPermute(x1, y1, m3); - block1 = (uint32x4_p)VecPermute(x1, y1, m4); - block2 = (uint32x4_p)VecPermute(x2, y2, m3); - block3 = (uint32x4_p)VecPermute(x2, y2, m4); - block4 = (uint32x4_p)VecPermute(x3, y3, m3); - block5 = (uint32x4_p)VecPermute(x3, y3, m4); -} - -inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, - uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4, - uint32x4_p &block5, const word32 *subkeys, unsigned int rounds) -{ -#if (CRYPTOPP_BIG_ENDIAN) - const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; - const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24}; -#else - const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24}; - const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28}; -#endif - - // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
- uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
- uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
- uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
- uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
- uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
- uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
-
- if (rounds & 1)
- {
- std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
-#if defined(_ARCH_PWR7)
- const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
-#else
- const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
- uint32x4_p rk = VecLoad(subkeys+rounds-1);
- rk = VecPermute(rk, rk, m);
-#endif
- y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
- y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
- y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
- rounds--;
- }
-
- for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
- {
-#if defined(_ARCH_PWR7)
- const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
- const uint32x4_p rk2 = vec_splats(subkeys[i]);
-#else
- const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
- uint32x4_p rk1 = VecLoad(subkeys+i+1);
- uint32x4_p rk2 = VecLoad(subkeys+i);
- rk1 = VecPermute(rk1, rk1, m);
- rk2 = VecPermute(rk2, rk2, m);
-#endif
- x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
- x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
- x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);
-
- y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
- y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
- y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
- }
-
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
- const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
-#else
- const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
- const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
-#endif
-
- // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
- block0 = (uint32x4_p)VecPermute(x1, y1, m3); - block1 = (uint32x4_p)VecPermute(x1, y1, m4); - block2 = (uint32x4_p)VecPermute(x2, y2, m3); - block3 = (uint32x4_p)VecPermute(x2, y2, m4); - block4 = (uint32x4_p)VecPermute(x3, y3, m3); - block5 = (uint32x4_p)VecPermute(x3, y3, m4); -} - -#endif // CRYPTOPP_ALTIVEC_AVAILABLE - -ANONYMOUS_NAMESPACE_END - -/////////////////////////////////////////////////////////////////////// - -NAMESPACE_BEGIN(CryptoPP) - -// *************************** ARM NEON **************************** // - -#if (CRYPTOPP_ARM_NEON_AVAILABLE) -size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} - -size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} -#endif // CRYPTOPP_ARM_NEON_AVAILABLE - -// ***************************** IA-32 ***************************** // - -#if (CRYPTOPP_SSE41_AVAILABLE) -size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} - -size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} -#endif - -// ***************************** Altivec ***************************** // - -#if (CRYPTOPP_ALTIVEC_AVAILABLE) -size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} - -size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, - subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); -} -#endif - -NAMESPACE_END diff --git a/speck.cpp b/speck.cpp index 879dfc16..00b960d7 100644 --- a/speck.cpp +++ b/speck.cpp @@ -171,12 +171,6 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) #if (CRYPTOPP_ARM_NEON_AVAILABLE) -extern size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); - -extern size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); - extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t 
rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -200,14 +194,6 @@ extern size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 #endif

-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-extern size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-
-extern size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
-#endif
-
 #if (CRYPTOPP_ALTIVEC_AVAILABLE)
 extern size_t SPECK128_Enc_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys, size_t rounds,
 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -218,39 +204,11 @@ extern size_t SPECK128_Dec_AdvancedProcessBlocks_ALTIVEC(const word64* subKeys,

 std::string SPECK64::Base::AlgorithmProvider() const
 {
-#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
-# if (CRYPTOPP_SSE41_AVAILABLE)
- if (HasSSE41())
- return "SSE4.1";
-# endif
-# if (CRYPTOPP_ARM_NEON_AVAILABLE)
- if (HasNEON())
- return "NEON";
-# endif
-# if (CRYPTOPP_ALTIVEC_AVAILABLE)
- if (HasAltivec())
- return "Altivec";
-# endif
-#endif
 return "C++";
 }

 unsigned int SPECK64::Base::OptimalDataAlignment() const
 {
-#if (CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
-# if (CRYPTOPP_SSE41_AVAILABLE)
- if (HasSSE41())
- return 16; // load __m128i
-# endif
-# if (CRYPTOPP_ARM_NEON_AVAILABLE)
- if (HasNEON())
- return 4; // load uint32x4_t
-# endif
-# if (CRYPTOPP_ALTIVEC_AVAILABLE)
- if (HasAltivec())
- return 16; // load uint32x4_p
-# endif
-#endif
 return GetAlignmentOf<word32>();
 }

@@ -283,29 +241,6 @@ void SPECK64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
 default:
 CRYPTOPP_ASSERT(0);
 }
-
-#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
-
- // Pre-splat the round keys for Altivec forward transformation
-#if CRYPTOPP_ALTIVEC_AVAILABLE
- if (IsForwardTransformation() && HasAltivec())
- {
- AlignedSecBlock presplat(m_rkeys.size()*4);
- for (size_t i=0, j=0; i<m_rkeys.size(); i++, j+=4)
- presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = m_rkeys[i];
- m_rkeys.swap(presplat);
- }
-#endif
-#endif // CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
 typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
diff --git a/speck64_simd.cpp b/speck64_simd.cpp
deleted file mode 100644
index 204c0dca..00000000
--- a/speck64_simd.cpp
+++ /dev/null
@@ -1,781 +0,0 @@
-// speck64_simd.cpp - written and placed in the public domain by Jeffrey Walton
-//
-// This source file uses intrinsics and built-ins to gain access to
-// SSE4.1, ARM NEON and ARMv8a, and Altivec instructions. A separate
-// source file is needed because additional CXXFLAGS are required to enable
-// the appropriate instruction sets in some build configurations.
-
-#include "pch.h"
-#include "config.h"
-
-#include "speck.h"
-#include "misc.h"
-
-// Uncomment for benchmarking C++ against SSE or NEON.
-// Do so in both speck.cpp and speck64_simd.cpp.
-// #undef CRYPTOPP_SSE41_AVAILABLE
-// #undef CRYPTOPP_ARM_NEON_AVAILABLE
-
-#if (CRYPTOPP_SSE41_AVAILABLE)
-# include "adv_simd.h"
-# include <emmintrin.h>
-# include <tmmintrin.h>
-# include <smmintrin.h>
-#endif
-
-#if defined(__XOP__)
-# include <ammintrin.h>
-# if defined(__GNUC__)
-# include <x86intrin.h>
-# endif
-#endif
-
-#if (CRYPTOPP_ARM_NEON_HEADER)
-# include "adv_simd.h"
-# include <arm_neon.h>
-#endif
-
-#if (CRYPTOPP_ARM_ACLE_HEADER)
-# include <stdint.h>
-# include <arm_acle.h>
-#endif
-
-#if defined(_M_ARM64)
-# include "adv_simd.h"
-#endif
-
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-# include "adv_simd.h"
-# include "ppc_simd.h"
-#endif
-
-// Squash MS LNK4221 and libtool warnings
-extern const char SPECK64_SIMD_FNAME[] = __FILE__;
-
-ANONYMOUS_NAMESPACE_BEGIN
-
-using CryptoPP::byte;
-using CryptoPP::word32;
-using CryptoPP::word64;
-
-// *************************** ARM NEON ************************** //
-
-#if (CRYPTOPP_ARM_NEON_AVAILABLE)
-
-template <class T>
-inline T UnpackHigh32(const T& a, const T& b)
-{
- const uint32x2_t x(vget_high_u32((uint32x4_t)a));
- const uint32x2_t y(vget_high_u32((uint32x4_t)b));
- const uint32x2x2_t r = vzip_u32(x, y);
- return (T)vcombine_u32(r.val[0], r.val[1]);
-}
-
-template <class T>
-inline T UnpackLow32(const T& a, const T& b)
-{
- const uint32x2_t x(vget_low_u32((uint32x4_t)a));
- const uint32x2_t y(vget_low_u32((uint32x4_t)b));
- const uint32x2x2_t r = vzip_u32(x, y);
- return (T)vcombine_u32(r.val[0], r.val[1]);
-}
-
-template <unsigned int R>
-inline uint32x4_t RotateLeft32(const uint32x4_t& val)
-{
- const uint32x4_t a(vshlq_n_u32(val, R));
- const uint32x4_t b(vshrq_n_u32(val, 32 - R));
- return vorrq_u32(a, b);
-}
-
-template <unsigned int R>
-inline uint32x4_t RotateRight32(const uint32x4_t& val)
-{
- const uint32x4_t a(vshlq_n_u32(val, 32 - R));
- const uint32x4_t b(vshrq_n_u32(val, R));
- return vorrq_u32(a, b);
-}
-
-#if defined(__aarch32__) || defined(__aarch64__)
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
-{
- const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
- const uint8x16_t mask = vld1q_u8(maskb);
-
- return vreinterpretq_u32_u8(
- vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
-{
- const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
- const uint8x16_t mask = vld1q_u8(maskb);
-
- return vreinterpretq_u32_u8(
- vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
-}
-#endif // Aarch32 or Aarch64
-
-inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
- uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
-
- x1 = RotateRight32<8>(x1);
- x1 = vaddq_u32(x1, y1);
- x1 = veorq_u32(x1, rk);
- y1 = RotateLeft32<3>(y1);
- y1 = veorq_u32(y1, x1);
- }
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = UnpackLow32(y1, x1);
- block1 = UnpackHigh32(y1, x1);
-}
-
-inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
- uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
- const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
-
- y1 = veorq_u32(y1, x1);
- y1 = RotateRight32<3>(y1);
- x1 = veorq_u32(x1, rk);
- x1 = vsubq_u32(x1, y1);
- x1 = RotateLeft32<8>(x1);
- }
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = UnpackLow32(y1, x1);
- block1 = UnpackHigh32(y1, x1);
-}
-
-inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
- uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
- uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
- uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
- uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
- uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
- uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
-
- x1 = RotateRight32<8>(x1);
- x2 = RotateRight32<8>(x2);
- x3 = RotateRight32<8>(x3);
- x1 = vaddq_u32(x1, y1);
- x2 = vaddq_u32(x2, y2);
- x3 = vaddq_u32(x3, y3);
- x1 = veorq_u32(x1, rk);
- x2 = veorq_u32(x2, rk);
- x3 = veorq_u32(x3, rk);
- y1 = RotateLeft32<3>(y1);
- y2 = RotateLeft32<3>(y2);
- y3 = RotateLeft32<3>(y3);
- y1 = veorq_u32(y1, x1);
- y2 = veorq_u32(y2, x2);
- y3 = veorq_u32(y3, x3);
- }
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = UnpackLow32(y1, x1);
- block1 = UnpackHigh32(y1, x1);
- block2 = UnpackLow32(y2, x2);
- block3 = UnpackHigh32(y2, x2);
- block4 = UnpackLow32(y3, x3);
- block5 = UnpackHigh32(y3, x3);
-}
-
-inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
- uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
- uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
- uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
- uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
- uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
- uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
- const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
-
- y1 = veorq_u32(y1, x1);
- y2 = veorq_u32(y2, x2);
- y3 = veorq_u32(y3, x3);
- y1 = RotateRight32<3>(y1);
- y2 = RotateRight32<3>(y2);
- y3 = RotateRight32<3>(y3);
- x1 = veorq_u32(x1, rk);
- x2 = veorq_u32(x2, rk);
- x3 = veorq_u32(x3, rk);
- x1 = vsubq_u32(x1, y1);
- x2 = vsubq_u32(x2, y2);
- x3 = vsubq_u32(x3, y3);
- x1 = RotateLeft32<8>(x1);
- x2 = RotateLeft32<8>(x2);
- x3 = RotateLeft32<8>(x3);
- }
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = UnpackLow32(y1, x1);
- block1 = UnpackHigh32(y1, x1);
- block2 = UnpackLow32(y2, x2);
- block3 = UnpackHigh32(y2, x2);
- block4 = UnpackLow32(y3, x3);
- block5 = UnpackHigh32(y3, x3);
-}
-
-#endif // CRYPTOPP_ARM_NEON_AVAILABLE
-
-// ***************************** IA-32 ***************************** //
-
-#if (CRYPTOPP_SSE41_AVAILABLE)
-
-#ifndef M128_CAST
-# define M128_CAST(x) ((__m128i *)(void *)(x))
-#endif
-#ifndef CONST_M128_CAST
-# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
-#endif
-
-template <unsigned int R>
-inline __m128i RotateLeft32(const __m128i& val)
-{
-#if defined(__XOP__)
- return _mm_roti_epi32(val, R);
-#else
- return _mm_or_si128(
- _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
-#endif
-}
-
-template <unsigned int R>
-inline __m128i RotateRight32(const __m128i& val)
-{
-#if defined(__XOP__)
- return _mm_roti_epi32(val, 32-R);
-#else
- return _mm_or_si128(
- _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-__m128i RotateLeft32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
- return _mm_roti_epi32(val, 8);
-#else
- const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
- return _mm_shuffle_epi8(val, mask);
-#endif
-}
-
-// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
-template <>
-__m128i RotateRight32<8>(const __m128i& val)
-{
-#if defined(__XOP__)
- return _mm_roti_epi32(val, 32-8);
-#else
- const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
- return _mm_shuffle_epi8(val, mask);
-#endif
-}
-
-inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- const __m128 t0 = _mm_castsi128_ps(block0);
- const __m128 t1 = _mm_castsi128_ps(block1);
- __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
- __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- // Round keys are pre-splated in forward direction
- const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
-
- x1 = RotateRight32<8>(x1);
- x1 = _mm_add_epi32(x1, y1);
- x1 = _mm_xor_si128(x1, rk);
- y1 = RotateLeft32<3>(y1);
- y1 = _mm_xor_si128(y1, x1);
- }
-
- // This is roughly the SSE equivalent to ARM vzp32
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = _mm_unpacklo_epi32(y1, x1);
- block1 = _mm_unpackhi_epi32(y1, x1);
-}
-
-inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- const __m128 t0 = _mm_castsi128_ps(block0);
- const __m128 t1 = _mm_castsi128_ps(block1);
- __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
- __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
- const __m128i rk = _mm_set1_epi32(subkeys[i]);
-
- y1 = _mm_xor_si128(y1, x1);
- y1 = RotateRight32<3>(y1);
- x1 = _mm_xor_si128(x1, rk);
- x1 = _mm_sub_epi32(x1, y1);
- x1 = RotateLeft32<8>(x1);
- }
-
- // This is roughly the SSE equivalent to ARM vzp32
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = _mm_unpacklo_epi32(y1, x1);
- block1 = _mm_unpackhi_epi32(y1, x1);
-}
-
-inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
- __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- const __m128 t0 = _mm_castsi128_ps(block0);
- const __m128 t1 = _mm_castsi128_ps(block1);
- __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
- __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
- const __m128 t2 = _mm_castsi128_ps(block2);
- const __m128 t3 = _mm_castsi128_ps(block3);
- __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
- __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
-
- const __m128 t4 = _mm_castsi128_ps(block4);
- const __m128 t5 = _mm_castsi128_ps(block5);
- __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
- __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- // Round keys are pre-splated in forward direction
- const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+i*4));
-
- x1 = RotateRight32<8>(x1);
- x2 = RotateRight32<8>(x2);
- x3 = RotateRight32<8>(x3);
- x1 = _mm_add_epi32(x1, y1);
- x2 = _mm_add_epi32(x2, y2);
- x3 = _mm_add_epi32(x3, y3);
- x1 = _mm_xor_si128(x1, rk);
- x2 = _mm_xor_si128(x2, rk);
- x3 = _mm_xor_si128(x3, rk);
- y1 = RotateLeft32<3>(y1);
- y2 = RotateLeft32<3>(y2);
- y3 = RotateLeft32<3>(y3);
- y1 = _mm_xor_si128(y1, x1);
- y2 = _mm_xor_si128(y2, x2);
- y3 = _mm_xor_si128(y3, x3);
- }
-
- // This is roughly the SSE equivalent to ARM vzp32
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = _mm_unpacklo_epi32(y1, x1);
- block1 = _mm_unpackhi_epi32(y1, x1);
- block2 = _mm_unpacklo_epi32(y2, x2);
- block3 = _mm_unpackhi_epi32(y2, x2);
- block4 = _mm_unpacklo_epi32(y3, x3);
- block5 = _mm_unpackhi_epi32(y3, x3);
-}
-
-inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
- __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
- const word32 *subkeys, unsigned int rounds)
-{
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- const __m128 t0 = _mm_castsi128_ps(block0);
- const __m128 t1 = _mm_castsi128_ps(block1);
- __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
- __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
-
- const __m128 t2 = _mm_castsi128_ps(block2);
- const __m128 t3 = _mm_castsi128_ps(block3);
- __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
- __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
-
- const __m128 t4 = _mm_castsi128_ps(block4);
- const __m128 t5 = _mm_castsi128_ps(block5);
- __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
- __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
- const __m128i rk = _mm_set1_epi32(subkeys[i]);
-
- y1 = _mm_xor_si128(y1, x1);
- y2 = _mm_xor_si128(y2, x2);
- y3 = _mm_xor_si128(y3, x3);
- y1 = RotateRight32<3>(y1);
- y2 = RotateRight32<3>(y2);
- y3 = RotateRight32<3>(y3);
- x1 = _mm_xor_si128(x1, rk);
- x2 = _mm_xor_si128(x2, rk);
- x3 = _mm_xor_si128(x3, rk);
- x1 = _mm_sub_epi32(x1, y1);
- x2 = _mm_sub_epi32(x2, y2);
- x3 = _mm_sub_epi32(x3, y3);
- x1 = RotateLeft32<8>(x1);
- x2 = RotateLeft32<8>(x2);
- x3 = RotateLeft32<8>(x3);
- }
-
- // This is roughly the SSE equivalent to ARM vzp32
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = _mm_unpacklo_epi32(y1, x1);
- block1 = _mm_unpackhi_epi32(y1, x1);
- block2 = _mm_unpacklo_epi32(y2, x2);
- block3 = _mm_unpackhi_epi32(y2, x2);
- block4 = _mm_unpacklo_epi32(y3, x3);
- block5 = _mm_unpackhi_epi32(y3, x3);
-}
-
-#endif // CRYPTOPP_SSE41_AVAILABLE
-
-// ***************************** Altivec ***************************** //
-
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-using CryptoPP::uint8x16_p;
-using CryptoPP::uint32x4_p;
-
-using CryptoPP::VecAdd;
-using CryptoPP::VecSub;
-using CryptoPP::VecXor;
-using CryptoPP::VecLoad;
-using CryptoPP::VecLoadAligned;
-using CryptoPP::VecPermute;
-
-// Rotate left by bit count
-template <unsigned int C>
-inline uint32x4_p RotateLeft32(const uint32x4_p val)
-{
- const uint32x4_p m = {C, C, C, C};
- return vec_rl(val, m);
-}
-
-// Rotate right by bit count
-template <unsigned int C>
-inline uint32x4_p RotateRight32(const uint32x4_p val)
-{
- const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
- return vec_rl(val, m);
-}
-
-void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
- const word32 *subkeys, unsigned int rounds)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
- const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
-#else
- const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
- const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
-#endif
-
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_p x1 = VecPermute(block0, block1, m1);
- uint32x4_p y1 = VecPermute(block0, block1, m2);
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- // Round keys are pre-splated in forward direction
- const uint32x4_p rk = VecLoadAligned(subkeys+i*4);
-
- x1 = RotateRight32<8>(x1);
- x1 = VecAdd(x1, y1);
- x1 = VecXor(x1, rk);
-
- y1 = RotateLeft32<3>(y1);
- y1 = VecXor(y1, x1);
- }
-
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
- const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
-#else
- const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
- const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
-#endif
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = (uint32x4_p)VecPermute(x1, y1, m3);
- block1 = (uint32x4_p)VecPermute(x1, y1, m4);
-}
-
-void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
- const word32 *subkeys, unsigned int rounds)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
- const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
-#else
- const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
- const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
-#endif
-
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_p x1 = VecPermute(block0, block1, m1);
- uint32x4_p y1 = VecPermute(block0, block1, m2);
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
-#if defined(_ARCH_PWR7)
- const uint32x4_p rk = vec_splats(subkeys[i]);
-#else
- // subkeys has extra elements so memory backs the last subkey
- const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
- uint32x4_p rk = VecLoad(subkeys+i);
- rk = VecPermute(rk, rk, m);
-#endif
-
- y1 = VecXor(y1, x1);
- y1 = RotateRight32<3>(y1);
-
- x1 = VecXor(x1, rk);
- x1 = VecSub(x1, y1);
- x1 = RotateLeft32<8>(x1);
- }
-
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
- const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
-#else
- const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
- const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
-#endif
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = (uint32x4_p)VecPermute(x1, y1, m3);
- block1 = (uint32x4_p)VecPermute(x1, y1, m4);
-}
-
-void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
- uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
- uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
- const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
-#else
- const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
- const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
-#endif
-
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
- uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
- uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
- uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
- uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
- uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
-
- for (size_t i=0; i < static_cast<size_t>(rounds); ++i)
- {
- // Round keys are pre-splated in forward direction
- const uint32x4_p rk = VecLoadAligned(subkeys+i*4);
-
- x1 = RotateRight32<8>(x1);
- x2 = RotateRight32<8>(x2);
- x3 = RotateRight32<8>(x3);
-
- x1 = VecAdd(x1, y1);
- x2 = VecAdd(x2, y2);
- x3 = VecAdd(x3, y3);
-
- x1 = VecXor(x1, rk);
- x2 = VecXor(x2, rk);
- x3 = VecXor(x3, rk);
-
- y1 = RotateLeft32<3>(y1);
- y2 = RotateLeft32<3>(y2);
- y3 = RotateLeft32<3>(y3);
-
- y1 = VecXor(y1, x1);
- y2 = VecXor(y2, x2);
- y3 = VecXor(y3, x3);
- }
-
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
- const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
-#else
- const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
- const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
-#endif
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = (uint32x4_p)VecPermute(x1, y1, m3);
- block1 = (uint32x4_p)VecPermute(x1, y1, m4);
- block2 = (uint32x4_p)VecPermute(x2, y2, m3);
- block3 = (uint32x4_p)VecPermute(x2, y2, m4);
- block4 = (uint32x4_p)VecPermute(x3, y3, m3);
- block5 = (uint32x4_p)VecPermute(x3, y3, m4);
-}
-
-void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
- uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
- uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
- const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
-#else
- const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
- const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
-#endif
-
- // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
- uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
- uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
- uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
- uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
- uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
- uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
-
- for (int i = static_cast<int>(rounds-1); i >= 0; --i)
- {
-#if defined(_ARCH_PWR7)
- const uint32x4_p rk = vec_splats(subkeys[i]);
-#else
- // subkeys has extra elements so memory backs the last subkey
- const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
- uint32x4_p rk = VecLoad(subkeys+i);
- rk = VecPermute(rk, rk, m);
-#endif
-
- y1 = VecXor(y1, x1);
- y2 = VecXor(y2, x2);
- y3 = VecXor(y3, x3);
-
- y1 = RotateRight32<3>(y1);
- y2 = RotateRight32<3>(y2);
- y3 = RotateRight32<3>(y3);
-
- x1 = VecXor(x1, rk);
- x2 = VecXor(x2, rk);
- x3 = VecXor(x3, rk);
-
- x1 = VecSub(x1, y1);
- x2 = VecSub(x2, y2);
- x3 = VecSub(x3, y3);
-
- x1 = RotateLeft32<8>(x1);
- x2 = RotateLeft32<8>(x2);
- x3 = RotateLeft32<8>(x3);
- }
-
-#if (CRYPTOPP_BIG_ENDIAN)
- const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
- const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
-#else
- const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
- const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
-#endif
-
- // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
- block0 = (uint32x4_p)VecPermute(x1, y1, m3);
- block1 = (uint32x4_p)VecPermute(x1, y1, m4);
- block2 = (uint32x4_p)VecPermute(x2, y2, m3);
- block3 = (uint32x4_p)VecPermute(x2, y2, m4);
- block4 = (uint32x4_p)VecPermute(x3, y3, m3);
- block5 = (uint32x4_p)VecPermute(x3, y3, m4);
-}
-
-#endif // CRYPTOPP_ALTIVEC_AVAILABLE
-
-ANONYMOUS_NAMESPACE_END
-
-///////////////////////////////////////////////////////////////////////
-
-NAMESPACE_BEGIN(CryptoPP)
-
-// *************************** ARM NEON **************************** //
-
-#if (CRYPTOPP_ARM_NEON_AVAILABLE)
-size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-#endif
-
-// ***************************** IA-32 ***************************** //
-
-#if (CRYPTOPP_SSE41_AVAILABLE)
-size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-#endif
-
-// ***************************** Altivec ***************************** //
-
-#if (CRYPTOPP_ALTIVEC_AVAILABLE)
-size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-
-size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
- const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
-{
- return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
- subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
-}
-#endif
-
-NAMESPACE_END
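---

Note for readers following the removal: every SIMD path deleted above (SSE4.1, NEON, Altivec) vectorizes the same two scalar rounds, four cipher blocks per SIMD word. The sketch below restates those rounds in plain C++ so the deleted intrinsics are easier to audit. It is illustrative only and not part of the patch; RotL32/RotR32 are assumed helpers (the library itself has rotate utilities in misc.h), while the rotation constants and operation order are taken directly from the deleted SPECK64_Enc_Block/SPECK64_Dec_Block and SIMON64_f bodies.

    #include <cstdint>

    // Assumed helpers, not library functions.
    static inline uint32_t RotL32(uint32_t v, unsigned int r)
        { return (v << r) | (v >> (32 - r)); }
    static inline uint32_t RotR32(uint32_t v, unsigned int r)
        { return (v >> r) | (v << (32 - r)); }

    // One SPECK64 encryption round (cf. SPECK64_Enc_Block above):
    // x = RotR(x,8); x += y; x ^= rk; y = RotL(y,3); y ^= x
    static inline void SPECK64_EncRound(uint32_t& x, uint32_t& y, uint32_t rk)
    {
        x = RotR32(x, 8); x += y; x ^= rk;
        y = RotL32(y, 3); y ^= x;
    }

    // The inverse round (cf. SPECK64_Dec_Block above), undoing each step.
    static inline void SPECK64_DecRound(uint32_t& x, uint32_t& y, uint32_t rk)
    {
        y ^= x; y = RotR32(y, 3);
        x ^= rk; x -= y; x = RotL32(x, 8);
    }

    // SIMON64 round function (cf. SIMON64_f above):
    // f(x) = RotL(x,2) ^ (RotL(x,1) & RotL(x,8))
    static inline uint32_t SIMON64_F(uint32_t x)
    {
        return RotL32(x, 2) ^ (RotL32(x, 1) & RotL32(x, 8));
    }

The SIMD versions apply exactly these steps to uint32x4_t/__m128i/uint32x4_p lanes, which is why the one-block and six-block variants differ only in how many x/y pairs they carry through the loop.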
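A second note on the "Round keys are pre-splated in forward direction" comments that appear throughout the deleted code: for encryption, UncheckedSetKey expanded each 32-bit subkey into four consecutive words so the hot loops could fetch a ready-made {rk, rk, rk, rk} vector with one aligned load (VecLoadAligned or _mm_load_si128 at subkeys+i*4) instead of splatting a scalar every round. A minimal sketch of that transform, with std::vector standing in for the library's SecBlock (assumption, for self-containment):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Duplicate each round key four times so subkeys+i*4 backs a full
    // 128-bit vector for the forward (encrypt) SIMD loops.
    std::vector<uint32_t> PreSplat(const std::vector<uint32_t>& rkeys)
    {
        std::vector<uint32_t> presplat(rkeys.size() * 4);
        for (std::size_t i = 0, j = 0; i < rkeys.size(); i++, j += 4)
            presplat[j+0] = presplat[j+1] = presplat[j+2] = presplat[j+3] = rkeys[i];
        return presplat;
    }

The decrypt paths walk the schedule backwards and were left un-splatted, which is why they splat per round instead (vec_splats or _mm_set1_epi32, with the VecLoad-plus-VecPermute fallback on pre-POWER7 targets).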