Cleanup PPC vector functions

The Crypto++ functions follow IBM's lead and provide VectorLoad, VectorLoadBE, VectorStore, and VectorStoreBE. Additionally, VectorLoadKey was removed in favor of vanilla VectorLoad.
Jeffrey Walton 2018-08-06 05:15:12 -04:00
parent 9c27143522
commit 6cd7f83346
GPG Key ID: B36AB348921B1838
3 changed files with 122 additions and 161 deletions
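
In new-code terms the split is: VectorLoadBE/VectorStoreBE pin the data layout to big endian (byte-swapping on little endian hosts), while VectorLoad/VectorStore move bytes verbatim and absorb the old VectorLoadKey. A minimal sketch of the resulting call pattern, assuming a POWER8 build with the Crypto++ PPC vector header on the include path (demo() and its buffers are hypothetical):

    #include "ppc_simd.h"   // assumption: header name; it has varied across releases

    void demo(const CryptoPP::byte msg[16], const CryptoPP::byte key[16],
              CryptoPP::byte out[16])
    {
        using namespace CryptoPP;
        uint32x4_p m = VectorLoadBE(msg);  // message bytes: fixed BE layout
        uint32x4_p k = VectorLoad(key);    // key material: was VectorLoadKey(key)
        m = VectorXor(m, k);               // whiten the block with the key
        VectorStoreBE(m, out);             // write the block back in BE layout
    }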

@@ -1849,44 +1849,44 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
     if (flags & BT_InBlockIsCounter)
     {
-        block0 = VectorLoad(inBlocks);
+        block0 = VectorLoadBE(inBlocks);
         block1 = VectorAdd(block0, s_one);
         block2 = VectorAdd(block1, s_one);
         block3 = VectorAdd(block2, s_one);
         block4 = VectorAdd(block3, s_one);
         block5 = VectorAdd(block4, s_one);
         temp = VectorAdd(block5, s_one);
-        VectorStore(temp, const_cast<byte*>(inBlocks));
+        VectorStoreBE(temp, const_cast<byte*>(inBlocks));
     }
     else
     {
-        block0 = VectorLoad(inBlocks);
+        block0 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
-        block1 = VectorLoad(inBlocks);
+        block1 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
-        block2 = VectorLoad(inBlocks);
+        block2 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
-        block3 = VectorLoad(inBlocks);
+        block3 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
-        block4 = VectorLoad(inBlocks);
+        block4 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
-        block5 = VectorLoad(inBlocks);
+        block5 = VectorLoadBE(inBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
     }

     if (xorInput)
     {
-        block0 = VectorXor(block0, VectorLoad(xorBlocks));
+        block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block1 = VectorXor(block1, VectorLoad(xorBlocks));
+        block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block2 = VectorXor(block2, VectorLoad(xorBlocks));
+        block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block3 = VectorXor(block3, VectorLoad(xorBlocks));
+        block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block4 = VectorXor(block4, VectorLoad(xorBlocks));
+        block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block5 = VectorXor(block5, VectorLoad(xorBlocks));
+        block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     }
@@ -1894,31 +1894,31 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
     if (xorOutput)
    {
-        block0 = VectorXor(block0, VectorLoad(xorBlocks));
+        block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block1 = VectorXor(block1, VectorLoad(xorBlocks));
+        block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block2 = VectorXor(block2, VectorLoad(xorBlocks));
+        block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block3 = VectorXor(block3, VectorLoad(xorBlocks));
+        block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block4 = VectorXor(block4, VectorLoad(xorBlocks));
+        block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-        block5 = VectorXor(block5, VectorLoad(xorBlocks));
+        block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
         xorBlocks = PtrAdd(xorBlocks, xorIncrement);
     }

-    VectorStore(block0, outBlocks);
+    VectorStoreBE(block0, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);
-    VectorStore(block1, outBlocks);
+    VectorStoreBE(block1, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);
-    VectorStore(block2, outBlocks);
+    VectorStoreBE(block2, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);
-    VectorStore(block3, outBlocks);
+    VectorStoreBE(block3, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);
-    VectorStore(block4, outBlocks);
+    VectorStoreBE(block4, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);
-    VectorStore(block5, outBlocks);
+    VectorStoreBE(block5, outBlocks);
     outBlocks = PtrAdd(outBlocks, outIncrement);

     length -= 6*blockSize;
@@ -1927,10 +1927,10 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
     while (length >= blockSize)
     {
-        uint32x4_p block = VectorLoad(inBlocks);
+        uint32x4_p block = VectorLoadBE(inBlocks);

         if (xorInput)
-            block = VectorXor(block, VectorLoad(xorBlocks));
+            block = VectorXor(block, VectorLoadBE(xorBlocks));

         if (flags & BT_InBlockIsCounter)
             const_cast<byte *>(inBlocks)[15]++;
@@ -1938,9 +1938,9 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
         func1(block, subKeys, rounds);

         if (xorOutput)
-            block = VectorXor(block, VectorLoad(xorBlocks));
+            block = VectorXor(block, VectorLoadBE(xorBlocks));

-        VectorStore(block, outBlocks);
+        VectorStoreBE(block, outBlocks);
         inBlocks = PtrAdd(inBlocks, inIncrement);
         outBlocks = PtrAdd(outBlocks, outIncrement);
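
The BT_InBlockIsCounter path above works because the counter block is treated as a big endian integer on every host: VectorLoadBE gives the lanes one fixed interpretation, vector adds of s_one step the counter, and VectorStoreBE persists the final value for the next call. A condensed sketch of that walk (hedged: the s_one literals mirror what adv_simd.h uses for 128-bit blocks, and carries do not propagate past the low 32-bit word, exactly as in the bulk path):

    #include "ppc_simd.h"   // assumption: header name

    // Advance a big endian 128-bit counter block by three, the same way
    // the ALTIVEC bulk path advances it by six.
    void BumpCounterBy3(CryptoPP::byte counter[16])
    {
        using namespace CryptoPP;
    #if defined(CRYPTOPP_LITTLE_ENDIAN)
        const uint32x4_p s_one = {1, 0, 0, 0};  // assumption: adv_simd.h's LE literal
    #else
        const uint32x4_p s_one = {0, 0, 0, 1};
    #endif
        uint32x4_p c = VectorLoadBE(counter);
        c = VectorAdd(c, s_one);                // counter + 1
        c = VectorAdd(c, s_one);                // counter + 2
        c = VectorAdd(c, s_one);                // counter + 3
        VectorStoreBE(c, counter);              // persist for the next call
    }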

@@ -32,11 +32,11 @@
 # undef CRYPTOPP_POWER7_AVAILABLE
 #endif

-#if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(_CRYPTO))
+#if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(__CRYPTO) || defined(__CRYPTO__))
 # undef CRYPTOPP_POWER8_AVAILABLE
 # undef CRYPTOPP_POWER8_AES_AVAILABLE
-# undef CRYPTOPP_POWER8_SHA_AVAILABLE
 # undef CRYPTOPP_POWER8_PMULL_AVAILABLE
+# undef CRYPTOPP_POWER8_SHA_AVAILABLE
 #endif

 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
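
The repaired guard accepts both underscore spellings of the compiler's crypto feature macro; the old `_CRYPTO` spelling matches neither, so the POWER8 paths could be disabled even when crypto support was present. A quick hypothetical probe (not part of Crypto++) to see which of these macros a given compiler defines:

    #include <cstdio>

    int main()
    {
    #if defined(__CRYPTO__)
        std::puts("__CRYPTO__ is defined");
    #endif
    #if defined(__CRYPTO)
        std::puts("__CRYPTO is defined");
    #endif
    #if defined(_ARCH_PWR8)
        std::puts("_ARCH_PWR8 is defined");
    #endif
    #if defined(_ARCH_PWR9)
        std::puts("_ARCH_PWR9 is defined");
    #endif
        return 0;
    }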
@@ -65,7 +65,7 @@ typedef __vector unsigned long long uint64x2_p;
 /// \tparam T vector type
 /// \param src the vector
 /// \details Reverse() endian swaps the bytes in a vector
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 template <class T>
 inline T Reverse(const T& src)
@@ -151,33 +151,45 @@ inline T1 VectorShiftLeft(const T1& vec1, const T2& vec2)
 #endif
 }

 /// \brief Shift two vectors right
 /// \tparam C shift byte count
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \details VectorShiftRight() concatenates vec1 and vec2 and returns a
 ///   new vector after shifting the concatenation by the specified number
 ///   of bytes. Both vec1 and vec2 are cast to uint8x16_p. The return
 ///   vector is the same type as vec1.
 /// \details On big endian machines VectorShiftRight() is <tt>vec_sld(a, b,
 ///   c)</tt>. On little endian machines VectorShiftRight() is translated to
 ///   <tt>vec_sld(b, a, 16-c)</tt>. You should always call the function as
 ///   if on a big endian machine as shown below.
 /// <pre>
 ///   uint8x16_p r0 = {0};
 ///   uint8x16_p r1 = VectorLoad(ptr);
 ///   uint8x16_p r5 = VectorShiftRight<12>(r0, r1);
 /// </pre>
 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
 ///   endian sensitive?</A> on Stack Overflow
 /// \since Crypto++ 6.0
 template <unsigned int C, class T1, class T2>
 inline T1 VectorShiftRight(const T1& vec1, const T2& vec2)
 {
     return VectorShiftLeft<16-C>(vec1, vec2);
 }

 #endif  // POWER4 and above

 // POWER7/POWER4 load and store
 #if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

 /// \brief Reverse a 16-byte array
 /// \param src the byte array
 /// \details ReverseByteArrayLE reverses a 16-byte array on a little endian
 ///   system. It does nothing on a big endian system.
 /// \since Crypto++ 6.0
 inline void ReverseByteArrayLE(byte src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION) && defined(CRYPTOPP_LITTLE_ENDIAN)
     vec_st(vec_reve(vec_ld(0, src)), 0, src);
 #elif defined(CRYPTOPP_LITTLE_ENDIAN)
     const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
     const uint8x16_p zero = {0};
     vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, src);
 #endif
 }
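
ReverseByteArrayLE is endian-conditional: a no-op on big endian systems, and a full 16-byte in-place reversal on little endian ones (vec_reve under XL C, a vec_perm with a reversing mask elsewhere). A scalar model for illustration only (hypothetical helper name, same observable behavior):

    #include <algorithm>

    // Scalar equivalent of ReverseByteArrayLE: reverse all 16 bytes in
    // place on little endian, leave them untouched on big endian.
    void ReverseByteArrayLE_Model(unsigned char src[16])
    {
    #if defined(CRYPTOPP_LITTLE_ENDIAN)  // assumption: Crypto++'s endian macro
        std::reverse(src, src + 16);
    #endif
    }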
 /// \brief Loads a vector from a byte array
 /// \param src the byte array
 /// \details Loads a vector in big endian format from a byte array.
 ///   VectorLoadBE will swap endianess on little endian systems.
 /// \note VectorLoadBE() does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 inline uint32x4_p VectorLoadBE(const uint8_t src[16])
 {
@@ -198,7 +210,7 @@ inline uint32x4_p VectorLoadBE(const uint8_t src[16])
 /// \details Loads a vector in big endian format from a byte array.
 ///   VectorLoadBE will swap endianess on little endian systems.
 /// \note VectorLoadBE does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 inline uint32x4_p VectorLoadBE(int off, const uint8_t src[16])
 {
@@ -215,70 +227,27 @@ inline uint32x4_p VectorLoadBE(int off, const uint8_t src[16])
 /// \brief Loads a vector from a byte array
 /// \param src the byte array
-/// \details Loads a vector in big endian format from a byte array.
-///   VectorLoad will swap endianess on little endian systems.
+/// \details Loads a vector in native endian format from a byte array.
 /// \note VectorLoad does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 inline uint32x4_p VectorLoad(const byte src[16])
 {
-    return (uint32x4_p)VectorLoadBE(src);
+#if defined(CRYPTOPP_XLC_VERSION)
+    return (uint32x4_p)vec_xl(0, (byte*)src);
+#else
+    return (uint32x4_p)vec_vsx_ld(0, src);
+#endif
 }

 /// \brief Loads a vector from a byte array
 /// \param src the byte array
 /// \param off offset into the src byte array
-/// \details Loads a vector in big endian format from a byte array.
-///   VectorLoad will swap endianess on little endian systems.
+/// \details Loads a vector in native endian format from a byte array.
 /// \note VectorLoad does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 inline uint32x4_p VectorLoad(int off, const byte src[16])
 {
-    return (uint32x4_p)VectorLoadBE(off, src);
-}
-
-/// \brief Loads a vector from a byte array
-/// \param src the byte array
-/// \details Loads a vector from a byte array.
-///   VectorLoadKey does not swap endianess on little endian systems.
-/// \note VectorLoadKey does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
-/// \since Crypto++ 6.0
-inline uint32x4_p VectorLoadKey(const byte src[16])
-{
-#if defined(CRYPTOPP_XLC_VERSION)
-    return (uint32x4_p)vec_xl(0, (byte*)src);
-#else
-    return (uint32x4_p)vec_vsx_ld(0, src);
-#endif
-}
-
-/// \brief Loads a vector from a 32-bit word array
-/// \param src the 32-bit word array
-/// \details Loads a vector from a 32-bit word array.
-///   VectorLoadKey does not swap endianess on little endian systems.
-/// \note VectorLoadKey does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
-/// \since Crypto++ 6.0
-inline uint32x4_p VectorLoadKey(const word32 src[4])
-{
-#if defined(CRYPTOPP_XLC_VERSION)
-    return (uint32x4_p)vec_xl(0, (byte*)src);
-#else
-    return (uint32x4_p)vec_vsx_ld(0, src);
-#endif
-}
-
-/// \brief Loads a vector from a byte array
-/// \param src the byte array
-/// \param off offset into the src byte array
-/// \details Loads a vector from a byte array.
-///   VectorLoadKey does not swap endianess on little endian systems.
-/// \note VectorLoadKey does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
-/// \since Crypto++ 6.0
-inline uint32x4_p VectorLoadKey(int off, const byte src[16])
-{
 #if defined(CRYPTOPP_XLC_VERSION)
     return (uint32x4_p)vec_xl(off, (byte*)src);
@@ -294,7 +263,7 @@ inline uint32x4_p VectorLoadKey(int off, const byte src[16])
 /// \details Stores a vector in big endian format to a byte array.
 ///   VectorStoreBE will swap endianess on little endian systems.
 /// \note VectorStoreBE does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 template <class T>
 inline void VectorStoreBE(const T& src, uint8_t dest[16])
@@ -318,7 +287,7 @@ inline void VectorStoreBE(const T& src, uint8_t dest[16])
 /// \details Stores a vector in big endian format to a byte array.
 ///   VectorStoreBE will swap endianess on little endian systems.
 /// \note VectorStoreBE does not require an aligned array.
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey()
+/// \sa Reverse(), VectorLoadBE(), VectorLoad()
 /// \since Crypto++ 6.0
 template <class T>
 inline void VectorStoreBE(const T& src, int off, uint8_t dest[16])
@@ -338,8 +307,7 @@ inline void VectorStoreBE(const T& src, int off, uint8_t dest[16])
 /// \tparam T vector type
 /// \param src the vector
 /// \param dest the byte array
-/// \details Stores a vector in big endian format to a byte array.
-///   VectorStore will swap endianess on little endian systems.
+/// \details Stores a vector in native endian format to a byte array.
 /// \note VectorStore does not require an aligned array.
 /// \since Crypto++ 6.0
 template<class T>
@@ -347,13 +315,9 @@ inline void VectorStore(const T& src, byte dest[16])
 {
     // Do not call VectorStoreBE. It slows us down by about 0.5 cpb on LE.
 #if defined(CRYPTOPP_XLC_VERSION)
-    vec_xst_be((uint8x16_p)src, 0, dest);
+    vec_xst((uint8x16_p)src, 0, dest);
 #else
-# if defined(CRYPTOPP_LITTLE_ENDIAN)
-    vec_vsx_st(Reverse((uint8x16_p)src), 0, dest);
-# else
     vec_vsx_st((uint8x16_p)src, 0, dest);
-# endif
 #endif
 }
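
With the swap removed, VectorStore is now the exact inverse of VectorLoad, just as VectorStoreBE is of VectorLoadBE, so either pair round-trips a 16-byte buffer unchanged on both endiannesses. A hedged sketch of that property (hypothetical function; assumes a POWER8 build):

    #include "ppc_simd.h"   // assumption: header name
    #include <cstring>

    // What VectorLoadBE reads, VectorStoreBE writes back byte-for-byte.
    bool RoundTripsBE(const CryptoPP::byte in[16])
    {
        using namespace CryptoPP;
        byte out[16];
        VectorStoreBE(VectorLoadBE(in), out);
        return std::memcmp(in, out, 16) == 0;  // expected: always true
    }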
@@ -362,8 +326,7 @@ inline void VectorStore(const T& src, byte dest[16])
 /// \param src the vector
 /// \param off offset into the dest byte array
 /// \param dest the byte array
-/// \details Stores a vector in big endian format to a byte array.
-///   VectorStore will swap endianess on little endian systems.
+/// \details Stores a vector in native endian format to a byte array.
 /// \note VectorStore does not require an aligned array.
 /// \since Crypto++ 6.0
 template<class T>
@@ -371,13 +334,9 @@ inline void VectorStore(const T& src, int off, byte dest[16])
 {
     // Do not call VectorStoreBE. It slows us down by about 0.5 cpb on LE.
 #if defined(CRYPTOPP_XLC_VERSION)
-    vec_xst_be((uint8x16_p)src, off, dest);
+    vec_xst((uint8x16_p)src, off, dest);
 #else
-# if defined(CRYPTOPP_LITTLE_ENDIAN)
-    vec_vsx_st(Reverse((uint8x16_p)src), off, dest);
-# else
     vec_vsx_st((uint8x16_p)src, off, dest);
-# endif
 #endif
 }
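
Taken together, the header now has one convention: data blocks travel through the BE functions so their layout is fixed, while round keys, already permuted into the machine's preferred layout at key setup (see the Rijndael change below), travel through the plain native functions. A sketch of one AES step under that convention (hypothetical function and buffers; VectorEncrypt is the POWER8 wrapper used in the next file and needs a POWER8 crypto build):

    #include "ppc_simd.h"   // assumption: header name

    // Whiten with round key 0, then run one AES round with round key 1:
    // BE load for the data block, native loads for pre-permuted keys.
    CryptoPP::uint32x4_p OneAesStep(const CryptoPP::byte block[16],
                                    const CryptoPP::byte keys[32])
    {
        using namespace CryptoPP;
        uint32x4_p b = VectorLoadBE(block);
        b = VectorXor(b, VectorLoad(keys));
        return VectorEncrypt(b, VectorLoad(16, keys));
    }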

@@ -697,17 +697,17 @@ static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, un
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    uint32x4_p k = VectorLoadKey(keys);
+    uint32x4_p k = VectorLoad(keys);
     block = VectorXor(block, k);

     for (size_t i=1; i<rounds-1; i+=2)
     {
-        block = VectorEncrypt(block, VectorLoadKey( i*16, keys));
-        block = VectorEncrypt(block, VectorLoadKey((i+1)*16, keys));
+        block = VectorEncrypt(block, VectorLoad( i*16, keys));
+        block = VectorEncrypt(block, VectorLoad((i+1)*16, keys));
     }

-    block = VectorEncrypt(block, VectorLoadKey((rounds-1)*16, keys));
-    block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
+    block = VectorEncrypt(block, VectorLoad((rounds-1)*16, keys));
+    block = VectorEncryptLast(block, VectorLoad(rounds*16, keys));
 }

 static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@@ -717,7 +717,7 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    uint32x4_p k = VectorLoadKey(keys);
+    uint32x4_p k = VectorLoad(keys);
     block0 = VectorXor(block0, k);
     block1 = VectorXor(block1, k);
     block2 = VectorXor(block2, k);
@@ -727,7 +727,7 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     for (size_t i=1; i<rounds; ++i)
     {
-        k = VectorLoadKey(i*16, keys);
+        k = VectorLoad(i*16, keys);
         block0 = VectorEncrypt(block0, k);
         block1 = VectorEncrypt(block1, k);
         block2 = VectorEncrypt(block2, k);
@@ -736,7 +736,7 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
         block5 = VectorEncrypt(block5, k);
     }

-    k = VectorLoadKey(rounds*16, keys);
+    k = VectorLoad(rounds*16, keys);
     block0 = VectorEncryptLast(block0, k);
     block1 = VectorEncryptLast(block1, k);
     block2 = VectorEncryptLast(block2, k);
@@ -750,17 +750,17 @@ static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, un
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    uint32x4_p k = VectorLoadKey(rounds*16, keys);
+    uint32x4_p k = VectorLoad(rounds*16, keys);
     block = VectorXor(block, k);

     for (size_t i=rounds-1; i>1; i-=2)
     {
-        block = VectorDecrypt(block, VectorLoadKey( i*16, keys));
-        block = VectorDecrypt(block, VectorLoadKey((i-1)*16, keys));
+        block = VectorDecrypt(block, VectorLoad( i*16, keys));
+        block = VectorDecrypt(block, VectorLoad((i-1)*16, keys));
     }

-    block = VectorDecrypt(block, VectorLoadKey(16, keys));
-    block = VectorDecryptLast(block, VectorLoadKey(0, keys));
+    block = VectorDecrypt(block, VectorLoad(16, keys));
+    block = VectorDecryptLast(block, VectorLoad(0, keys));
 }

 static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@@ -770,7 +770,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    uint32x4_p k = VectorLoadKey(rounds*16, keys);
+    uint32x4_p k = VectorLoad(rounds*16, keys);
     block0 = VectorXor(block0, k);
     block1 = VectorXor(block1, k);
     block2 = VectorXor(block2, k);
@@ -780,7 +780,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
     for (size_t i=rounds-1; i>0; --i)
     {
-        k = VectorLoadKey(i*16, keys);
+        k = VectorLoad(i*16, keys);
         block0 = VectorDecrypt(block0, k);
         block1 = VectorDecrypt(block1, k);
         block2 = VectorDecrypt(block2, k);
@@ -789,7 +789,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
         block5 = VectorDecrypt(block5, k);
     }

-    k = VectorLoadKey(0, keys);
+    k = VectorLoad(0, keys);
     block0 = VectorDecryptLast(block0, k);
     block1 = VectorDecryptLast(block1, k);
     block2 = VectorDecryptLast(block2, k);
@@ -804,60 +804,62 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32*
 {
     const size_t rounds = keyLen / 4 + 6;
     const word32 *rc = s_rconBE;
-    word32 *rk_saved = rk, temp; // unused in big-endian
-    CRYPTOPP_UNUSED(rk_saved);
-    GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
+    word32 *rkey = rk, temp;
+    GetUserKey(BIG_ENDIAN_ORDER, rkey, keyLen/4, userKey, keyLen);

     // keySize: m_key allocates 4*(rounds+1) word32's.
     const size_t keySize = 4*(rounds+1);
-    const word32* end = rk + keySize;
+    const word32* end = rkey + keySize;

     while (true)
     {
-        temp = rk[keyLen/4-1];
+        temp = rkey[keyLen/4-1];
         word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
                    (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
-        rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
-        rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
-        rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
-        rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
+        rkey[keyLen/4] = rkey[0] ^ x ^ *(rc++);
+        rkey[keyLen/4+1] = rkey[1] ^ rkey[keyLen/4];
+        rkey[keyLen/4+2] = rkey[2] ^ rkey[keyLen/4+1];
+        rkey[keyLen/4+3] = rkey[3] ^ rkey[keyLen/4+2];

-        if (rk + keyLen/4 + 4 == end)
+        if (rkey + keyLen/4 + 4 == end)
             break;

         if (keyLen == 24)
         {
-            rk[10] = rk[ 4] ^ rk[ 9];
-            rk[11] = rk[ 5] ^ rk[10];
+            rkey[10] = rkey[ 4] ^ rkey[ 9];
+            rkey[11] = rkey[ 5] ^ rkey[10];
         }
         else if (keyLen == 32)
         {
-            temp = rk[11];
-            rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
-            rk[13] = rk[ 5] ^ rk[12];
-            rk[14] = rk[ 6] ^ rk[13];
-            rk[15] = rk[ 7] ^ rk[14];
+            temp = rkey[11];
+            rkey[12] = rkey[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
+            rkey[13] = rkey[ 5] ^ rkey[12];
+            rkey[14] = rkey[ 6] ^ rkey[13];
+            rkey[15] = rkey[ 7] ^ rkey[14];
         }

-        rk += keyLen/4;
+        rkey += keyLen/4;
     }

 #if defined(CRYPTOPP_LITTLE_ENDIAN)
-    rk = rk_saved;
+    rkey = rk;
     const uint8x16_p mask = ((uint8x16_p){12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3});
     const uint8x16_p zero = {0};

     unsigned int i=0;
-    for (i=0; i<rounds; i+=2, rk+=8)
+    for (i=0; i<rounds; i+=2, rkey+=8)
     {
-        const uint8x16_p d1 = vec_vsx_ld( 0, (uint8_t*)rk);
-        const uint8x16_p d2 = vec_vsx_ld(16, (uint8_t*)rk);
-        vec_vsx_st(vec_perm(d1, zero, mask),  0, (uint8_t*)rk);
-        vec_vsx_st(vec_perm(d2, zero, mask), 16, (uint8_t*)rk);
+        const uint8x16_p d1 = vec_vsx_ld( 0, (uint8_t*)rkey);
+        const uint8x16_p d2 = vec_vsx_ld(16, (uint8_t*)rkey);
+        vec_vsx_st(vec_perm(d1, zero, mask),  0, (uint8_t*)rkey);
+        vec_vsx_st(vec_perm(d2, zero, mask), 16, (uint8_t*)rkey);
     }

-    for ( ; i<rounds+1; i++, rk+=4)
-        vec_vsx_st(vec_perm(vec_vsx_ld(0, (uint8_t*)rk), zero, mask), 0, (uint8_t*)rk);
+    for ( ; i<rounds+1; i++, rkey+=4)
+    {
+        const uint8x16_p d = vec_vsx_ld( 0, (uint8_t*)rkey);
+        vec_vsx_st(vec_perm(d, zero, mask), 0, (uint8_t*)rkey);
+    }
 #endif
 }
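
The little endian fix-up above permutes each 16-byte row of the expanded key with the mask {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}: the order of the four 32-bit words is reversed while the bytes inside each word stay put. That pre-permutation is what lets the encrypt and decrypt paths use plain VectorLoad on round keys. A scalar model of the same permutation (illustration only; hypothetical helper name):

    #include <cstdint>
    #include <cstring>

    // dest[i] = src[mask[i]], the effect of the vec_perm above: reverse
    // the word order of a 16-byte round-key row, keep bytes within words.
    void PermuteKeyRow(uint8_t row[16])
    {
        static const uint8_t mask[16] =
            {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
        uint8_t tmp[16];
        for (int i = 0; i < 16; ++i)
            tmp[i] = row[mask[i]];
        std::memcpy(row, tmp, 16);
    }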