From f6e04e5f338d2573f182a2daabed3220ce3dda7e Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Thu, 15 Nov 2018 15:17:49 -0500 Subject: [PATCH] Rename PPC vector functions from VectorFunc to VecFunc --- adv_simd.h | 216 +++++++++++++++--------------- blake2b_simd.cpp | 124 ++++++++--------- blake2s_simd.cpp | 118 ++++++++-------- chacha_simd.cpp | 335 +++++++++++++++++++++++----------------------- gcm_simd.cpp | 136 +++++++++---------- lea_simd.cpp | 18 +-- ppc_simd.cpp | 2 +- ppc_simd.h | 314 +++++++++++++++++++++---------------------- rijndael_simd.cpp | 132 +++++++++--------- sha_simd.cpp | 290 +++++++++++++++++++-------------------- simon128_simd.cpp | 117 ++++++++-------- simon64_simd.cpp | 173 ++++++++++++------------ speck128_simd.cpp | 115 ++++++++-------- speck64_simd.cpp | 137 +++++++++---------- validat1.cpp | 48 +++---- 15 files changed, 1140 insertions(+), 1135 deletions(-) diff --git a/adv_simd.h b/adv_simd.h index c79a9f25..547cd262 100644 --- a/adv_simd.h +++ b/adv_simd.h @@ -1857,54 +1857,54 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6, // even harder without POWER8 due to lack of 64-bit elements. std::memcpy(temp+LowOffset, inBlocks, 8); std::memcpy(temp+HighOffset, inBlocks, 8); - uint32x4_p ctr = (uint32x4_p)VectorLoadBE(temp); + uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp); // For 64-bit block ciphers we need to load the CTR block, // which is 8 bytes. After the dup load we have two counters // in the Altivec word. Then we need to increment the low ctr // by 0 and the high ctr by 1. - block0 = VectorAdd(s_one, ctr); + block0 = VecAdd(s_one, ctr); // After initial increment of {0,1} remaining counters // increment by {2,2}. - block1 = VectorAdd(s_two, block0); - block2 = VectorAdd(s_two, block1); - block3 = VectorAdd(s_two, block2); - block4 = VectorAdd(s_two, block3); - block5 = VectorAdd(s_two, block4); + block1 = VecAdd(s_two, block0); + block2 = VecAdd(s_two, block1); + block3 = VecAdd(s_two, block2); + block4 = VecAdd(s_two, block3); + block5 = VecAdd(s_two, block4); // Update the counter in the caller. 
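For readers following the counter arithmetic above: with a 64-bit block cipher two counter blocks share one 128-bit Altivec register, so the first register is advanced by {0,1} (s_one) and every later register by {2,2} (s_two), exactly as the comments state. A stand-alone scalar sketch of that schedule (illustration only, plain uint64_t in place of uint32x4_p and VecAdd):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint64_t ctr = 0;          // the caller's 8-byte counter block
        uint64_t lane[6][2];       // six "registers", two counters in each

        lane[0][0] = ctr + 0;      // VecAdd(s_one, ctr): low lane +0
        lane[0][1] = ctr + 1;      //                     high lane +1
        for (int i = 1; i < 6; ++i) {
            lane[i][0] = lane[i-1][0] + 2;   // VecAdd(s_two, previous block)
            lane[i][1] = lane[i-1][1] + 2;
        }

        // Six registers hold twelve counter values, so the caller's
        // counter advances by twelve once this batch is processed.
        ctr += 12;
        std::printf("next counter: %llu\n", (unsigned long long)ctr);
        return 0;
    }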
const_cast(inBlocks)[7] += 12; } else { - block0 = VectorLoadBE(inBlocks); + block0 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = VectorLoadBE(inBlocks); + block1 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = VectorLoadBE(inBlocks); + block2 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = VectorLoadBE(inBlocks); + block3 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block4 = VectorLoadBE(inBlocks); + block4 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block5 = VectorLoadBE(inBlocks); + block5 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); } if (xorInput) { - block0 = VectorXor(block0, VectorLoadBE(xorBlocks)); + block0 = VecXor(block0, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VectorXor(block1, VectorLoadBE(xorBlocks)); + block1 = VecXor(block1, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VectorXor(block2, VectorLoadBE(xorBlocks)); + block2 = VecXor(block2, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VectorXor(block3, VectorLoadBE(xorBlocks)); + block3 = VecXor(block3, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = VectorXor(block4, VectorLoadBE(xorBlocks)); + block4 = VecXor(block4, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = VectorXor(block5, VectorLoadBE(xorBlocks)); + block5 = VecXor(block5, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); } @@ -1912,31 +1912,31 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6, if (xorOutput) { - block0 = VectorXor(block0, VectorLoadBE(xorBlocks)); + block0 = VecXor(block0, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VectorXor(block1, VectorLoadBE(xorBlocks)); + block1 = VecXor(block1, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VectorXor(block2, VectorLoadBE(xorBlocks)); + block2 = VecXor(block2, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VectorXor(block3, VectorLoadBE(xorBlocks)); + block3 = VecXor(block3, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = VectorXor(block4, VectorLoadBE(xorBlocks)); + block4 = VecXor(block4, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = VectorXor(block5, VectorLoadBE(xorBlocks)); + block5 = VecXor(block5, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); } - VectorStoreBE(block0, outBlocks); + VecStoreBE(block0, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block1, outBlocks); + VecStoreBE(block1, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block2, outBlocks); + VecStoreBE(block2, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block3, outBlocks); + VecStoreBE(block3, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block4, outBlocks); + VecStoreBE(block4, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block5, outBlocks); + VecStoreBE(block5, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); length -= 6*vsxBlockSize; @@ -1951,34 +1951,34 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6, // even harder without POWER8 due to lack of 64-bit elements. 
std::memcpy(temp+LowOffset, inBlocks, 8); std::memcpy(temp+HighOffset, inBlocks, 8); - uint32x4_p ctr = (uint32x4_p)VectorLoadBE(temp); + uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp); // For 64-bit block ciphers we need to load the CTR block, // which is 8 bytes. After the dup load we have two counters // in the Altivec word. Then we need to increment the low ctr // by 0 and the high ctr by 1. - block0 = VectorAdd(s_one, ctr); + block0 = VecAdd(s_one, ctr); // After initial increment of {0,1} remaining counters // increment by {2,2}. - block1 = VectorAdd(s_two, block0); + block1 = VecAdd(s_two, block0); // Update the counter in the caller. const_cast(inBlocks)[7] += 4; } else { - block0 = VectorLoadBE(inBlocks); + block0 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = VectorLoadBE(inBlocks); + block1 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); } if (xorInput) { - block0 = VectorXor(block0, VectorLoadBE(xorBlocks)); + block0 = VecXor(block0, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VectorXor(block1, VectorLoadBE(xorBlocks)); + block1 = VecXor(block1, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); } @@ -1986,15 +1986,15 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6, if (xorOutput) { - block0 = VectorXor(block0, VectorLoadBE(xorBlocks)); + block0 = VecXor(block0, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VectorXor(block1, VectorLoadBE(xorBlocks)); + block1 = VecXor(block1, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); } - VectorStoreBE(block0, outBlocks); + VecStoreBE(block0, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block1, outBlocks); + VecStoreBE(block1, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); length -= 2*vsxBlockSize; @@ -2030,14 +2030,14 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6, // initialize the block then it generates warnings. std::memcpy(temp+LowOffset, inBlocks, 8); std::memcpy(temp+HighOffset, inBlocks, 8); // don't care - block = (uint32x4_p)VectorLoadBE(temp); + block = (uint32x4_p)VecLoadBE(temp); if (xorInput) { std::memcpy(temp+LowOffset, xorBlocks, 8); std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care - uint32x4_p x = (uint32x4_p)VectorLoadBE(temp); - block = VectorXor(block, x); + uint32x4_p x = (uint32x4_p)VecLoadBE(temp); + block = VecXor(block, x); } // Update the counter in the caller. @@ -2050,11 +2050,11 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6, { std::memcpy(temp+LowOffset, xorBlocks, 8); std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care - uint32x4_p x = (uint32x4_p)VectorLoadBE(temp); - block = VectorXor(block, x); + uint32x4_p x = (uint32x4_p)VecLoadBE(temp); + block = VecXor(block, x); } - VectorStoreBE(block, temp); + VecStoreBE(block, temp); std::memcpy(outBlocks, temp+LowOffset, 8); inBlocks = PtrAdd(inBlocks, inIncrement); @@ -2120,10 +2120,10 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4, if (flags & BT_InBlockIsCounter) { - block0 = VectorLoadBE(inBlocks); - block1 = VectorAdd(block0, s_one); - block2 = VectorAdd(block1, s_one); - block3 = VectorAdd(block2, s_one); + block0 = VecLoadBE(inBlocks); + block1 = VecAdd(block0, s_one); + block2 = VecAdd(block1, s_one); + block3 = VecAdd(block2, s_one); // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE). 
// CTR_ModePolicy::OperateKeystream is wired such that after @@ -2137,25 +2137,25 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4, } else { - block0 = VectorLoadBE(inBlocks); + block0 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = VectorLoadBE(inBlocks); + block1 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = VectorLoadBE(inBlocks); + block2 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = VectorLoadBE(inBlocks); + block3 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); } if (xorInput) { - block0 = VectorXor(block0, VectorLoadBE(xorBlocks)); + block0 = VecXor(block0, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VectorXor(block1, VectorLoadBE(xorBlocks)); + block1 = VecXor(block1, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VectorXor(block2, VectorLoadBE(xorBlocks)); + block2 = VecXor(block2, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VectorXor(block3, VectorLoadBE(xorBlocks)); + block3 = VecXor(block3, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); } @@ -2163,23 +2163,23 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4, if (xorOutput) { - block0 = VectorXor(block0, VectorLoadBE(xorBlocks)); + block0 = VecXor(block0, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VectorXor(block1, VectorLoadBE(xorBlocks)); + block1 = VecXor(block1, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VectorXor(block2, VectorLoadBE(xorBlocks)); + block2 = VecXor(block2, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VectorXor(block3, VectorLoadBE(xorBlocks)); + block3 = VecXor(block3, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); } - VectorStoreBE(block0, outBlocks); + VecStoreBE(block0, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block1, outBlocks); + VecStoreBE(block1, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block2, outBlocks); + VecStoreBE(block2, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block3, outBlocks); + VecStoreBE(block3, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); length -= 4*blockSize; @@ -2188,10 +2188,10 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4, while (length >= blockSize) { - uint32x4_p block = VectorLoadBE(inBlocks); + uint32x4_p block = VecLoadBE(inBlocks); if (xorInput) - block = VectorXor(block, VectorLoadBE(xorBlocks)); + block = VecXor(block, VecLoadBE(xorBlocks)); if (flags & BT_InBlockIsCounter) const_cast(inBlocks)[15]++; @@ -2199,9 +2199,9 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4, func1(block, subKeys, rounds); if (xorOutput) - block = VectorXor(block, VectorLoadBE(xorBlocks)); + block = VecXor(block, VecLoadBE(xorBlocks)); - VectorStoreBE(block, outBlocks); + VecStoreBE(block, outBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); outBlocks = PtrAdd(outBlocks, outIncrement); @@ -2265,12 +2265,12 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, if (flags & BT_InBlockIsCounter) { - block0 = VectorLoadBE(inBlocks); - block1 = VectorAdd(block0, s_one); - block2 = VectorAdd(block1, s_one); - block3 = VectorAdd(block2, s_one); - block4 = VectorAdd(block3, s_one); - block5 = 
VectorAdd(block4, s_one); + block0 = VecLoadBE(inBlocks); + block1 = VecAdd(block0, s_one); + block2 = VecAdd(block1, s_one); + block3 = VecAdd(block2, s_one); + block4 = VecAdd(block3, s_one); + block5 = VecAdd(block4, s_one); // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE). // CTR_ModePolicy::OperateKeystream is wired such that after @@ -2286,38 +2286,38 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, // the issue. If the last octet was 0xFC then 4 would trigger it. // We dumb-lucked into the test with SPECK-128. The test case of // interest is the one with IV 348ECA9766C09F04 826520DE47A212FA. - uint8x16_p temp = VectorAdd((uint8x16_p)block5, (uint8x16_p)s_one); - VectorStoreBE(temp, const_cast(inBlocks)); + uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one); + VecStoreBE(temp, const_cast(inBlocks)); } else { - block0 = VectorLoadBE(inBlocks); + block0 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = VectorLoadBE(inBlocks); + block1 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = VectorLoadBE(inBlocks); + block2 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = VectorLoadBE(inBlocks); + block3 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block4 = VectorLoadBE(inBlocks); + block4 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block5 = VectorLoadBE(inBlocks); + block5 = VecLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); } if (xorInput) { - block0 = VectorXor(block0, VectorLoadBE(xorBlocks)); + block0 = VecXor(block0, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VectorXor(block1, VectorLoadBE(xorBlocks)); + block1 = VecXor(block1, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VectorXor(block2, VectorLoadBE(xorBlocks)); + block2 = VecXor(block2, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VectorXor(block3, VectorLoadBE(xorBlocks)); + block3 = VecXor(block3, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = VectorXor(block4, VectorLoadBE(xorBlocks)); + block4 = VecXor(block4, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = VectorXor(block5, VectorLoadBE(xorBlocks)); + block5 = VecXor(block5, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); } @@ -2325,31 +2325,31 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, if (xorOutput) { - block0 = VectorXor(block0, VectorLoadBE(xorBlocks)); + block0 = VecXor(block0, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VectorXor(block1, VectorLoadBE(xorBlocks)); + block1 = VecXor(block1, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VectorXor(block2, VectorLoadBE(xorBlocks)); + block2 = VecXor(block2, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VectorXor(block3, VectorLoadBE(xorBlocks)); + block3 = VecXor(block3, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = VectorXor(block4, VectorLoadBE(xorBlocks)); + block4 = VecXor(block4, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = VectorXor(block5, VectorLoadBE(xorBlocks)); + block5 = VecXor(block5, VecLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); } - VectorStoreBE(block0, outBlocks); + VecStoreBE(block0, outBlocks); outBlocks 
= PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block1, outBlocks); + VecStoreBE(block1, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block2, outBlocks); + VecStoreBE(block2, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block3, outBlocks); + VecStoreBE(block3, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block4, outBlocks); + VecStoreBE(block4, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStoreBE(block5, outBlocks); + VecStoreBE(block5, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); length -= 6*blockSize; @@ -2358,10 +2358,10 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, while (length >= blockSize) { - uint32x4_p block = VectorLoadBE(inBlocks); + uint32x4_p block = VecLoadBE(inBlocks); if (xorInput) - block = VectorXor(block, VectorLoadBE(xorBlocks)); + block = VecXor(block, VecLoadBE(xorBlocks)); if (flags & BT_InBlockIsCounter) const_cast(inBlocks)[15]++; @@ -2369,9 +2369,9 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, func1(block, subKeys, rounds); if (xorOutput) - block = VectorXor(block, VectorLoadBE(xorBlocks)); + block = VecXor(block, VecLoadBE(xorBlocks)); - VectorStoreBE(block, outBlocks); + VecStoreBE(block, outBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); outBlocks = PtrAdd(outBlocks, outIncrement); diff --git a/blake2b_simd.cpp b/blake2b_simd.cpp index 853a4cb5..ee701bd4 100644 --- a/blake2b_simd.cpp +++ b/blake2b_simd.cpp @@ -742,7 +742,7 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state) #if (CRYPTOPP_POWER8_AVAILABLE) -inline uint64x2_p VectorLoad64(const void* p) +inline uint64x2_p VecLoad64(const void* p) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (uint64x2_p)vec_xl(0, (uint8_t*)p); @@ -751,18 +751,18 @@ inline uint64x2_p VectorLoad64(const void* p) #endif } -inline uint64x2_p VectorLoad64LE(const void* p) +inline uint64x2_p VecLoad64LE(const void* p) { #if __BIG_ENDIAN__ const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8}; - const uint64x2_p v = VectorLoad64(p); - return vec_perm(v, v, m); + const uint64x2_p v = VecLoad64(p); + return VecPermute(v, v, m); #else - return VectorLoad64(p); + return VecLoad64(p); #endif } -inline void VectorStore64(void* p, const uint64x2_p x) +inline void VecStore64(void* p, const uint64x2_p x) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) vec_xst((uint8x16_p)x,0,(uint8_t*)p); @@ -771,18 +771,18 @@ inline void VectorStore64(void* p, const uint64x2_p x) #endif } -inline void VectorStore64LE(void* p, const uint64x2_p x) +inline void VecStore64LE(void* p, const uint64x2_p x) { #if __BIG_ENDIAN__ const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8}; - VectorStore64(p, vec_perm(x, x, m)); + VecStore64(p, VecPermute(x, x, m)); #else - VectorStore64(p, x); + VecStore64(p, x); #endif } template -inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b) +inline uint64x2_p VecShiftLeftOctet(const uint64x2_p a, const uint64x2_p b) { #if __BIG_ENDIAN__ return (uint64x2_p)vec_sld((uint8x16_p)a, (uint8x16_p)b, C); @@ -791,18 +791,18 @@ inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b) #endif } -#define vec_shl_octet(a,b,c) VectorShiftLeftOctet(a, b) +#define vec_shl_octet(a,b,c) VecShiftLeftOctet(a, b) -// vec_mergeh(a,b) is equivalent to vec_perm(a,b,HH_MASK); and -// vec_mergel(a,b) is equivalent vec_perm(a,b,LL_MASK). 
Benchmarks +// vec_mergeh(a,b) is equivalent to VecPermute(a,b,HH_MASK); and +// vec_mergel(a,b) is equivalent VecPermute(a,b,LL_MASK). Benchmarks // show vec_mergeh and vec_mergel is faster on little-endian -// machines by 0.4 cpb. Benchmarks show vec_perm is faster on +// machines by 0.4 cpb. Benchmarks show VecPermute is faster on // big-endian machines by 1.5 cpb. The code that uses // vec_mergeh and vec_mergel is about 880 bytes shorter. #if defined(__GNUC__) && (__BIG_ENDIAN__) -# define vec_merge_hi(a,b) vec_perm(a,b, HH_MASK) -# define vec_merge_lo(a,b) vec_perm(a,b, LL_MASK) +# define vec_merge_hi(a,b) VecPermute(a,b, HH_MASK) +# define vec_merge_lo(a,b) VecPermute(a,b, LL_MASK) #else # define vec_merge_hi(a,b) vec_mergeh(a,b) # define vec_merge_lo(a,b) vec_mergel(a,b) @@ -878,12 +878,12 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \ do { \ b0 = vec_merge_hi(m4, m0); \ - b1 = vec_perm(m1, m6, HL_MASK); \ + b1 = VecPermute(m1, m6, HL_MASK); \ } while(0) #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \ do { \ - b0 = vec_perm(m5, m1, HL_MASK); \ + b0 = VecPermute(m5, m1, HL_MASK); \ b1 = vec_merge_lo(m3, m4); \ } while(0) @@ -907,8 +907,8 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \ do { \ - b0 = vec_perm(m1, m2, HL_MASK); \ - b1 = vec_perm(m2, m7, HL_MASK); \ + b0 = VecPermute(m1, m2, HL_MASK); \ + b1 = VecPermute(m2, m7, HL_MASK); \ } while(0) #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \ @@ -925,20 +925,20 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \ do { \ - b0 = vec_perm(m0, m3, HL_MASK); \ - b1 = vec_perm(m2, m7, HL_MASK); \ + b0 = VecPermute(m0, m3, HL_MASK); \ + b1 = VecPermute(m2, m7, HL_MASK); \ } while(0) #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \ do { \ - b0 = vec_perm(m7, m5, HL_MASK); \ - b1 = vec_perm(m3, m1, HL_MASK); \ + b0 = VecPermute(m7, m5, HL_MASK); \ + b1 = VecPermute(m3, m1, HL_MASK); \ } while(0) #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \ do { \ b0 = vec_shl_octet(m0, m6, 1); \ - b1 = vec_perm(m4, m6, HL_MASK); \ + b1 = VecPermute(m4, m6, HL_MASK); \ } while(0) #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \ @@ -955,19 +955,19 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \ do { \ - b0 = vec_perm(m2, m3, HL_MASK); \ + b0 = VecPermute(m2, m3, HL_MASK); \ b1 = vec_merge_lo(m7, m0); \ } while(0) #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \ do { \ b0 = vec_merge_lo(m6, m2); \ - b1 = vec_perm(m7, m4, HL_MASK); \ + b1 = VecPermute(m7, m4, HL_MASK); \ } while(0) #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \ do { \ - b0 = vec_perm(m6, m0, HL_MASK); \ + b0 = VecPermute(m6, m0, HL_MASK); \ b1 = vec_merge_hi(m7, m2); \ } while(0) @@ -986,13 +986,13 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \ do { \ b0 = vec_merge_lo(m3, m1); \ - b1 = vec_perm(m1, m5, HL_MASK); \ + b1 = VecPermute(m1, m5, HL_MASK); \ } while(0) #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \ do { \ b0 = vec_merge_lo(m6, m3); \ - b1 = vec_perm(m6, m1, HL_MASK); \ + b1 = VecPermute(m6, m1, HL_MASK); \ } while(0) #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \ @@ -1033,7 +1033,7 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \ do { \ - b0 = vec_perm(m1, m3, HL_MASK); \ + b0 = VecPermute(m1, m3, HL_MASK); \ b1 = m2; \ } while(0) @@ -1046,7 +1046,7 @@ void 
BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \ do { \ b0 = vec_merge_hi(m1, m2); \ - b1 = vec_perm(m3, m2, HL_MASK); \ + b1 = VecPermute(m3, m2, HL_MASK); \ } while(0) #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \ @@ -1122,23 +1122,23 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ do { \ - row1l = vec_add(vec_add(row1l, b0), row2l); \ - row1h = vec_add(vec_add(row1h, b1), row2h); \ - row4l = vec_xor(row4l, row1l); row4h = vec_xor(row4h, row1h); \ + row1l = VecAdd(VecAdd(row1l, b0), row2l); \ + row1h = VecAdd(VecAdd(row1h, b1), row2h); \ + row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \ row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \ - row3l = vec_add(row3l, row4l); row3h = vec_add(row3h, row4h); \ - row2l = vec_xor(row2l, row3l); row2h = vec_xor(row2h, row3h); \ + row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \ + row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \ row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \ } while(0) #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ do { \ - row1l = vec_add(vec_add(row1l, b0), row2l); \ - row1h = vec_add(vec_add(row1h, b1), row2h); \ - row4l = vec_xor(row4l, row1l); row4h = vec_xor(row4h, row1h); \ + row1l = VecAdd(VecAdd(row1l, b0), row2l); \ + row1h = VecAdd(VecAdd(row1h, b1), row2h); \ + row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \ row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \ - row3l = vec_add(row3l, row4l); row3h = vec_add(row3h, row4h); \ - row2l = vec_xor(row2l, row3l); row2h = vec_xor(row2h, row3h); \ + row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \ + row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \ row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \ } while(0) @@ -1175,27 +1175,27 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ } while(0) - const uint64x2_p m0 = VectorLoad64LE(input + 00); - const uint64x2_p m1 = VectorLoad64LE(input + 16); - const uint64x2_p m2 = VectorLoad64LE(input + 32); - const uint64x2_p m3 = VectorLoad64LE(input + 48); - const uint64x2_p m4 = VectorLoad64LE(input + 64); - const uint64x2_p m5 = VectorLoad64LE(input + 80); - const uint64x2_p m6 = VectorLoad64LE(input + 96); - const uint64x2_p m7 = VectorLoad64LE(input + 112); + const uint64x2_p m0 = VecLoad64LE(input + 00); + const uint64x2_p m1 = VecLoad64LE(input + 16); + const uint64x2_p m2 = VecLoad64LE(input + 32); + const uint64x2_p m3 = VecLoad64LE(input + 48); + const uint64x2_p m4 = VecLoad64LE(input + 64); + const uint64x2_p m5 = VecLoad64LE(input + 80); + const uint64x2_p m6 = VecLoad64LE(input + 96); + const uint64x2_p m7 = VecLoad64LE(input + 112); uint64x2_p row1l, row1h, row2l, row2h; uint64x2_p row3l, row3h, row4l, row4h; - const uint64x2_p h0 = row1l = VectorLoad64LE(&state.h[0]); - const uint64x2_p h1 = row1h = VectorLoad64LE(&state.h[2]); - const uint64x2_p h2 = row2l = VectorLoad64LE(&state.h[4]); - const uint64x2_p h3 = row2h = VectorLoad64LE(&state.h[6]); + const uint64x2_p h0 = row1l = VecLoad64LE(&state.h[0]); + const uint64x2_p h1 = row1h = VecLoad64LE(&state.h[2]); + const uint64x2_p h2 = row2l = VecLoad64LE(&state.h[4]); + const uint64x2_p h3 = row2h = VecLoad64LE(&state.h[6]); - row3l = VectorLoad64(&BLAKE2B_IV[0]); - row3h = VectorLoad64(&BLAKE2B_IV[2]); - 
row4l = vec_xor(VectorLoad64(&BLAKE2B_IV[4]), VectorLoad64(&state.tf[0])); - row4h = vec_xor(VectorLoad64(&BLAKE2B_IV[6]), VectorLoad64(&state.tf[2])); + row3l = VecLoad64(&BLAKE2B_IV[0]); + row3h = VecLoad64(&BLAKE2B_IV[2]); + row4l = VecXor(VecLoad64(&BLAKE2B_IV[4]), VecLoad64(&state.tf[0])); + row4h = VecXor(VecLoad64(&BLAKE2B_IV[6]), VecLoad64(&state.tf[2])); BLAKE2B_ROUND(0); BLAKE2B_ROUND(1); @@ -1210,10 +1210,10 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) BLAKE2B_ROUND(10); BLAKE2B_ROUND(11); - VectorStore64LE(&state.h[0], vec_xor(h0, vec_xor(row1l, row3l))); - VectorStore64LE(&state.h[2], vec_xor(h1, vec_xor(row1h, row3h))); - VectorStore64LE(&state.h[4], vec_xor(h2, vec_xor(row2l, row4l))); - VectorStore64LE(&state.h[6], vec_xor(h3, vec_xor(row2h, row4h))); + VecStore64LE(&state.h[0], VecXor(h0, VecXor(row1l, row3l))); + VecStore64LE(&state.h[2], VecXor(h1, VecXor(row1h, row3h))); + VecStore64LE(&state.h[4], VecXor(h2, VecXor(row2l, row4l))); + VecStore64LE(&state.h[6], VecXor(h3, VecXor(row2h, row4h))); } #endif // CRYPTOPP_POWER8_AVAILABLE diff --git a/blake2s_simd.cpp b/blake2s_simd.cpp index cc1ec137..1456ad23 100644 --- a/blake2s_simd.cpp +++ b/blake2s_simd.cpp @@ -683,34 +683,34 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state) #if (CRYPTOPP_ALTIVEC_AVAILABLE) -inline uint32x4_p VectorLoad32(const void* p) +inline uint32x4_p VecLoad32(const void* p) { - return VectorLoad((const word32*)p); + return VecLoad((const word32*)p); } -inline uint32x4_p VectorLoad32LE(const void* p) +inline uint32x4_p VecLoad32LE(const void* p) { #if __BIG_ENDIAN__ const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; - const uint32x4_p v = VectorLoad((const word32*)p); - return vec_perm(v, v, m); + const uint32x4_p v = VecLoad((const word32*)p); + return VecPermute(v, v, m); #else - return VectorLoad((const word32*)p); + return VecLoad((const word32*)p); #endif } -inline void VectorStore32(void* p, const uint32x4_p x) +inline void VecStore32(void* p, const uint32x4_p x) { - VectorStore(x, (word32*)p); + VecStore(x, (word32*)p); } -inline void VectorStore32LE(void* p, const uint32x4_p x) +inline void VecStore32LE(void* p, const uint32x4_p x) { #if __BIG_ENDIAN__ const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; - VectorStore(vec_perm(x, x, m), (word32*)p); + VecStore(VecPermute(x, x, m), (word32*)p); #else - VectorStore(x, (word32*)p); + VecStore(x, (word32*)p); #endif } @@ -718,7 +718,7 @@ template inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b) { // Re-index. I'd like to use something like Z=Y*4 and then - // VectorShiftLeftOctet(b) but it crashes early Red Hat + // VecShiftLeftOctet(b) but it crashes early Red Hat // GCC compilers. 
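The masks used by VecLoad32LE and VecStore32LE above ({3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}) do nothing more than reverse the bytes inside each 32-bit word, so a big-endian machine ends up with the same little-endian view of the message that a little-endian machine gets from a plain load. A portable scalar equivalent of the load side (illustration only, not part of the patch):

    #include <cstdint>

    // Read four 32-bit words little-endian regardless of host byte order;
    // this is the net effect of VecLoad followed by the byte-reversal permute.
    void Load32LE(const uint8_t src[16], uint32_t out[4])
    {
        for (int i = 0; i < 4; ++i) {
            const uint8_t* p = src + 4*i;
            out[i] = (uint32_t)p[0]         | ((uint32_t)p[1] << 8) |
                     ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
        }
    }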
enum {X=E1&3, Y=E2&3}; @@ -729,88 +729,88 @@ inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b) if (X == 0 && Y == 0) { const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, b, mask); + return VecPermute(a, b, mask); } else if (X == 0 && Y == 1) { const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<4>(b), mask); + return VecPermute(a, VecShiftLeftOctet<4>(b), mask); } else if (X == 0 && Y == 2) { const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<8>(b), mask); + return VecPermute(a, VecShiftLeftOctet<8>(b), mask); } else if (X == 0 && Y == 3) { const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<12>(b), mask); + return VecPermute(a, VecShiftLeftOctet<12>(b), mask); } // Element 1 combinations else if (X == 1 && Y == 0) { const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, b, mask); + return VecPermute(a, b, mask); } else if (X == 1 && Y == 1) { const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<4>(b), mask); + return VecPermute(a, VecShiftLeftOctet<4>(b), mask); } else if (X == 1 && Y == 2) { const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<8>(b), mask); + return VecPermute(a, VecShiftLeftOctet<8>(b), mask); } else if (X == 1 && Y == 3) { const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<12>(b), mask); + return VecPermute(a, VecShiftLeftOctet<12>(b), mask); } // Element 2 combinations else if (X == 2 && Y == 0) { const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, b, mask); + return VecPermute(a, b, mask); } else if (X == 2 && Y == 1) { const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<4>(b), mask); + return VecPermute(a, VecShiftLeftOctet<4>(b), mask); } else if (X == 2 && Y == 2) { const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<8>(b), mask); + return VecPermute(a, VecShiftLeftOctet<8>(b), mask); } else if (X == 2 && Y == 3) { const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<12>(b), mask); + return VecPermute(a, VecShiftLeftOctet<12>(b), mask); } // Element 3 combinations else if (X == 3 && Y == 0) { const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, b, mask); + return VecPermute(a, b, mask); } else if (X == 3 && Y == 1) { const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<4>(b), mask); + return VecPermute(a, VecShiftLeftOctet<4>(b), mask); } else if (X == 3 && Y == 2) { const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<8>(b), mask); + return VecPermute(a, VecShiftLeftOctet<8>(b), mask); } else if (X == 3 && Y == 3) { const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<12>(b), mask); + return VecPermute(a, VecShiftLeftOctet<12>(b), mask); } } @@ -826,7 +826,7 @@ inline uint32x4_p VectorSet32(const 
uint32x4_p a, const uint32x4_p b, // Power7 follows SSE2's implementation, and this is _mm_set_epi32. const uint8x16_p mask = {20,21,22,23, 16,17,18,19, 4,5,6,7, 0,1,2,3}; - return vec_perm(t0, t1, mask); + return VecPermute(t0, t1, mask); } template<> @@ -835,7 +835,7 @@ uint32x4_p VectorSet32<2,0,2,0>(const uint32x4_p a, const uint32x4_p b, { // a=b, c=d, mask is {2,0, 2,0} const uint8x16_p mask = {16,17,18,19, 24,25,26,27, 0,1,2,3, 8,9,10,11}; - return vec_perm(a, c, mask); + return VecPermute(a, c, mask); } template<> @@ -844,7 +844,7 @@ uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b, { // a=b, c=d, mask is {3,1, 3,1} const uint8x16_p mask = {20,21,22,23, 28,29,30,31, 4,5,6,7, 12,13,14,15}; - return vec_perm(a, c, mask); + return VecPermute(a, c, mask); } void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state) @@ -919,25 +919,25 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state) #define BLAKE2S_LOAD_MSG_9_3(buf) buf = VectorSet32<13,3,9,15>(m13,m3,m9,m15) #define BLAKE2S_LOAD_MSG_9_4(buf) buf = VectorSet32<0,12,14,11>(m0,m12,m14,m11) - #define vec_ror_16(x) VectorRotateRight<16>(x) - #define vec_ror_12(x) VectorRotateRight<12>(x) - #define vec_ror_8(x) VectorRotateRight<8>(x) - #define vec_ror_7(x) VectorRotateRight<7>(x) + #define vec_ror_16(x) VecRotateRight<16>(x) + #define vec_ror_12(x) VecRotateRight<12>(x) + #define vec_ror_8(x) VecRotateRight<8>(x) + #define vec_ror_7(x) VecRotateRight<7>(x) #define BLAKE2S_G1(row1,row2,row3,row4,buf) \ - row1 = vec_add(vec_add(row1, buf), row2); \ - row4 = vec_xor(row4, row1); \ + row1 = VecAdd(VecAdd(row1, buf), row2); \ + row4 = VecXor(row4, row1); \ row4 = vec_ror_16(row4); \ - row3 = vec_add(row3, row4); \ - row2 = vec_xor(row2, row3); \ + row3 = VecAdd(row3, row4); \ + row2 = VecXor(row2, row3); \ row2 = vec_ror_12(row2); #define BLAKE2S_G2(row1,row2,row3,row4,buf) \ - row1 = vec_add(vec_add(row1, buf), row2); \ - row4 = vec_xor(row4, row1); \ + row1 = VecAdd(VecAdd(row1, buf), row2); \ + row4 = VecXor(row4, row1); \ row4 = vec_ror_8(row4); \ - row3 = vec_add(row3, row4); \ - row2 = vec_xor(row2, row3); \ + row3 = VecAdd(row3, row4); \ + row2 = VecXor(row2, row3); \ row2 = vec_ror_7(row2); const uint8x16_p D2103_MASK = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11}; @@ -945,14 +945,14 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state) const uint8x16_p D0321_MASK = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3}; #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \ - row4 = vec_perm(row4, row4, D2103_MASK); \ - row3 = vec_perm(row3, row3, D1032_MASK); \ - row2 = vec_perm(row2, row2, D0321_MASK); + row4 = VecPermute(row4, row4, D2103_MASK); \ + row3 = VecPermute(row3, row3, D1032_MASK); \ + row2 = VecPermute(row2, row2, D0321_MASK); #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \ - row4 = vec_perm(row4, row4, D0321_MASK); \ - row3 = vec_perm(row3, row3, D1032_MASK); \ - row2 = vec_perm(row2, row2, D2103_MASK); + row4 = VecPermute(row4, row4, D0321_MASK); \ + row3 = VecPermute(row3, row3, D1032_MASK); \ + row2 = VecPermute(row2, row2, D2103_MASK); #define BLAKE2S_ROUND(r) \ BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \ @@ -970,15 +970,15 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state) uint32x4_p buf1, buf2, buf3, buf4; uint32x4_p ff0, ff1; - const uint32x4_p m0 = VectorLoad32LE(input + 0); - const uint32x4_p m4 = VectorLoad32LE(input + 16); - const uint32x4_p m8 = VectorLoad32LE(input + 32); - const uint32x4_p m12 = VectorLoad32LE(input + 48); + 
const uint32x4_p m0 = VecLoad32LE(input + 0); + const uint32x4_p m4 = VecLoad32LE(input + 16); + const uint32x4_p m8 = VecLoad32LE(input + 32); + const uint32x4_p m12 = VecLoad32LE(input + 48); - row1 = ff0 = VectorLoad32LE(&state.h[0]); - row2 = ff1 = VectorLoad32LE(&state.h[4]); - row3 = VectorLoad32(&BLAKE2S_IV[0]); - row4 = vec_xor(VectorLoad32(&BLAKE2S_IV[4]), VectorLoad32(&state.tf[0])); + row1 = ff0 = VecLoad32LE(&state.h[0]); + row2 = ff1 = VecLoad32LE(&state.h[4]); + row3 = VecLoad32(&BLAKE2S_IV[0]); + row4 = VecXor(VecLoad32(&BLAKE2S_IV[4]), VecLoad32(&state.tf[0])); BLAKE2S_ROUND(0); BLAKE2S_ROUND(1); @@ -991,8 +991,8 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state) BLAKE2S_ROUND(8); BLAKE2S_ROUND(9); - VectorStore32LE(&state.h[0], vec_xor(ff0, vec_xor(row1, row3))); - VectorStore32LE(&state.h[4], vec_xor(ff1, vec_xor(row2, row4))); + VecStore32LE(&state.h[0], VecXor(ff0, VecXor(row1, row3))); + VecStore32LE(&state.h[4], VecXor(ff1, VecXor(row2, row4))); } #endif // CRYPTOPP_ALTIVEC_AVAILABLE diff --git a/chacha_simd.cpp b/chacha_simd.cpp index 97e78f49..9a0bd6c3 100644 --- a/chacha_simd.cpp +++ b/chacha_simd.cpp @@ -206,7 +206,7 @@ inline __m128i RotateLeft<16>(const __m128i val) #if (CRYPTOPP_ALTIVEC_AVAILABLE) // ChaCha_OperateKeystream_POWER7 is optimized for POWER7. However, Altivec -// is supported by using vec_ld and vec_st, and using a composite vec_add +// is supported by using vec_ld and vec_st, and using a composite VecAdd // that supports 64-bit element adds. vec_ld and vec_st add significant // overhead when memory is not aligned. Despite the drawbacks Altivec // is profitable. The numbers for ChaCha8 are: @@ -216,33 +216,34 @@ inline __m128i RotateLeft<16>(const __m128i val) using CryptoPP::uint8x16_p; using CryptoPP::uint32x4_p; -using CryptoPP::VectorLoad; -using CryptoPP::VectorStore; +using CryptoPP::VecLoad; +using CryptoPP::VecStore; +using CryptoPP::VecPermute; // Permutes bytes in packed 32-bit words to little endian. // State is already in proper endian order. Input and // output must be permuted during load and save. -inline uint32x4_p VectorLoad32LE(const uint8_t src[16]) +inline uint32x4_p VecLoad32LE(const uint8_t src[16]) { #if (CRYPTOPP_BIG_ENDIAN) const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; - const uint32x4_p val = VectorLoad(src); - return vec_perm(val, val, mask); + const uint32x4_p val = VecLoad(src); + return VecPermute(val, val, mask); #else - return VectorLoad(src); + return VecLoad(src); #endif } // Permutes bytes in packed 32-bit words to little endian. // State is already in proper endian order. Input and // output must be permuted during load and save. 
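The comment above mentions a composite VecAdd that provides 64-bit element adds on plain Altivec, which only has 32-bit lanes. The composite is ordinary schoolbook addition: add the low 32-bit halves, detect the carry, and fold it into the high halves. A hedged scalar sketch of the idea (not the library's actual VecAdd64 code):

    #include <cstdint>

    // 64-bit add built from 32-bit halves with explicit carry propagation.
    void Add64(uint32_t a_hi, uint32_t a_lo, uint32_t b_hi, uint32_t b_lo,
               uint32_t& r_hi, uint32_t& r_lo)
    {
        r_lo = a_lo + b_lo;                    // low halves (mod 2^32)
        const uint32_t carry = (r_lo < a_lo);  // 1 if the low add wrapped
        r_hi = a_hi + b_hi + carry;            // high halves absorb the carry
    }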
-inline void VectorStore32LE(uint8_t dest[16], const uint32x4_p& val) +inline void VecStore32LE(uint8_t dest[16], const uint32x4_p& val) { #if (CRYPTOPP_BIG_ENDIAN) const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; - VectorStore(vec_perm(val, val, mask), dest); + VecStore(VecPermute(val, val, mask), dest); #else - return VectorStore(val, dest); + return VecStore(val, dest); #endif } @@ -262,21 +263,21 @@ template <> inline uint32x4_p Shuffle<1>(const uint32x4_p& val) { const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3}; - return vec_perm(val, val, mask); + return VecPermute(val, val, mask); } template <> inline uint32x4_p Shuffle<2>(const uint32x4_p& val) { const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7}; - return vec_perm(val, val, mask); + return VecPermute(val, val, mask); } template <> inline uint32x4_p Shuffle<3>(const uint32x4_p& val) { const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11}; - return vec_perm(val, val, mask); + return VecPermute(val, val, mask); } #endif // CRYPTOPP_ALTIVEC_AVAILABLE @@ -825,10 +826,10 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte * void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds) { - const uint32x4_p state0 = VectorLoad(state + 0*4); - const uint32x4_p state1 = VectorLoad(state + 1*4); - const uint32x4_p state2 = VectorLoad(state + 2*4); - const uint32x4_p state3 = VectorLoad(state + 3*4); + const uint32x4_p state0 = VecLoad(state + 0*4); + const uint32x4_p state1 = VecLoad(state + 1*4); + const uint32x4_p state2 = VecLoad(state + 2*4); + const uint32x4_p state3 = VecLoad(state + 3*4); const uint32x4_p CTRS[3] = { {1,0,0,0}, {2,0,0,0}, {3,0,0,0} @@ -842,79 +843,79 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte uint32x4_p r1_0 = state0; uint32x4_p r1_1 = state1; uint32x4_p r1_2 = state2; - uint32x4_p r1_3 = VectorAdd64(r0_3, CTRS[0]); + uint32x4_p r1_3 = VecAdd64(r0_3, CTRS[0]); uint32x4_p r2_0 = state0; uint32x4_p r2_1 = state1; uint32x4_p r2_2 = state2; - uint32x4_p r2_3 = VectorAdd64(r0_3, CTRS[1]); + uint32x4_p r2_3 = VecAdd64(r0_3, CTRS[1]); uint32x4_p r3_0 = state0; uint32x4_p r3_1 = state1; uint32x4_p r3_2 = state2; - uint32x4_p r3_3 = VectorAdd64(r0_3, CTRS[2]); + uint32x4_p r3_3 = VecAdd64(r0_3, CTRS[2]); for (int i = static_cast(rounds); i > 0; i -= 2) { - r0_0 = VectorAdd(r0_0, r0_1); - r1_0 = VectorAdd(r1_0, r1_1); - r2_0 = VectorAdd(r2_0, r2_1); - r3_0 = VectorAdd(r3_0, r3_1); + r0_0 = VecAdd(r0_0, r0_1); + r1_0 = VecAdd(r1_0, r1_1); + r2_0 = VecAdd(r2_0, r2_1); + r3_0 = VecAdd(r3_0, r3_1); - r0_3 = VectorXor(r0_3, r0_0); - r1_3 = VectorXor(r1_3, r1_0); - r2_3 = VectorXor(r2_3, r2_0); - r3_3 = VectorXor(r3_3, r3_0); + r0_3 = VecXor(r0_3, r0_0); + r1_3 = VecXor(r1_3, r1_0); + r2_3 = VecXor(r2_3, r2_0); + r3_3 = VecXor(r3_3, r3_0); - r0_3 = VectorRotateLeft<16>(r0_3); - r1_3 = VectorRotateLeft<16>(r1_3); - r2_3 = VectorRotateLeft<16>(r2_3); - r3_3 = VectorRotateLeft<16>(r3_3); + r0_3 = VecRotateLeft<16>(r0_3); + r1_3 = VecRotateLeft<16>(r1_3); + r2_3 = VecRotateLeft<16>(r2_3); + r3_3 = VecRotateLeft<16>(r3_3); - r0_2 = VectorAdd(r0_2, r0_3); - r1_2 = VectorAdd(r1_2, r1_3); - r2_2 = VectorAdd(r2_2, r2_3); - r3_2 = VectorAdd(r3_2, r3_3); + r0_2 = VecAdd(r0_2, r0_3); + r1_2 = VecAdd(r1_2, r1_3); + r2_2 = VecAdd(r2_2, r2_3); + r3_2 = VecAdd(r3_2, r3_3); - r0_1 = VectorXor(r0_1, r0_2); - r1_1 = VectorXor(r1_1, r1_2); - r2_1 = VectorXor(r2_1, r2_2); 
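The long VecAdd/VecXor/VecRotateLeft<16,12,8,7> sequence being renamed here is four column (and later diagonal) copies of the standard ChaCha quarter round, run across whole rows at once. For reference, the scalar quarter round the vector code mirrors (illustration only, not part of the patch):

    #include <cstdint>

    static inline uint32_t rotl32(uint32_t v, unsigned n)
    {
        return (v << n) | (v >> (32u - n));   // n is always 7, 8, 12 or 16 here
    }

    void ChaChaQuarterRound(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d)
    {
        a += b; d ^= a; d = rotl32(d, 16);
        c += d; b ^= c; b = rotl32(b, 12);
        a += b; d ^= a; d = rotl32(d, 8);
        c += d; b ^= c; b = rotl32(b, 7);
    }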
- r3_1 = VectorXor(r3_1, r3_2); + r0_1 = VecXor(r0_1, r0_2); + r1_1 = VecXor(r1_1, r1_2); + r2_1 = VecXor(r2_1, r2_2); + r3_1 = VecXor(r3_1, r3_2); - r0_1 = VectorRotateLeft<12>(r0_1); - r1_1 = VectorRotateLeft<12>(r1_1); - r2_1 = VectorRotateLeft<12>(r2_1); - r3_1 = VectorRotateLeft<12>(r3_1); + r0_1 = VecRotateLeft<12>(r0_1); + r1_1 = VecRotateLeft<12>(r1_1); + r2_1 = VecRotateLeft<12>(r2_1); + r3_1 = VecRotateLeft<12>(r3_1); - r0_0 = VectorAdd(r0_0, r0_1); - r1_0 = VectorAdd(r1_0, r1_1); - r2_0 = VectorAdd(r2_0, r2_1); - r3_0 = VectorAdd(r3_0, r3_1); + r0_0 = VecAdd(r0_0, r0_1); + r1_0 = VecAdd(r1_0, r1_1); + r2_0 = VecAdd(r2_0, r2_1); + r3_0 = VecAdd(r3_0, r3_1); - r0_3 = VectorXor(r0_3, r0_0); - r1_3 = VectorXor(r1_3, r1_0); - r2_3 = VectorXor(r2_3, r2_0); - r3_3 = VectorXor(r3_3, r3_0); + r0_3 = VecXor(r0_3, r0_0); + r1_3 = VecXor(r1_3, r1_0); + r2_3 = VecXor(r2_3, r2_0); + r3_3 = VecXor(r3_3, r3_0); - r0_3 = VectorRotateLeft<8>(r0_3); - r1_3 = VectorRotateLeft<8>(r1_3); - r2_3 = VectorRotateLeft<8>(r2_3); - r3_3 = VectorRotateLeft<8>(r3_3); + r0_3 = VecRotateLeft<8>(r0_3); + r1_3 = VecRotateLeft<8>(r1_3); + r2_3 = VecRotateLeft<8>(r2_3); + r3_3 = VecRotateLeft<8>(r3_3); - r0_2 = VectorAdd(r0_2, r0_3); - r1_2 = VectorAdd(r1_2, r1_3); - r2_2 = VectorAdd(r2_2, r2_3); - r3_2 = VectorAdd(r3_2, r3_3); + r0_2 = VecAdd(r0_2, r0_3); + r1_2 = VecAdd(r1_2, r1_3); + r2_2 = VecAdd(r2_2, r2_3); + r3_2 = VecAdd(r3_2, r3_3); - r0_1 = VectorXor(r0_1, r0_2); - r1_1 = VectorXor(r1_1, r1_2); - r2_1 = VectorXor(r2_1, r2_2); - r3_1 = VectorXor(r3_1, r3_2); + r0_1 = VecXor(r0_1, r0_2); + r1_1 = VecXor(r1_1, r1_2); + r2_1 = VecXor(r2_1, r2_2); + r3_1 = VecXor(r3_1, r3_2); - r0_1 = VectorRotateLeft<7>(r0_1); - r1_1 = VectorRotateLeft<7>(r1_1); - r2_1 = VectorRotateLeft<7>(r2_1); - r3_1 = VectorRotateLeft<7>(r3_1); + r0_1 = VecRotateLeft<7>(r0_1); + r1_1 = VecRotateLeft<7>(r1_1); + r2_1 = VecRotateLeft<7>(r2_1); + r3_1 = VecRotateLeft<7>(r3_1); r0_1 = Shuffle<1>(r0_1); r0_2 = Shuffle<2>(r0_2); @@ -932,65 +933,65 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte r3_2 = Shuffle<2>(r3_2); r3_3 = Shuffle<3>(r3_3); - r0_0 = VectorAdd(r0_0, r0_1); - r1_0 = VectorAdd(r1_0, r1_1); - r2_0 = VectorAdd(r2_0, r2_1); - r3_0 = VectorAdd(r3_0, r3_1); + r0_0 = VecAdd(r0_0, r0_1); + r1_0 = VecAdd(r1_0, r1_1); + r2_0 = VecAdd(r2_0, r2_1); + r3_0 = VecAdd(r3_0, r3_1); - r0_3 = VectorXor(r0_3, r0_0); - r1_3 = VectorXor(r1_3, r1_0); - r2_3 = VectorXor(r2_3, r2_0); - r3_3 = VectorXor(r3_3, r3_0); + r0_3 = VecXor(r0_3, r0_0); + r1_3 = VecXor(r1_3, r1_0); + r2_3 = VecXor(r2_3, r2_0); + r3_3 = VecXor(r3_3, r3_0); - r0_3 = VectorRotateLeft<16>(r0_3); - r1_3 = VectorRotateLeft<16>(r1_3); - r2_3 = VectorRotateLeft<16>(r2_3); - r3_3 = VectorRotateLeft<16>(r3_3); + r0_3 = VecRotateLeft<16>(r0_3); + r1_3 = VecRotateLeft<16>(r1_3); + r2_3 = VecRotateLeft<16>(r2_3); + r3_3 = VecRotateLeft<16>(r3_3); - r0_2 = VectorAdd(r0_2, r0_3); - r1_2 = VectorAdd(r1_2, r1_3); - r2_2 = VectorAdd(r2_2, r2_3); - r3_2 = VectorAdd(r3_2, r3_3); + r0_2 = VecAdd(r0_2, r0_3); + r1_2 = VecAdd(r1_2, r1_3); + r2_2 = VecAdd(r2_2, r2_3); + r3_2 = VecAdd(r3_2, r3_3); - r0_1 = VectorXor(r0_1, r0_2); - r1_1 = VectorXor(r1_1, r1_2); - r2_1 = VectorXor(r2_1, r2_2); - r3_1 = VectorXor(r3_1, r3_2); + r0_1 = VecXor(r0_1, r0_2); + r1_1 = VecXor(r1_1, r1_2); + r2_1 = VecXor(r2_1, r2_2); + r3_1 = VecXor(r3_1, r3_2); - r0_1 = VectorRotateLeft<12>(r0_1); - r1_1 = VectorRotateLeft<12>(r1_1); - r2_1 = VectorRotateLeft<12>(r2_1); - r3_1 = 
VectorRotateLeft<12>(r3_1); + r0_1 = VecRotateLeft<12>(r0_1); + r1_1 = VecRotateLeft<12>(r1_1); + r2_1 = VecRotateLeft<12>(r2_1); + r3_1 = VecRotateLeft<12>(r3_1); - r0_0 = VectorAdd(r0_0, r0_1); - r1_0 = VectorAdd(r1_0, r1_1); - r2_0 = VectorAdd(r2_0, r2_1); - r3_0 = VectorAdd(r3_0, r3_1); + r0_0 = VecAdd(r0_0, r0_1); + r1_0 = VecAdd(r1_0, r1_1); + r2_0 = VecAdd(r2_0, r2_1); + r3_0 = VecAdd(r3_0, r3_1); - r0_3 = VectorXor(r0_3, r0_0); - r1_3 = VectorXor(r1_3, r1_0); - r2_3 = VectorXor(r2_3, r2_0); - r3_3 = VectorXor(r3_3, r3_0); + r0_3 = VecXor(r0_3, r0_0); + r1_3 = VecXor(r1_3, r1_0); + r2_3 = VecXor(r2_3, r2_0); + r3_3 = VecXor(r3_3, r3_0); - r0_3 = VectorRotateLeft<8>(r0_3); - r1_3 = VectorRotateLeft<8>(r1_3); - r2_3 = VectorRotateLeft<8>(r2_3); - r3_3 = VectorRotateLeft<8>(r3_3); + r0_3 = VecRotateLeft<8>(r0_3); + r1_3 = VecRotateLeft<8>(r1_3); + r2_3 = VecRotateLeft<8>(r2_3); + r3_3 = VecRotateLeft<8>(r3_3); - r0_2 = VectorAdd(r0_2, r0_3); - r1_2 = VectorAdd(r1_2, r1_3); - r2_2 = VectorAdd(r2_2, r2_3); - r3_2 = VectorAdd(r3_2, r3_3); + r0_2 = VecAdd(r0_2, r0_3); + r1_2 = VecAdd(r1_2, r1_3); + r2_2 = VecAdd(r2_2, r2_3); + r3_2 = VecAdd(r3_2, r3_3); - r0_1 = VectorXor(r0_1, r0_2); - r1_1 = VectorXor(r1_1, r1_2); - r2_1 = VectorXor(r2_1, r2_2); - r3_1 = VectorXor(r3_1, r3_2); + r0_1 = VecXor(r0_1, r0_2); + r1_1 = VecXor(r1_1, r1_2); + r2_1 = VecXor(r2_1, r2_2); + r3_1 = VecXor(r3_1, r3_2); - r0_1 = VectorRotateLeft<7>(r0_1); - r1_1 = VectorRotateLeft<7>(r1_1); - r2_1 = VectorRotateLeft<7>(r2_1); - r3_1 = VectorRotateLeft<7>(r3_1); + r0_1 = VecRotateLeft<7>(r0_1); + r1_1 = VecRotateLeft<7>(r1_1); + r2_1 = VecRotateLeft<7>(r2_1); + r3_1 = VecRotateLeft<7>(r3_1); r0_1 = Shuffle<3>(r0_1); r0_2 = Shuffle<2>(r0_2); @@ -1009,80 +1010,80 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte r3_3 = Shuffle<1>(r3_3); } - r0_0 = VectorAdd(r0_0, state0); - r0_1 = VectorAdd(r0_1, state1); - r0_2 = VectorAdd(r0_2, state2); - r0_3 = VectorAdd(r0_3, state3); + r0_0 = VecAdd(r0_0, state0); + r0_1 = VecAdd(r0_1, state1); + r0_2 = VecAdd(r0_2, state2); + r0_3 = VecAdd(r0_3, state3); - r1_0 = VectorAdd(r1_0, state0); - r1_1 = VectorAdd(r1_1, state1); - r1_2 = VectorAdd(r1_2, state2); - r1_3 = VectorAdd(r1_3, state3); - r1_3 = VectorAdd64(r1_3, CTRS[0]); + r1_0 = VecAdd(r1_0, state0); + r1_1 = VecAdd(r1_1, state1); + r1_2 = VecAdd(r1_2, state2); + r1_3 = VecAdd(r1_3, state3); + r1_3 = VecAdd64(r1_3, CTRS[0]); - r2_0 = VectorAdd(r2_0, state0); - r2_1 = VectorAdd(r2_1, state1); - r2_2 = VectorAdd(r2_2, state2); - r2_3 = VectorAdd(r2_3, state3); - r2_3 = VectorAdd64(r2_3, CTRS[1]); + r2_0 = VecAdd(r2_0, state0); + r2_1 = VecAdd(r2_1, state1); + r2_2 = VecAdd(r2_2, state2); + r2_3 = VecAdd(r2_3, state3); + r2_3 = VecAdd64(r2_3, CTRS[1]); - r3_0 = VectorAdd(r3_0, state0); - r3_1 = VectorAdd(r3_1, state1); - r3_2 = VectorAdd(r3_2, state2); - r3_3 = VectorAdd(r3_3, state3); - r3_3 = VectorAdd64(r3_3, CTRS[2]); + r3_0 = VecAdd(r3_0, state0); + r3_1 = VecAdd(r3_1, state1); + r3_2 = VecAdd(r3_2, state2); + r3_3 = VecAdd(r3_3, state3); + r3_3 = VecAdd64(r3_3, CTRS[2]); if (input) { - r0_0 = VectorXor(VectorLoad32LE(input + 0*16), r0_0); - r0_1 = VectorXor(VectorLoad32LE(input + 1*16), r0_1); - r0_2 = VectorXor(VectorLoad32LE(input + 2*16), r0_2); - r0_3 = VectorXor(VectorLoad32LE(input + 3*16), r0_3); + r0_0 = VecXor(VecLoad32LE(input + 0*16), r0_0); + r0_1 = VecXor(VecLoad32LE(input + 1*16), r0_1); + r0_2 = VecXor(VecLoad32LE(input + 2*16), r0_2); + r0_3 = VecXor(VecLoad32LE(input + 
3*16), r0_3); } - VectorStore32LE(output + 0*16, r0_0); - VectorStore32LE(output + 1*16, r0_1); - VectorStore32LE(output + 2*16, r0_2); - VectorStore32LE(output + 3*16, r0_3); + VecStore32LE(output + 0*16, r0_0); + VecStore32LE(output + 1*16, r0_1); + VecStore32LE(output + 2*16, r0_2); + VecStore32LE(output + 3*16, r0_3); if (input) { - r1_0 = VectorXor(VectorLoad32LE(input + 4*16), r1_0); - r1_1 = VectorXor(VectorLoad32LE(input + 5*16), r1_1); - r1_2 = VectorXor(VectorLoad32LE(input + 6*16), r1_2); - r1_3 = VectorXor(VectorLoad32LE(input + 7*16), r1_3); + r1_0 = VecXor(VecLoad32LE(input + 4*16), r1_0); + r1_1 = VecXor(VecLoad32LE(input + 5*16), r1_1); + r1_2 = VecXor(VecLoad32LE(input + 6*16), r1_2); + r1_3 = VecXor(VecLoad32LE(input + 7*16), r1_3); } - VectorStore32LE(output + 4*16, r1_0); - VectorStore32LE(output + 5*16, r1_1); - VectorStore32LE(output + 6*16, r1_2); - VectorStore32LE(output + 7*16, r1_3); + VecStore32LE(output + 4*16, r1_0); + VecStore32LE(output + 5*16, r1_1); + VecStore32LE(output + 6*16, r1_2); + VecStore32LE(output + 7*16, r1_3); if (input) { - r2_0 = VectorXor(VectorLoad32LE(input + 8*16), r2_0); - r2_1 = VectorXor(VectorLoad32LE(input + 9*16), r2_1); - r2_2 = VectorXor(VectorLoad32LE(input + 10*16), r2_2); - r2_3 = VectorXor(VectorLoad32LE(input + 11*16), r2_3); + r2_0 = VecXor(VecLoad32LE(input + 8*16), r2_0); + r2_1 = VecXor(VecLoad32LE(input + 9*16), r2_1); + r2_2 = VecXor(VecLoad32LE(input + 10*16), r2_2); + r2_3 = VecXor(VecLoad32LE(input + 11*16), r2_3); } - VectorStore32LE(output + 8*16, r2_0); - VectorStore32LE(output + 9*16, r2_1); - VectorStore32LE(output + 10*16, r2_2); - VectorStore32LE(output + 11*16, r2_3); + VecStore32LE(output + 8*16, r2_0); + VecStore32LE(output + 9*16, r2_1); + VecStore32LE(output + 10*16, r2_2); + VecStore32LE(output + 11*16, r2_3); if (input) { - r3_0 = VectorXor(VectorLoad32LE(input + 12*16), r3_0); - r3_1 = VectorXor(VectorLoad32LE(input + 13*16), r3_1); - r3_2 = VectorXor(VectorLoad32LE(input + 14*16), r3_2); - r3_3 = VectorXor(VectorLoad32LE(input + 15*16), r3_3); + r3_0 = VecXor(VecLoad32LE(input + 12*16), r3_0); + r3_1 = VecXor(VecLoad32LE(input + 13*16), r3_1); + r3_2 = VecXor(VecLoad32LE(input + 14*16), r3_2); + r3_3 = VecXor(VecLoad32LE(input + 15*16), r3_3); } - VectorStore32LE(output + 12*16, r3_0); - VectorStore32LE(output + 13*16, r3_1); - VectorStore32LE(output + 14*16, r3_2); - VectorStore32LE(output + 15*16, r3_3); + VecStore32LE(output + 12*16, r3_0); + VecStore32LE(output + 13*16, r3_1); + VecStore32LE(output + 14*16, r3_2); + VecStore32LE(output + 15*16, r3_3); } #endif // CRYPTOPP_ALTIVEC_AVAILABLE diff --git a/gcm_simd.cpp b/gcm_simd.cpp index 2b054898..3e42f16b 100644 --- a/gcm_simd.cpp +++ b/gcm_simd.cpp @@ -171,16 +171,16 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b) #if CRYPTOPP_POWER8_VMULL_AVAILABLE using CryptoPP::uint32x4_p; using CryptoPP::uint64x2_p; -using CryptoPP::VectorGetLow; -using CryptoPP::VectorGetHigh; -using CryptoPP::VectorRotateLeftOctet; +using CryptoPP::VecGetLow; +using CryptoPP::VecGetHigh; +using CryptoPP::VecRotateLeftOctet; // POWER8 GCM mode is confusing. The algorithm is reflected so // nearly everything we do is reversed for a little-endian system, // including on big-endian machines. 
VMULL2LE swaps dwords for a // little endian machine; VMULL_00LE, VMULL_01LE, VMULL_10LE and // VMULL_11LE are backwards and (1) read low words with -// VectorGetHigh, (2) read high words with VectorGetLow, and +// VecGetHigh, (2) read high words with VecGetLow, and // (3) yields a product that is endian swapped. The steps ensures // GCM parameters are presented in the correct order for the // algorithm on both big and little-endian systems, but it is @@ -192,7 +192,7 @@ using CryptoPP::VectorRotateLeftOctet; inline uint64x2_p VMULL2LE(const uint64x2_p& val) { #if (CRYPTOPP_BIG_ENDIAN) - return VectorRotateLeftOctet<8>(val); + return VecRotateLeftOctet<8>(val); #else return val; #endif @@ -202,48 +202,48 @@ inline uint64x2_p VMULL2LE(const uint64x2_p& val) inline uint64x2_p VMULL_00LE(const uint64x2_p& a, const uint64x2_p& b) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - return VMULL2LE(__vpmsumd (VectorGetHigh(a), VectorGetHigh(b))); + return VMULL2LE(__vpmsumd (VecGetHigh(a), VecGetHigh(b))); #else - return VMULL2LE(__builtin_crypto_vpmsumd (VectorGetHigh(a), VectorGetHigh(b))); + return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b))); #endif } // _mm_clmulepi64_si128(a, b, 0x01) inline uint64x2_p VMULL_01LE(const uint64x2_p& a, const uint64x2_p& b) { - // Small speedup. VectorGetHigh(b) ensures the high dword of 'b' is 0. + // Small speedup. VecGetHigh(b) ensures the high dword of 'b' is 0. // The 0 used in the vmull yields 0 for the high product, so the high // dword of 'a' is "don't care". #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - return VMULL2LE(__vpmsumd (a, VectorGetHigh(b))); + return VMULL2LE(__vpmsumd (a, VecGetHigh(b))); #else - return VMULL2LE(__builtin_crypto_vpmsumd (a, VectorGetHigh(b))); + return VMULL2LE(__builtin_crypto_vpmsumd (a, VecGetHigh(b))); #endif } // _mm_clmulepi64_si128(a, b, 0x10) inline uint64x2_p VMULL_10LE(const uint64x2_p& a, const uint64x2_p& b) { - // Small speedup. VectorGetHigh(a) ensures the high dword of 'a' is 0. + // Small speedup. VecGetHigh(a) ensures the high dword of 'a' is 0. // The 0 used in the vmull yields 0 for the high product, so the high // dword of 'b' is "don't care". #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - return VMULL2LE(__vpmsumd (VectorGetHigh(a), b)); + return VMULL2LE(__vpmsumd (VecGetHigh(a), b)); #else - return VMULL2LE(__builtin_crypto_vpmsumd (VectorGetHigh(a), b)); + return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), b)); #endif } // _mm_clmulepi64_si128(a, b, 0x11) inline uint64x2_p VMULL_11LE(const uint64x2_p& a, const uint64x2_p& b) { - // Small speedup. VectorGetLow(a) ensures the high dword of 'a' is 0. + // Small speedup. VecGetLow(a) ensures the high dword of 'a' is 0. // The 0 used in the vmull yields 0 for the high product, so the high // dword of 'b' is "don't care". 
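The VMULL_00LE/01LE/10LE/11LE helpers wrap vpmsumd, whose core operation is a 64x64 -> 128-bit carry-less (polynomial) multiply; the VecGetLow/VecGetHigh calls zero one doubleword so only a single product survives, mirroring the immediate of _mm_clmulepi64_si128. As a reference for what "carry-less" means, a portable scalar sketch (illustration only, far slower than the intrinsic):

    #include <cstdint>

    // Carry-less multiply: partial products are XORed instead of added.
    void CarrylessMul64(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo)
    {
        hi = lo = 0;
        for (unsigned i = 0; i < 64; ++i) {
            if ((b >> i) & 1) {
                lo ^= a << i;
                if (i != 0)
                    hi ^= a >> (64u - i);   // bits pushed out of the low word
            }
        }
    }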
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - return VMULL2LE(__vpmsumd (VectorGetLow(a), b)); + return VMULL2LE(__vpmsumd (VecGetLow(a), b)); #else - return VMULL2LE(__builtin_crypto_vpmsumd (VectorGetLow(a), b)); + return VMULL2LE(__builtin_crypto_vpmsumd (VecGetLow(a), b)); #endif } #endif // CRYPTOPP_POWER8_VMULL_AVAILABLE @@ -373,7 +373,7 @@ bool CPU_ProbePMULL() const uint64x2_p r3 = VMULL_10LE((uint64x2_p)(a), (uint64x2_p)(b)); const uint64x2_p r4 = VMULL_11LE((uint64x2_p)(a), (uint64x2_p)(b)); - result = VectorNotEqual(r1, r2) && VectorNotEqual(r3, r4); + result = VecNotEqual(r1, r2) && VecNotEqual(r3, r4); } sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); @@ -743,7 +743,7 @@ void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer) #if CRYPTOPP_ALTIVEC_AVAILABLE void GCM_Xor16_ALTIVEC(byte *a, const byte *b, const byte *c) { - VectorStore(VectorXor(VectorLoad(b), VectorLoad(c)), a); + VecStore(VecXor(VecLoad(b), VecLoad(c)), a); } #endif // CRYPTOPP_ALTIVEC_AVAILABLE @@ -753,22 +753,22 @@ uint64x2_p GCM_Reduce_VMULL(uint64x2_p c0, uint64x2_p c1, uint64x2_p c2, uint64x { const uint64x2_p m1 = {1,1}, m63 = {63,63}; - c1 = VectorXor(c1, VectorShiftRightOctet<8>(c0)); - c1 = VectorXor(c1, VMULL_10LE(c0, r)); - c0 = VectorXor(c1, VectorShiftLeftOctet<8>(c0)); + c1 = VecXor(c1, VecShiftRightOctet<8>(c0)); + c1 = VecXor(c1, VMULL_10LE(c0, r)); + c0 = VecXor(c1, VecShiftLeftOctet<8>(c0)); c0 = VMULL_00LE(vec_sl(c0, m1), r); - c2 = VectorXor(c2, c0); - c2 = VectorXor(c2, VectorShiftLeftOctet<8>(c1)); + c2 = VecXor(c2, c0); + c2 = VecXor(c2, VecShiftLeftOctet<8>(c1)); c1 = vec_sr(vec_mergeh(c1, c2), m63); c2 = vec_sl(c2, m1); - return VectorXor(c2, c1); + return VecXor(c2, c1); } inline uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r) { const uint64x2_p c0 = VMULL_00LE(x, h); - const uint64x2_p c1 = VectorXor(VMULL_01LE(x, h), VMULL_10LE(x, h)); + const uint64x2_p c1 = VecXor(VMULL_01LE(x, h), VMULL_10LE(x, h)); const uint64x2_p c2 = VMULL_11LE(x, h); return GCM_Reduce_VMULL(c0, c1, c2, r); @@ -777,13 +777,13 @@ inline uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r) inline uint64x2_p LoadHashKey(const byte *hashKey) { #if (CRYPTOPP_BIG_ENDIAN) - const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey); + const uint64x2_p key = (uint64x2_p)VecLoad(hashKey); const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7}; - return vec_perm(key, key, mask); + return VecPermute(key, key, mask); #else - const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey); + const uint64x2_p key = (uint64x2_p)VecLoad(hashKey); const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; - return vec_perm(key, key, mask); + return VecPermute(key, key, mask); #endif } @@ -798,21 +798,21 @@ void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned for (i=0; i inline T SwapWords(const T& data) { - return (T)VectorRotateLeftOctet<8>(data); + return (T)VecRotateLeftOctet<8>(data); } inline uint64x2_p LoadBuffer1(const byte *dataBuffer) { #if (CRYPTOPP_BIG_ENDIAN) - return (uint64x2_p)VectorLoad(dataBuffer); + return (uint64x2_p)VecLoad(dataBuffer); #else - const uint64x2_p data = (uint64x2_p)VectorLoad(dataBuffer); + const uint64x2_p data = (uint64x2_p)VecLoad(dataBuffer); const uint8x16_p mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8}; - return vec_perm(data, data, mask); + return VecPermute(data, data, mask); #endif } inline uint64x2_p LoadBuffer2(const byte *dataBuffer) { #if (CRYPTOPP_BIG_ENDIAN) - return 
(uint64x2_p)SwapWords(VectorLoadBE(dataBuffer)); + return (uint64x2_p)SwapWords(VecLoadBE(dataBuffer)); #else - return (uint64x2_p)VectorLoadBE(dataBuffer); + return (uint64x2_p)VecLoadBE(dataBuffer); #endif } size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer) { const uint64x2_p r = {0xe100000000000000ull, 0xc200000000000000ull}; - uint64x2_p x = (uint64x2_p)VectorLoad(hbuffer); + uint64x2_p x = (uint64x2_p)VecLoad(hbuffer); while (len >= 16) { @@ -856,59 +856,59 @@ size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mt while (true) { - const uint64x2_p h0 = (uint64x2_p)VectorLoad(mtable+(i+0)*16); - const uint64x2_p h1 = (uint64x2_p)VectorLoad(mtable+(i+1)*16); - const uint64x2_p h2 = (uint64x2_p)VectorXor(h0, h1); + const uint64x2_p h0 = (uint64x2_p)VecLoad(mtable+(i+0)*16); + const uint64x2_p h1 = (uint64x2_p)VecLoad(mtable+(i+1)*16); + const uint64x2_p h2 = (uint64x2_p)VecXor(h0, h1); if (++i == s) { d1 = LoadBuffer2(data); - d1 = VectorXor(d1, x); - c0 = VectorXor(c0, VMULL_00LE(d1, h0)); - c2 = VectorXor(c2, VMULL_01LE(d1, h1)); - d1 = VectorXor(d1, SwapWords(d1)); - c1 = VectorXor(c1, VMULL_00LE(d1, h2)); + d1 = VecXor(d1, x); + c0 = VecXor(c0, VMULL_00LE(d1, h0)); + c2 = VecXor(c2, VMULL_01LE(d1, h1)); + d1 = VecXor(d1, SwapWords(d1)); + c1 = VecXor(c1, VMULL_00LE(d1, h2)); break; } d1 = LoadBuffer1(data+(s-i)*16-8); - c0 = VectorXor(c0, VMULL_01LE(d2, h0)); - c2 = VectorXor(c2, VMULL_01LE(d1, h1)); - d2 = VectorXor(d2, d1); - c1 = VectorXor(c1, VMULL_01LE(d2, h2)); + c0 = VecXor(c0, VMULL_01LE(d2, h0)); + c2 = VecXor(c2, VMULL_01LE(d1, h1)); + d2 = VecXor(d2, d1); + c1 = VecXor(c1, VMULL_01LE(d2, h2)); if (++i == s) { d1 = LoadBuffer2(data); - d1 = VectorXor(d1, x); - c0 = VectorXor(c0, VMULL_10LE(d1, h0)); - c2 = VectorXor(c2, VMULL_11LE(d1, h1)); - d1 = VectorXor(d1, SwapWords(d1)); - c1 = VectorXor(c1, VMULL_10LE(d1, h2)); + d1 = VecXor(d1, x); + c0 = VecXor(c0, VMULL_10LE(d1, h0)); + c2 = VecXor(c2, VMULL_11LE(d1, h1)); + d1 = VecXor(d1, SwapWords(d1)); + c1 = VecXor(c1, VMULL_10LE(d1, h2)); break; } d2 = LoadBuffer2(data+(s-i)*16-8); - c0 = VectorXor(c0, VMULL_10LE(d1, h0)); - c2 = VectorXor(c2, VMULL_10LE(d2, h1)); - d1 = VectorXor(d1, d2); - c1 = VectorXor(c1, VMULL_10LE(d1, h2)); + c0 = VecXor(c0, VMULL_10LE(d1, h0)); + c2 = VecXor(c2, VMULL_10LE(d2, h1)); + d1 = VecXor(d1, d2); + c1 = VecXor(c1, VMULL_10LE(d1, h2)); } data += s*16; len -= s*16; - c1 = VectorXor(VectorXor(c1, c0), c2); + c1 = VecXor(VecXor(c1, c0), c2); x = GCM_Reduce_VMULL(c0, c1, c2, r); } - VectorStore(x, hbuffer); + VecStore(x, hbuffer); return len; } void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer) { const uint64x2_p mask = {0x08090a0b0c0d0e0full, 0x0001020304050607ull}; - VectorStore(VectorPermute(VectorLoad(hashBuffer), mask), hashBuffer); + VecStore(VecPermute(VecLoad(hashBuffer), mask), hashBuffer); } #endif // CRYPTOPP_POWER8_VMULL_AVAILABLE diff --git a/lea_simd.cpp b/lea_simd.cpp index 245407bf..8278330e 100644 --- a/lea_simd.cpp +++ b/lea_simd.cpp @@ -439,17 +439,17 @@ using CryptoPP::uint64x2_p; inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b) { - return vec_xor(a, b); + return VecXor(a, b); } inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b) { - return vec_add(a, b); + return VecAdd(a, b); } inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b) { - return vec_sub(a, b); + return VecSub(a, b); } template @@ -479,7 +479,7 @@ inline uint32x4_p UnpackSIMD(const uint32x4_p& 
a, const uint32x4_p& b, const uin CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b); CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d); CRYPTOPP_ASSERT(0); - return vec_xor(a, a); + return VecXor(a, a); } template <> @@ -519,7 +519,7 @@ inline uint32x4_p UnpackSIMD(const uint32x4_p& v) { // Should not be instantiated CRYPTOPP_ASSERT(0); - return vec_xor(v, v); + return VecXor(v, v); } template <> @@ -527,7 +527,7 @@ inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v) { // Splat to all lanes const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0}; - return (uint32x4_p)vec_perm(v, v, m); + return (uint32x4_p)VecPermute(v, v, m); } template <> @@ -535,7 +535,7 @@ inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v) { // Splat to all lanes const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4}; - return (uint32x4_p)vec_perm(v, v, m); + return (uint32x4_p)VecPermute(v, v, m); } template <> @@ -543,7 +543,7 @@ inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v) { // Splat to all lanes const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8}; - return (uint32x4_p)vec_perm(v, v, m); + return (uint32x4_p)VecPermute(v, v, m); } template <> @@ -551,7 +551,7 @@ inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v) { // Splat to all lanes const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12}; - return (uint32x4_p)vec_perm(v, v, m); + return (uint32x4_p)VecPermute(v, v, m); } template diff --git a/ppc_simd.cpp b/ppc_simd.cpp index 99616c4f..7fcbedca 100644 --- a/ppc_simd.cpp +++ b/ppc_simd.cpp @@ -73,7 +73,7 @@ bool CPU_ProbeAltivec() // Specifically call the Altivec loads and stores const uint8x16_p v1 = (uint8x16_p)vec_ld(0, (byte*)b1); const uint8x16_p v2 = (uint8x16_p)vec_ld(0, (byte*)b2); - const uint8x16_p v3 = (uint8x16_p)vec_xor(v1, v2); + const uint8x16_p v3 = (uint8x16_p)VecXor(v1, v2); vec_st(v3, 0, b3); result = (0 == std::memcmp(b2, b3, 16)); diff --git a/ppc_simd.h b/ppc_simd.h index 7dcd37eb..d697f7e4 100644 --- a/ppc_simd.h +++ b/ppc_simd.h @@ -29,7 +29,7 @@ # undef bool #endif -// VectorLoad_ALTIVEC and VectorStore_ALTIVEC are +// VecLoad_ALTIVEC and VecStore_ALTIVEC are // too noisy on modern compilers #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE # pragma GCC diagnostic push @@ -49,14 +49,14 @@ typedef __vector unsigned int uint32x4_p; typedef __vector unsigned long long uint64x2_p; #endif // _ARCH_PWR8 -/// \brief Reverse a vector +/// \brief Reverse bytes in a vector /// \tparam T vector type /// \param src the vector /// \returns vector -/// \details Reverse() endian swaps the bytes in a vector +/// \details VecReverse() reverses the bytes in a vector /// \since Crypto++ 6.0 template -inline T Reverse(const T src) +inline T VecReverse(const T src) { const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; return (T)vec_perm(src, src, mask); @@ -67,16 +67,16 @@ inline T Reverse(const T src) /// \brief Loads a vector from a byte array /// \param src the byte array /// \details Loads a vector in native endian format from a byte array. -/// \details VectorLoad_ALTIVEC() uses vec_ld if the effective address +/// \details VecLoad_ALTIVEC() uses vec_ld if the effective address /// of dest is aligned, and uses vec_lvsl and vec_perm /// otherwise. /// vec_lvsl and vec_perm are relatively expensive so you should /// provide aligned memory adresses. -/// \details VectorLoad_ALTIVEC() is used automatically when POWER7 or above +/// \details VecLoad_ALTIVEC() is used automatically when POWER7 or above /// and unaligned loads is not available. 
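// Illustrative sketch (not part of this patch) of the vec_lvsl/vec_perm
// technique the surrounding comments describe for the classic big-endian
// Altivec path. The helper name is hypothetical.
inline uint32x4_p AltivecLoadUnaligned_Sketch(const byte* src)
{
    // vec_ld ignores the low four address bits, so fetch the two aligned
    // quadwords that straddle src and merge them with a permute whose
    // control vector vec_lvsl derives from the misalignment of src.
    const uint8x16_p perm = vec_lvsl(0, (byte*)src);
    const uint8x16_p low  = vec_ld(0, (byte*)src);
    const uint8x16_p high = vec_ld(15, (byte*)src);
    return (uint32x4_p)vec_perm(low, high, perm);
}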
-/// \note VectorLoad does not require an aligned array. +/// \note VecLoad does not require an aligned array. /// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad_ALTIVEC(const byte src[16]) +inline uint32x4_p VecLoad_ALTIVEC(const byte src[16]) { if (IsAlignedOn(src, 16)) { @@ -96,14 +96,14 @@ inline uint32x4_p VectorLoad_ALTIVEC(const byte src[16]) /// \param src the byte array /// \param off offset into the src byte array /// \details Loads a vector in native endian format from a byte array. -/// \details VectorLoad_ALTIVEC() uses vec_ld if the effective address +/// \details VecLoad_ALTIVEC() uses vec_ld if the effective address /// of dest is aligned, and uses vec_lvsl and vec_perm /// otherwise. /// vec_lvsl and vec_perm are relatively expensive so you should /// provide aligned memory adresses. -/// \note VectorLoad does not require an aligned array. +/// \note VecLoad does not require an aligned array. /// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad_ALTIVEC(int off, const byte src[16]) +inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16]) { if (IsAlignedOn(src, 16)) { @@ -122,14 +122,14 @@ inline uint32x4_p VectorLoad_ALTIVEC(int off, const byte src[16]) /// \brief Loads a vector from a byte array /// \param src the byte array /// \details Loads a vector in native endian format from a byte array. -/// \details VectorLoad uses POWER7's vec_xl or +/// \details VecLoad uses POWER7's vec_xl or /// vec_vsx_ld if available. The instructions do not require /// an aligned memory address. -/// \details VectorLoad_ALTIVEC() is used if POWER7 or above -/// is not available. VectorLoad_ALTIVEC() is relatively expensive. -/// \note VectorLoad does not require an aligned array. +/// \details VecLoad_ALTIVEC() is used if POWER7 or above +/// is not available. VecLoad_ALTIVEC() is relatively expensive. +/// \note VecLoad does not require an aligned array. /// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad(const byte src[16]) +inline uint32x4_p VecLoad(const byte src[16]) { #if defined(_ARCH_PWR7) # if defined(__xlc__) || defined(__xlC__) || defined(__clang__) @@ -138,7 +138,7 @@ inline uint32x4_p VectorLoad(const byte src[16]) return (uint32x4_p)vec_vsx_ld(0, (byte*)src); # endif #else - return VectorLoad_ALTIVEC(src); + return VecLoad_ALTIVEC(src); #endif } @@ -146,14 +146,14 @@ inline uint32x4_p VectorLoad(const byte src[16]) /// \param src the byte array /// \param off offset into the byte array /// \details Loads a vector in native endian format from a byte array. -/// \details VectorLoad uses POWER7's vec_xl or +/// \details VecLoad uses POWER7's vec_xl or /// vec_vsx_ld if available. The instructions do not require /// an aligned memory address. -/// \details VectorLoad_ALTIVEC() is used if POWER7 or above -/// is not available. VectorLoad_ALTIVEC() is relatively expensive. -/// \note VectorLoad does not require an aligned array. +/// \details VecLoad_ALTIVEC() is used if POWER7 or above +/// is not available. VecLoad_ALTIVEC() is relatively expensive. +/// \note VecLoad does not require an aligned array. 
/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad(int off, const byte src[16]) +inline uint32x4_p VecLoad(int off, const byte src[16]) { #if defined(_ARCH_PWR7) # if defined(__xlc__) || defined(__xlC__) || defined(__clang__) @@ -162,48 +162,48 @@ inline uint32x4_p VectorLoad(int off, const byte src[16]) return (uint32x4_p)vec_vsx_ld(off, (byte*)src); # endif #else - return VectorLoad_ALTIVEC(off, src); + return VecLoad_ALTIVEC(off, src); #endif } /// \brief Loads a vector from a byte array /// \param src the byte array /// \details Loads a vector in native endian format from a byte array. -/// \details VectorLoad uses POWER7's vec_xl or +/// \details VecLoad uses POWER7's vec_xl or /// vec_vsx_ld if available. The instructions do not require /// an aligned memory address. -/// \details VectorLoad_ALTIVEC() is used if POWER7 or above -/// is not available. VectorLoad_ALTIVEC() is relatively expensive. -/// \note VectorLoad does not require an aligned array. +/// \details VecLoad_ALTIVEC() is used if POWER7 or above +/// is not available. VecLoad_ALTIVEC() is relatively expensive. +/// \note VecLoad does not require an aligned array. /// \since Crypto++ 8.0 -inline uint32x4_p VectorLoad(const word32 src[4]) +inline uint32x4_p VecLoad(const word32 src[4]) { - return VectorLoad((const byte*)src); + return VecLoad((const byte*)src); } /// \brief Loads a vector from a byte array /// \param src the byte array /// \param off offset into the byte array /// \details Loads a vector in native endian format from a byte array. -/// \note VectorLoad does not require an aligned array. +/// \note VecLoad does not require an aligned array. /// \since Crypto++ 8.0 -inline uint32x4_p VectorLoad(int off, const word32 src[4]) +inline uint32x4_p VecLoad(int off, const word32 src[4]) { - return VectorLoad(off, (const byte*)src); + return VecLoad(off, (const byte*)src); } /// \brief Loads a vector from a byte array /// \param src the byte array /// \details Loads a vector in big endian format from a byte array. -/// VectorLoadBE will swap all bytes on little endian systems. -/// \details VectorLoadBE uses POWER7's vec_xl or +/// VecLoadBE will swap all bytes on little endian systems. +/// \details VecLoadBE uses POWER7's vec_xl or /// vec_vsx_ld if available. The instructions do not require /// an aligned memory address. -/// \details VectorLoad_ALTIVEC() is used if POWER7 or above -/// is not available. VectorLoad_ALTIVEC() is relatively expensive. -/// \note VectorLoadBE() does not require an aligned array. +/// \details VecLoad_ALTIVEC() is used if POWER7 or above +/// is not available. VecLoad_ALTIVEC() is relatively expensive. +/// \note VecLoadBE() does not require an aligned array. 
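// Minimal sketch (not from this patch) of what the big-endian load amounts
// to on a little-endian target: a native load followed by a full byte
// reversal, matching the VecReverse() mask defined earlier in this header.
// The helper name is hypothetical.
inline uint32x4_p LoadBE_Sketch(const byte src[16])
{
#if (CRYPTOPP_BIG_ENDIAN)
    return VecLoad(src);            // bytes are already in big-endian order
#else
    const uint8x16_p m = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    const uint32x4_p v = VecLoad(src);
    return (uint32x4_p)VecPermute(v, v, m);   // byte-reverse the vector
#endif
}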
/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoadBE(const byte src[16]) +inline uint32x4_p VecLoadBE(const byte src[16]) { #if defined(_ARCH_PWR7) # if defined(__xlc__) || defined(__xlC__) || defined(__clang__) @@ -212,14 +212,14 @@ inline uint32x4_p VectorLoadBE(const byte src[16]) # if (CRYPTOPP_BIG_ENDIAN) return (uint32x4_p)vec_vsx_ld(0, (byte*)src); # else - return (uint32x4_p)Reverse(vec_vsx_ld(0, (byte*)src)); + return (uint32x4_p)VecReverse(vec_vsx_ld(0, (byte*)src)); # endif # endif #else // _ARCH_PWR7 # if (CRYPTOPP_BIG_ENDIAN) - return (uint32x4_p)VectorLoad((const byte*)src); + return (uint32x4_p)VecLoad((const byte*)src); # else - return (uint32x4_p)Reverse(VectorLoad((const byte*)src)); + return (uint32x4_p)VecReverse(VecLoad((const byte*)src)); # endif #endif // _ARCH_PWR7 } @@ -228,15 +228,15 @@ inline uint32x4_p VectorLoadBE(const byte src[16]) /// \param src the byte array /// \param off offset into the src byte array /// \details Loads a vector in big endian format from a byte array. -/// VectorLoadBE will swap all bytes on little endian systems. -/// \details VectorLoadBE uses POWER7's vec_xl or +/// VecLoadBE will swap all bytes on little endian systems. +/// \details VecLoadBE uses POWER7's vec_xl or /// vec_vsx_ld if available. The instructions do not require /// an aligned memory address. -/// \details VectorLoad_ALTIVEC() is used if POWER7 or above -/// is not available. VectorLoad_ALTIVEC() is relatively expensive. -/// \note VectorLoadBE does not require an aligned array. +/// \details VecLoad_ALTIVEC() is used if POWER7 or above +/// is not available. VecLoad_ALTIVEC() is relatively expensive. +/// \note VecLoadBE does not require an aligned array. /// \since Crypto++ 6.0 -inline uint32x4_p VectorLoadBE(int off, const byte src[16]) +inline uint32x4_p VecLoadBE(int off, const byte src[16]) { #if defined(_ARCH_PWR7) # if defined(__xlc__) || defined(__xlC__) || defined(__clang__) @@ -245,14 +245,14 @@ inline uint32x4_p VectorLoadBE(int off, const byte src[16]) # if (CRYPTOPP_BIG_ENDIAN) return (uint32x4_p)vec_vsx_ld(off, (byte*)src); # else - return (uint32x4_p)Reverse(vec_vsx_ld(off, (byte*)src)); + return (uint32x4_p)VecReverse(vec_vsx_ld(off, (byte*)src)); # endif # endif #else // _ARCH_PWR7 # if (CRYPTOPP_BIG_ENDIAN) - return (uint32x4_p)VectorLoad(off, (const byte*)src); + return (uint32x4_p)VecLoad(off, (const byte*)src); # else - return (uint32x4_p)Reverse(VectorLoad(off, (const byte*)src)); + return (uint32x4_p)VecReverse(VecLoad(off, (const byte*)src)); # endif #endif // _ARCH_PWR7 } @@ -264,16 +264,16 @@ inline uint32x4_p VectorLoadBE(int off, const byte src[16]) /// \param data the vector /// \param dest the byte array /// \details Stores a vector in native endian format to a byte array. -/// \details VectorStore_ALTIVEC() uses vec_st if the effective address +/// \details VecStore_ALTIVEC() uses vec_st if the effective address /// of dest is aligned, and uses vec_ste otherwise. /// vec_ste is relatively expensive so you should provide aligned /// memory adresses. -/// \details VectorStore_ALTIVEC() is used automatically when POWER7 or above +/// \details VecStore_ALTIVEC() is used automatically when POWER7 or above /// and unaligned loads is not available. -/// \note VectorStore does not require an aligned array. +/// \note VecStore does not require an aligned array. 
/// \since Crypto++ 8.0 template -inline void VectorStore_ALTIVEC(const T data, byte dest[16]) +inline void VecStore_ALTIVEC(const T data, byte dest[16]) { if (IsAlignedOn(dest, 16)) { @@ -300,16 +300,16 @@ inline void VectorStore_ALTIVEC(const T data, byte dest[16]) /// \param off the byte offset into the array /// \param dest the byte array /// \details Stores a vector in native endian format to a byte array. -/// \details VectorStore_ALTIVEC() uses vec_st if the effective address +/// \details VecStore_ALTIVEC() uses vec_st if the effective address /// of dest is aligned, and uses vec_ste otherwise. /// vec_ste is relatively expensive so you should provide aligned /// memory adresses. -/// \details VectorStore_ALTIVEC() is used automatically when POWER7 or above +/// \details VecStore_ALTIVEC() is used automatically when POWER7 or above /// and unaligned loads is not available. -/// \note VectorStore does not require an aligned array. +/// \note VecStore does not require an aligned array. /// \since Crypto++ 8.0 template -inline void VectorStore_ALTIVEC(const T data, int off, byte dest[16]) +inline void VecStore_ALTIVEC(const T data, int off, byte dest[16]) { if (IsAlignedOn(dest, 16)) { @@ -335,15 +335,15 @@ inline void VectorStore_ALTIVEC(const T data, int off, byte dest[16]) /// \param data the vector /// \param dest the byte array /// \details Stores a vector in native endian format to a byte array. -/// \details VectorStore uses POWER7's vec_xst or +/// \details VecStore uses POWER7's vec_xst or /// vec_vsx_st if available. The instructions do not require /// an aligned memory address. -/// \details VectorStore_ALTIVEC() is used if POWER7 or above -/// is not available. VectorStore_ALTIVEC() is relatively expensive. -/// \note VectorStore does not require an aligned array. +/// \details VecStore_ALTIVEC() is used if POWER7 or above +/// is not available. VecStore_ALTIVEC() is relatively expensive. +/// \note VecStore does not require an aligned array. /// \since Crypto++ 6.0 template -inline void VectorStore(const T data, byte dest[16]) +inline void VecStore(const T data, byte dest[16]) { #if defined(_ARCH_PWR7) # if defined(__xlc__) || defined(__xlC__) || defined(__clang__) @@ -352,7 +352,7 @@ inline void VectorStore(const T data, byte dest[16]) vec_vsx_st((uint8x16_p)data, 0, (byte*)dest); # endif #else - return VectorStore_ALTIVEC(data, 0, dest); + return VecStore_ALTIVEC(data, 0, dest); #endif } @@ -362,15 +362,15 @@ inline void VectorStore(const T data, byte dest[16]) /// \param off the byte offset into the array /// \param dest the byte array /// \details Stores a vector in native endian format to a byte array. -/// \details VectorStore uses POWER7's vec_xst or +/// \details VecStore uses POWER7's vec_xst or /// vec_vsx_st if available. The instructions do not require /// an aligned memory address. -/// \details VectorStore_ALTIVEC() is used if POWER7 or above -/// is not available. VectorStore_ALTIVEC() is relatively expensive. -/// \note VectorStore does not require an aligned array. +/// \details VecStore_ALTIVEC() is used if POWER7 or above +/// is not available. VecStore_ALTIVEC() is relatively expensive. +/// \note VecStore does not require an aligned array. 
/// \since Crypto++ 6.0 template -inline void VectorStore(const T data, int off, byte dest[16]) +inline void VecStore(const T data, int off, byte dest[16]) { #if defined(_ARCH_PWR7) # if defined(__xlc__) || defined(__xlC__) || defined(__clang__) @@ -379,7 +379,7 @@ inline void VectorStore(const T data, int off, byte dest[16]) vec_vsx_st((uint8x16_p)data, off, (byte*)dest); # endif #else - return VectorStore_ALTIVEC(data, off, dest); + return VecStore_ALTIVEC(data, off, dest); #endif } @@ -388,17 +388,17 @@ inline void VectorStore(const T data, int off, byte dest[16]) /// \param data the vector /// \param dest the byte array /// \details Stores a vector in native endian format to a byte array. -/// \details VectorStore uses POWER7's vec_xst or +/// \details VecStore uses POWER7's vec_xst or /// vec_vsx_st if available. The instructions do not require /// an aligned memory address. -/// \details VectorStore_ALTIVEC() is used if POWER7 or above -/// is not available. VectorStore_ALTIVEC() is relatively expensive. -/// \note VectorStore does not require an aligned array. +/// \details VecStore_ALTIVEC() is used if POWER7 or above +/// is not available. VecStore_ALTIVEC() is relatively expensive. +/// \note VecStore does not require an aligned array. /// \since Crypto++ 8.0 template -inline void VectorStore(const T data, word32 dest[4]) +inline void VecStore(const T data, word32 dest[4]) { - VectorStore((uint8x16_p)data, 0, (byte*)dest); + VecStore((uint8x16_p)data, 0, (byte*)dest); } /// \brief Stores a vector to a word array @@ -407,17 +407,17 @@ inline void VectorStore(const T data, word32 dest[4]) /// \param off the byte offset into the array /// \param dest the byte array /// \details Stores a vector in native endian format to a byte array. -/// \details VectorStore uses POWER7's vec_xst or +/// \details VecStore uses POWER7's vec_xst or /// vec_vsx_st if available. The instructions do not require /// an aligned memory address. -/// \details VectorStore_ALTIVEC() is used if POWER7 or above -/// is not available. VectorStore_ALTIVEC() is relatively expensive. -/// \note VectorStore does not require an aligned array. +/// \details VecStore_ALTIVEC() is used if POWER7 or above +/// is not available. VecStore_ALTIVEC() is relatively expensive. +/// \note VecStore does not require an aligned array. /// \since Crypto++ 8.0 template -inline void VectorStore(const T data, int off, word32 dest[4]) +inline void VecStore(const T data, int off, word32 dest[4]) { - VectorStore((uint8x16_p)data, off, (byte*)dest); + VecStore((uint8x16_p)data, off, (byte*)dest); } /// \brief Stores a vector to a byte array @@ -425,16 +425,16 @@ inline void VectorStore(const T data, int off, word32 dest[4]) /// \param src the vector /// \param dest the byte array /// \details Stores a vector in big endian format to a byte array. -/// VectorStoreBE will swap all bytes on little endian systems. -/// \details VectorStoreBE uses POWER7's vec_xst or +/// VecStoreBE will swap all bytes on little endian systems. +/// \details VecStoreBE uses POWER7's vec_xst or /// vec_vsx_st if available. The instructions do not require /// an aligned memory address. -/// \details VectorStore_ALTIVEC() is used if POWER7 or above -/// is not available. VectorStore_ALTIVEC() is relatively expensive. -/// \note VectorStoreBE does not require an aligned array. +/// \details VecStore_ALTIVEC() is used if POWER7 or above +/// is not available. VecStore_ALTIVEC() is relatively expensive. +/// \note VecStoreBE does not require an aligned array. 
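// Usage sketch (not from this patch): loading and storing through the BE
// variants yields the same 16 bytes on big- and little-endian hosts alike,
// which is how the block-cipher code elsewhere in this patch keeps a stable
// wire format. The helper name is hypothetical.
inline void RoundTripBE_Sketch(const byte in[16], byte out[16])
{
    const uint32x4_p v = VecLoadBE(in);   // interpret the bytes as big-endian
    VecStoreBE(v, out);                   // write them back in big-endian order
}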
/// \since Crypto++ 6.0 template -inline void VectorStoreBE(const T src, byte dest[16]) +inline void VecStoreBE(const T src, byte dest[16]) { #if defined(_ARCH_PWR7) # if defined(__xlc__) || defined(__xlC__) || defined(__clang__) @@ -443,14 +443,14 @@ inline void VectorStoreBE(const T src, byte dest[16]) # if (CRYPTOPP_BIG_ENDIAN) vec_vsx_st((uint8x16_p)src, 0, (byte*)dest); # else - vec_vsx_st((uint8x16_p)Reverse(src), 0, (byte*)dest); + vec_vsx_st((uint8x16_p)VecReverse(src), 0, (byte*)dest); # endif # endif #else // _ARCH_PWR7 # if (CRYPTOPP_BIG_ENDIAN) - VectorStore((uint8x16_p)src, (byte*)dest); + VecStore((uint8x16_p)src, (byte*)dest); # else - VectorStore((uint8x16_p)Reverse(src), (byte*)dest); + VecStore((uint8x16_p)VecReverse(src), (byte*)dest); # endif #endif // _ARCH_PWR7 } @@ -461,16 +461,16 @@ inline void VectorStoreBE(const T src, byte dest[16]) /// \param off offset into the dest byte array /// \param dest the byte array /// \details Stores a vector in big endian format to a byte array. -/// VectorStoreBE will swap all bytes on little endian systems. -/// \details VectorStoreBE uses POWER7's vec_xst or +/// VecStoreBE will swap all bytes on little endian systems. +/// \details VecStoreBE uses POWER7's vec_xst or /// vec_vsx_st if available. The instructions do not require /// an aligned memory address. -/// \details VectorStore_ALTIVEC() is used if POWER7 or above -/// is not available. VectorStore_ALTIVEC() is relatively expensive. -/// \note VectorStoreBE does not require an aligned array. +/// \details VecStore_ALTIVEC() is used if POWER7 or above +/// is not available. VecStore_ALTIVEC() is relatively expensive. +/// \note VecStoreBE does not require an aligned array. /// \since Crypto++ 6.0 template -inline void VectorStoreBE(const T src, int off, byte dest[16]) +inline void VecStoreBE(const T src, int off, byte dest[16]) { #if defined(_ARCH_PWR7) # if defined(__xlc__) || defined(__xlC__) || defined(__clang__) @@ -479,14 +479,14 @@ inline void VectorStoreBE(const T src, int off, byte dest[16]) # if (CRYPTOPP_BIG_ENDIAN) vec_vsx_st((uint8x16_p)src, off, (byte*)dest); # else - vec_vsx_st((uint8x16_p)Reverse(src), off, (byte*)dest); + vec_vsx_st((uint8x16_p)VecReverse(src), off, (byte*)dest); # endif # endif #else // _ARCH_PWR7 # if (CRYPTOPP_BIG_ENDIAN) - VectorStore((uint8x16_p)src, off, (byte*)dest); + VecStore((uint8x16_p)src, off, (byte*)dest); # else - VectorStore((uint8x16_p)Reverse(src), off, (byte*)dest); + VecStore((uint8x16_p)VecReverse(src), off, (byte*)dest); # endif #endif // _ARCH_PWR7 } @@ -498,12 +498,12 @@ inline void VectorStoreBE(const T src, int off, byte dest[16]) /// \param vec the vector /// \param mask vector mask /// \returns vector -/// \details VectorPermute returns a new vector from vec based on +/// \details VecPermute returns a new vector from vec based on /// mask. mask is an uint8x16_p type vector. The return /// vector is the same type as vec. /// \since Crypto++ 6.0 template -inline T1 VectorPermute(const T1 vec, const T2 mask) +inline T1 VecPermute(const T1 vec, const T2 mask) { return (T1)vec_perm(vec, vec, (uint8x16_p)mask); } @@ -515,12 +515,12 @@ inline T1 VectorPermute(const T1 vec, const T2 mask) /// \param vec2 the second vector /// \param mask vector mask /// \returns vector -/// \details VectorPermute returns a new vector from vec1 and vec2 +/// \details VecPermute returns a new vector from vec1 and vec2 /// based on mask. mask is an uint8x16_p type vector. The return /// vector is the same type as vec1. 
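// Small example (not from this patch) of the two-input form: mask indices
// 0..15 select bytes from the first operand and 16..31 from the second, so
// the mask below concatenates bytes 8..15 of vec1 with bytes 0..7 of vec2.
// The helper name is hypothetical.
inline uint32x4_p MergeHalves_Sketch(const uint32x4_p vec1, const uint32x4_p vec2)
{
    const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 16,17,18,19, 20,21,22,23};
    return VecPermute(vec1, vec2, mask);
}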
/// \since Crypto++ 6.0 template -inline T1 VectorPermute(const T1 vec1, const T1 vec2, const T2 mask) +inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask) { return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask); } @@ -531,11 +531,11 @@ inline T1 VectorPermute(const T1 vec1, const T1 vec2, const T2 mask) /// \param vec1 the first vector /// \param vec2 the second vector /// \returns vector -/// \details VectorAnd returns a new vector from vec1 and vec2. The return +/// \details VecAnd returns a new vector from vec1 and vec2. The return /// vector is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorAnd(const T1 vec1, const T2 vec2) +inline T1 VecAnd(const T1 vec1, const T2 vec2) { return (T1)vec_and(vec1, (T1)vec2); } @@ -546,11 +546,11 @@ inline T1 VectorAnd(const T1 vec1, const T2 vec2) /// \param vec1 the first vector /// \param vec2 the second vector /// \returns vector -/// \details VectorOr returns a new vector from vec1 and vec2. The return +/// \details VecOr returns a new vector from vec1 and vec2. The return /// vector is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorOr(const T1 vec1, const T2 vec2) +inline T1 VecOr(const T1 vec1, const T2 vec2) { return (T1)vec_or(vec1, (T1)vec2); } @@ -561,11 +561,11 @@ inline T1 VectorOr(const T1 vec1, const T2 vec2) /// \param vec1 the first vector /// \param vec2 the second vector /// \returns vector -/// \details VectorXor returns a new vector from vec1 and vec2. The return +/// \details VecXor returns a new vector from vec1 and vec2. The return /// vector is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorXor(const T1 vec1, const T2 vec2) +inline T1 VecXor(const T1 vec1, const T2 vec2) { return (T1)vec_xor(vec1, (T1)vec2); } @@ -576,12 +576,12 @@ inline T1 VectorXor(const T1 vec1, const T2 vec2) /// \param vec1 the first vector /// \param vec2 the second vector /// \returns vector -/// \details VectorAdd returns a new vector from vec1 and vec2. +/// \details VecAdd returns a new vector from vec1 and vec2. /// vec2 is cast to the same type as vec1. The return vector /// is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorAdd(const T1 vec1, const T2 vec2) +inline T1 VecAdd(const T1 vec1, const T2 vec2) { return (T1)vec_add(vec1, (T1)vec2); } @@ -591,12 +591,12 @@ inline T1 VectorAdd(const T1 vec1, const T2 vec2) /// \tparam T2 vector type /// \param vec1 the first vector /// \param vec2 the second vector -/// \details VectorSub returns a new vector from vec1 and vec2. +/// \details VecSub returns a new vector from vec1 and vec2. /// vec2 is cast to the same type as vec1. The return vector /// is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorSub(const T1 vec1, const T2 vec2) +inline T1 VecSub(const T1 vec1, const T2 vec2) { return (T1)vec_sub(vec1, (T1)vec2); } @@ -607,10 +607,10 @@ inline T1 VectorSub(const T1 vec1, const T2 vec2) /// \param vec1 the first vector /// \param vec2 the second vector /// \returns vector -/// \details VectorAdd64 returns a new vector from vec1 and vec2. +/// \details VecAdd64 returns a new vector from vec1 and vec2. /// vec1 and vec2 are added as uint64x2_p quantities. 
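// Usage sketch (not from this patch) showing why the 64-bit add matters:
// the operands are uint32x4_p, but the addition runs on two 64-bit lanes,
// so a carry out of the low 32-bit word of each lane is not lost. The
// values and helper name are illustrative only.
inline void Add64_Sketch()
{
    const uint32x4_p a = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
    const uint32x4_p b = {1, 1, 1, 1};
    const uint32x4_p s64 = VecAdd64(a, b);  // each 64-bit lane is 0x0000000100000000
    const uint32x4_p s32 = VecAdd(a, b);    // each 32-bit word wraps to 0
    CRYPTOPP_UNUSED(s64); CRYPTOPP_UNUSED(s32);
}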
/// \since Crypto++ 8.0 -inline uint32x4_p VectorAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2) +inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2) { #if defined(_ARCH_PWR8) return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2); @@ -632,22 +632,22 @@ inline uint32x4_p VectorAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2) /// \tparam T vector type /// \param vec the vector /// \returns vector -/// \details VectorShiftLeftOctet() returns a new vector after shifting the +/// \details VecShiftLeftOctet() returns a new vector after shifting the /// concatenation of the zero vector and the source vector by the specified /// number of bytes. The return vector is the same type as vec. -/// \details On big endian machines VectorShiftLeftOctet() is vec_sld(a, z, -/// c). On little endian machines VectorShiftLeftOctet() is translated to +/// \details On big endian machines VecShiftLeftOctet() is vec_sld(a, z, +/// c). On little endian machines VecShiftLeftOctet() is translated to /// vec_sld(z, a, 16-c). You should always call the function as /// if on a big endian machine as shown below. ///
-///    uint8x16_p x = VectorLoad(ptr);
-///    uint8x16_p y = VectorShiftLeftOctet<12>(x);
+///    uint8x16_p x = VecLoad(ptr);
+///    uint8x16_p y = VecShiftLeftOctet<12>(x);
 /// </pre>
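// Sketch (not from this patch) of the endian handling described above for a
// fixed 4-byte shift: callers reason in big-endian terms, and on little-endian
// targets the operands are swapped and the count reflected, per the vec_sld
// mapping in the comments. The helper name is hypothetical.
template <class T>
inline T ShiftLeftOctet4_Sketch(const T vec)
{
    const T zero = {0};
#if (CRYPTOPP_BIG_ENDIAN)
    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, 4);     // vec_sld(a, z, c)
#else
    return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, 16-4);  // vec_sld(z, a, 16-c)
#endif
}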
/// \sa Is vec_sld /// endian sensitive? on Stack Overflow /// \since Crypto++ 6.0 template -inline T VectorShiftLeftOctet(const T vec) +inline T VecShiftLeftOctet(const T vec) { const T zero = {0}; if (C >= 16) @@ -675,22 +675,22 @@ inline T VectorShiftLeftOctet(const T vec) /// \tparam T vector type /// \param vec the vector /// \returns vector -/// \details VectorShiftRightOctet() returns a new vector after shifting the +/// \details VecShiftRightOctet() returns a new vector after shifting the /// concatenation of the zero vector and the source vector by the specified /// number of bytes. The return vector is the same type as vec. -/// \details On big endian machines VectorShiftRightOctet() is vec_sld(a, z, -/// c). On little endian machines VectorShiftRightOctet() is translated to +/// \details On big endian machines VecShiftRightOctet() is vec_sld(a, z, +/// c). On little endian machines VecShiftRightOctet() is translated to /// vec_sld(z, a, 16-c). You should always call the function as /// if on a big endian machine as shown below. ///
-///    uint8x16_p x = VectorLoad(ptr);
-///    uint8x16_p y = VectorShiftRightOctet<12>(y);
+///    uint8x16_p x = VecLoad(ptr);
+///    uint8x16_p y = VecShiftRightOctet<12>(y);
 /// </pre>
/// \sa Is vec_sld /// endian sensitive? on Stack Overflow /// \since Crypto++ 6.0 template -inline T VectorShiftRightOctet(const T vec) +inline T VecShiftRightOctet(const T vec) { const T zero = {0}; if (C >= 16) @@ -718,14 +718,14 @@ inline T VectorShiftRightOctet(const T vec) /// \tparam T vector type /// \param vec the vector /// \returns vector -/// \details VectorRotateLeftOctet() returns a new vector after rotating the +/// \details VecRotateLeftOctet() returns a new vector after rotating the /// concatenation of the source vector with itself by the specified /// number of bytes. The return vector is the same type as vec. /// \sa Is vec_sld /// endian sensitive? on Stack Overflow /// \since Crypto++ 6.0 template -inline T VectorRotateLeftOctet(const T vec) +inline T VecRotateLeftOctet(const T vec) { enum { R = C&0xf }; #if (CRYPTOPP_BIG_ENDIAN) @@ -740,14 +740,14 @@ inline T VectorRotateLeftOctet(const T vec) /// \tparam T vector type /// \param vec the vector /// \returns vector -/// \details VectorRotateRightOctet() returns a new vector after rotating the +/// \details VecRotateRightOctet() returns a new vector after rotating the /// concatenation of the source vector with itself by the specified /// number of bytes. The return vector is the same type as vec. /// \sa Is vec_sld /// endian sensitive? on Stack Overflow /// \since Crypto++ 6.0 template -inline T VectorRotateRightOctet(const T vec) +inline T VecRotateRightOctet(const T vec) { enum { R = C&0xf }; #if (CRYPTOPP_BIG_ENDIAN) @@ -761,9 +761,9 @@ inline T VectorRotateRightOctet(const T vec) /// \tparam C shift bit count /// \param vec the vector /// \returns vector -/// \details VectorRotateLeft rotates each element in a packed vector by bit count. +/// \details VecRotateLeft rotates each element in a packed vector by bit count. template -inline uint32x4_p VectorRotateLeft(const uint32x4_p vec) +inline uint32x4_p VecRotateLeft(const uint32x4_p vec) { const uint32x4_p m = {C, C, C, C}; return vec_rl(vec, m); @@ -773,9 +773,9 @@ inline uint32x4_p VectorRotateLeft(const uint32x4_p vec) /// \tparam C shift bit count /// \param vec the vector /// \returns vector -/// \details VectorRotateRight rotates each element in a packed vector by bit count. +/// \details VecRotateRight rotates each element in a packed vector by bit count. template -inline uint32x4_p VectorRotateRight(const uint32x4_p vec) +inline uint32x4_p VecRotateRight(const uint32x4_p vec) { const uint32x4_p m = {32-C, 32-C, 32-C, 32-C}; return vec_rl(vec, m); @@ -787,7 +787,7 @@ inline uint32x4_p VectorRotateRight(const uint32x4_p vec) /// \returns vector /// \since Crypto++ 7.0 template -inline T VectorSwapWords(const T vec) +inline T VecSwapWords(const T vec) { return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8); } @@ -796,34 +796,34 @@ inline T VectorSwapWords(const T vec) /// \tparam T vector type /// \param val the vector /// \returns vector created from low dword -/// \details VectorGetLow() extracts the low dword from a vector. The low dword +/// \details VecGetLow() extracts the low dword from a vector. The low dword /// is composed of the least significant bits and occupies bytes 8 through 15 /// when viewed as a big endian array. The return vector is the same type as /// the original vector and padded with 0's in the most significant bit positions. 
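// Worked example (not from this patch; values and helper name illustrative).
// Viewed as a big-endian byte array the high dword sits in bytes 0..7 and the
// low dword in bytes 8..15; each extraction ends up in the low position,
// zero-padded above.
inline void GetLowHigh_Sketch()
{
    const uint32x4_p val  = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
    const uint32x4_p high = VecGetHigh(val);  // {0, 0, 0x00010203, 0x04050607}
    const uint32x4_p low  = VecGetLow(val);   // {0, 0, 0x08090a0b, 0x0c0d0e0f}
    CRYPTOPP_UNUSED(high); CRYPTOPP_UNUSED(low);
}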
template -inline T VectorGetLow(const T val) +inline T VecGetLow(const T val) { //const T zero = {0}; //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 }; //return (T)vec_perm(zero, val, mask); - return VectorShiftRightOctet<8>(VectorShiftLeftOctet<8>(val)); + return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val)); } /// \brief Extract a dword from a vector /// \tparam T vector type /// \param val the vector /// \returns vector created from high dword -/// \details VectorGetHigh() extracts the high dword from a vector. The high dword +/// \details VecGetHigh() extracts the high dword from a vector. The high dword /// is composed of the most significant bits and occupies bytes 0 through 7 /// when viewed as a big endian array. The return vector is the same type as /// the original vector and padded with 0's in the most significant bit positions. template -inline T VectorGetHigh(const T val) +inline T VecGetHigh(const T val) { //const T zero = {0}; //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 }; //return (T)vec_perm(zero, val, mask); - return VectorShiftRightOctet<8>(val); + return VecShiftRightOctet<8>(val); } /// \brief Compare two vectors @@ -833,7 +833,7 @@ inline T VectorGetHigh(const T val) /// \param vec2 the second vector /// \returns true if vec1 equals vec2, false otherwise template -inline bool VectorEqual(const T1 vec1, const T2 vec2) +inline bool VecEqual(const T1 vec1, const T2 vec2) { return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2); } @@ -845,7 +845,7 @@ inline bool VectorEqual(const T1 vec1, const T2 vec2) /// \param vec2 the second vector /// \returns true if vec1 does not equal vec2, false otherwise template -inline bool VectorNotEqual(const T1 vec1, const T2 vec2) +inline bool VecNotEqual(const T1 vec1, const T2 vec2) { return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2); } @@ -859,11 +859,11 @@ inline bool VectorNotEqual(const T1 vec1, const T2 vec2) /// \tparam T2 vector type /// \param state the state vector /// \param key the subkey vector -/// \details VectorEncrypt performs one round of AES encryption of state +/// \details VecEncrypt performs one round of AES encryption of state /// using subkey key. The return vector is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorEncrypt(const T1 state, const T2 key) +inline T1 VecEncrypt(const T1 state, const T2 key) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key); @@ -879,11 +879,11 @@ inline T1 VectorEncrypt(const T1 state, const T2 key) /// \tparam T2 vector type /// \param state the state vector /// \param key the subkey vector -/// \details VectorEncryptLast performs the final round of AES encryption +/// \details VecEncryptLast performs the final round of AES encryption /// of state using subkey key. The return vector is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorEncryptLast(const T1 state, const T2 key) +inline T1 VecEncryptLast(const T1 state, const T2 key) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key); @@ -899,11 +899,11 @@ inline T1 VectorEncryptLast(const T1 state, const T2 key) /// \tparam T2 vector type /// \param state the state vector /// \param key the subkey vector -/// \details VectorDecrypt performs one round of AES decryption of state +/// \details VecDecrypt performs one round of AES decryption of state /// using subkey key. 
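// Hedged sketch (not from this patch) of a full encryption pass built from
// VecEncrypt/VecEncryptLast, mirroring the POWER8_Enc_Block pattern in
// rijndael_simd.cpp further down. The 16-bytes-per-round key layout and the
// helper name are assumptions for illustration.
inline uint32x4_p EncryptBlock_Sketch(uint32x4_p block, const byte* keys, unsigned int rounds)
{
    block = VecXor(block, VecLoad(0, keys));                  // whitening key
    for (unsigned int i = 1; i < rounds; ++i)
        block = VecEncrypt(block, VecLoad(i*16, keys));       // one vcipher round
    return VecEncryptLast(block, VecLoad(rounds*16, keys));   // vcipherlast
}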
The return vector is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorDecrypt(const T1 state, const T2 key) +inline T1 VecDecrypt(const T1 state, const T2 key) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key); @@ -919,11 +919,11 @@ inline T1 VectorDecrypt(const T1 state, const T2 key) /// \tparam T2 vector type /// \param state the state vector /// \param key the subkey vector -/// \details VectorDecryptLast performs the final round of AES decryption +/// \details VecDecryptLast performs the final round of AES decryption /// of state using subkey key. The return vector is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorDecryptLast(const T1 state, const T2 key) +inline T1 VecDecryptLast(const T1 state, const T2 key) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key); @@ -939,11 +939,11 @@ inline T1 VectorDecryptLast(const T1 state, const T2 key) /// \tparam subfunc sub-function /// \tparam T vector type /// \param vec the block to transform -/// \details VectorSHA256 selects sigma0, sigma1, Sigma0, Sigma1 based on +/// \details VecSHA256 selects sigma0, sigma1, Sigma0, Sigma1 based on /// func and subfunc. The return vector is the same type as vec. /// \since Crypto++ 6.0 template -inline T VectorSHA256(const T vec) +inline T VecSHA256(const T vec) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (T)__vshasigmaw((uint32x4_p)vec, func, subfunc); @@ -959,11 +959,11 @@ inline T VectorSHA256(const T vec) /// \tparam subfunc sub-function /// \tparam T vector type /// \param vec the block to transform -/// \details VectorSHA512 selects sigma0, sigma1, Sigma0, Sigma1 based on +/// \details VecSHA512 selects sigma0, sigma1, Sigma0, Sigma1 based on /// func and subfunc. The return vector is the same type as vec. 
/// \since Crypto++ 6.0 template -inline T VectorSHA512(const T vec) +inline T VecSHA512(const T vec) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (T)__vshasigmad((uint64x2_p)vec, func, subfunc); diff --git a/rijndael_simd.cpp b/rijndael_simd.cpp index 3d122d70..f4c9a288 100644 --- a/rijndael_simd.cpp +++ b/rijndael_simd.cpp @@ -214,12 +214,12 @@ bool CPU_ProbePower8() #if defined(__xlc__) || defined(__xlC__) const uint64x2_p v1 = (uint64x2_p)vec_xl(0, (byte*)w1); const uint64x2_p v2 = (uint64x2_p)vec_xl(0, (byte*)w2); - const uint64x2_p v3 = vec_add(v1, v2); // 64-bit add + const uint64x2_p v3 = VecAdd(v1, v2); // 64-bit add vec_xst((uint8x16_p)v3, 0, (byte*)w3); #else const uint64x2_p v1 = (uint64x2_p)vec_vsx_ld(0, (byte*)w1); const uint64x2_p v2 = (uint64x2_p)vec_vsx_ld(0, (byte*)w2); - const uint64x2_p v3 = vec_add(v1, v2); // 64-bit add + const uint64x2_p v3 = VecAdd(v1, v2); // 64-bit add vec_vsx_st((uint8x16_p)v3, 0, (byte*)w3); #endif @@ -265,13 +265,13 @@ bool CPU_ProbeAES() 0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08}; byte r[16] = {255}, z[16] = {}; - uint8x16_p k = (uint8x16_p)VectorLoad(0, key); - uint8x16_p s = (uint8x16_p)VectorLoad(0, state); - s = VectorEncrypt(s, k); - s = VectorEncryptLast(s, k); - s = VectorDecrypt(s, k); - s = VectorDecryptLast(s, k); - VectorStore(s, r); + uint8x16_p k = (uint8x16_p)VecLoad(0, key); + uint8x16_p s = (uint8x16_p)VecLoad(0, state); + s = VecEncrypt(s, k); + s = VecEncryptLast(s, k); + s = VecDecrypt(s, k); + s = VecDecryptLast(s, k); + VecStore(s, r); result = (0 != std::memcmp(r, z, 16)); } @@ -697,17 +697,17 @@ static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, un CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); const byte *keys = reinterpret_cast(subkeys); - uint32x4_p k = VectorLoad(keys); - block = VectorXor(block, k); + uint32x4_p k = VecLoad(keys); + block = VecXor(block, k); for (size_t i=1; i(subkeys); - uint32x4_p k = VectorLoad(keys); - block0 = VectorXor(block0, k); - block1 = VectorXor(block1, k); - block2 = VectorXor(block2, k); - block3 = VectorXor(block3, k); - block4 = VectorXor(block4, k); - block5 = VectorXor(block5, k); + uint32x4_p k = VecLoad(keys); + block0 = VecXor(block0, k); + block1 = VecXor(block1, k); + block2 = VecXor(block2, k); + block3 = VecXor(block3, k); + block4 = VecXor(block4, k); + block5 = VecXor(block5, k); for (size_t i=1; i(subkeys); - uint32x4_p k = VectorLoad(rounds*16, keys); - block = VectorXor(block, k); + uint32x4_p k = VecLoad(rounds*16, keys); + block = VecXor(block, k); for (size_t i=rounds-1; i>1; i-=2) { - block = VectorDecrypt(block, VectorLoad( i*16, keys)); - block = VectorDecrypt(block, VectorLoad((i-1)*16, keys)); + block = VecDecrypt(block, VecLoad( i*16, keys)); + block = VecDecrypt(block, VecLoad((i-1)*16, keys)); } - block = VectorDecrypt(block, VectorLoad(16, keys)); - block = VectorDecryptLast(block, VectorLoad(0, keys)); + block = VecDecrypt(block, VecLoad(16, keys)); + block = VecDecryptLast(block, VecLoad(0, keys)); } static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, @@ -770,32 +770,32 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); const byte *keys = reinterpret_cast(subkeys); - uint32x4_p k = VectorLoad(rounds*16, keys); - block0 = VectorXor(block0, k); - block1 = VectorXor(block1, k); - block2 = VectorXor(block2, k); - block3 = VectorXor(block3, k); - block4 = VectorXor(block4, k); - block5 = VectorXor(block5, 
k); + uint32x4_p k = VecLoad(rounds*16, keys); + block0 = VecXor(block0, k); + block1 = VecXor(block1, k); + block2 = VecXor(block2, k); + block3 = VecXor(block3, k); + block4 = VecXor(block4, k); + block5 = VecXor(block5, k); for (size_t i=rounds-1; i>0; --i) { - k = VectorLoad(i*16, keys); - block0 = VectorDecrypt(block0, k); - block1 = VectorDecrypt(block1, k); - block2 = VectorDecrypt(block2, k); - block3 = VectorDecrypt(block3, k); - block4 = VectorDecrypt(block4, k); - block5 = VectorDecrypt(block5, k); + k = VecLoad(i*16, keys); + block0 = VecDecrypt(block0, k); + block1 = VecDecrypt(block1, k); + block2 = VecDecrypt(block2, k); + block3 = VecDecrypt(block3, k); + block4 = VecDecrypt(block4, k); + block5 = VecDecrypt(block5, k); } - k = VectorLoad(0, keys); - block0 = VectorDecryptLast(block0, k); - block1 = VectorDecryptLast(block1, k); - block2 = VectorDecryptLast(block2, k); - block3 = VectorDecryptLast(block3, k); - block4 = VectorDecryptLast(block4, k); - block5 = VectorDecryptLast(block5, k); + k = VecLoad(0, keys); + block0 = VecDecryptLast(block0, k); + block1 = VecDecryptLast(block1, k); + block2 = VecDecryptLast(block2, k); + block3 = VecDecryptLast(block3, k); + block4 = VecDecryptLast(block4, k); + block5 = VecDecryptLast(block5, k); } ANONYMOUS_NAMESPACE_END @@ -851,14 +851,14 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32* { const uint8x16_p d1 = vec_vsx_ld( 0, (uint8_t*)rkey); const uint8x16_p d2 = vec_vsx_ld(16, (uint8_t*)rkey); - vec_vsx_st(vec_perm(d1, zero, mask), 0, (uint8_t*)rkey); - vec_vsx_st(vec_perm(d2, zero, mask), 16, (uint8_t*)rkey); + vec_vsx_st(VecPermute(d1, zero, mask), 0, (uint8_t*)rkey); + vec_vsx_st(VecPermute(d2, zero, mask), 16, (uint8_t*)rkey); } for ( ; i(x); - x = VectorSHA256<0,1>(x); - x = VectorSHA256<1,0>(x); - x = VectorSHA256<1,1>(x); - VectorStore(x, r); + x = VecSHA256<0,0>(x); + x = VecSHA256<0,1>(x); + x = VecSHA256<1,0>(x); + x = VecSHA256<1,1>(x); + VecStore(x, r); result = (0 == std::memcmp(r, z, 16)); } @@ -268,11 +268,11 @@ bool CPU_ProbeSHA512() byte r[16], z[16] = {0}; uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); - x = VectorSHA512<0,0>(x); - x = VectorSHA512<0,1>(x); - x = VectorSHA512<1,0>(x); - x = VectorSHA512<1,1>(x); - VectorStore(x, r); + x = VecSHA512<0,0>(x); + x = VecSHA512<0,1>(x); + x = VecSHA512<1,0>(x); + x = VecSHA512<1,1>(x); + VecStore(x, r); result = (0 == std::memcmp(r, z, 16)); } @@ -1091,7 +1091,7 @@ typedef __vector unsigned long long uint64x2_p8; // Unaligned load template static inline -uint32x4_p8 VectorLoad32x4u(const T* data, int offset) +uint32x4_p8 VecLoad32x4u(const T* data, int offset) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (uint32x4_p8)vec_xl(offset, (uint8_t*)data); @@ -1102,7 +1102,7 @@ uint32x4_p8 VectorLoad32x4u(const T* data, int offset) // Unaligned store template static inline -void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset) +void VecStore32x4u(const uint32x4_p8 val, T* data, int offset) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) vec_xst((uint8x16_p8)val, offset, (uint8_t*)data); @@ -1114,14 +1114,14 @@ void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset) // Unaligned load of a user message. The load is big-endian, // and then the message is permuted for 32-bit words. 
template static inline -uint32x4_p8 VectorLoadMsg32x4(const T* data, int offset) +uint32x4_p8 VecLoadMsg32x4(const T* data, int offset) { #if (CRYPTOPP_LITTLE_ENDIAN) const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; - const uint32x4_p8 r = VectorLoad32x4u(data, offset); - return (uint32x4_p8)vec_perm(r, r, mask); + const uint32x4_p8 r = VecLoad32x4u(data, offset); + return (uint32x4_p8)VecPermute(r, r, mask); #else - return VectorLoad32x4u(data, offset); + return VecLoad32x4u(data, offset); #endif } @@ -1136,7 +1136,7 @@ static inline uint32x4_p8 VectorMaj(const uint32x4_p8 x, const uint32x4_p8 y, const uint32x4_p8 z) { // The trick below is due to Andy Polyakov and Jack Lloyd - return vec_sel(y, z, vec_xor(x, y)); + return vec_sel(y, z, VecXor(x, y)); } static inline @@ -1185,7 +1185,7 @@ uint32x4_p8 VectorPack(const uint32x4_p8 a, const uint32x4_p8 b, { const uint8x16_p8 m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0}; const uint8x16_p8 m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; - return vec_perm(vec_perm(a,b,m1), vec_perm(c,d,m1), m2); + return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2); } template static inline @@ -1231,8 +1231,8 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t const uint32_t* k = reinterpret_cast(SHA256_K); const uint32_t* m = reinterpret_cast(data); - uint32x4_p8 abcd = VectorLoad32x4u(state+0, 0); - uint32x4_p8 efgh = VectorLoad32x4u(state+4, 0); + uint32x4_p8 abcd = VecLoad32x4u(state+0, 0); + uint32x4_p8 efgh = VecLoad32x4u(state+4, 0); uint32x4_p8 W[16], S[8], vm, vk; size_t blocks = length / SHA256::BLOCKSIZE; @@ -1241,80 +1241,80 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t unsigned int offset=0; S[A] = abcd; S[E] = efgh; - S[B] = VectorShiftLeftOctet<4>(S[A]); - S[F] = VectorShiftLeftOctet<4>(S[E]); - S[C] = VectorShiftLeftOctet<4>(S[B]); - S[G] = VectorShiftLeftOctet<4>(S[F]); - S[D] = VectorShiftLeftOctet<4>(S[C]); - S[H] = VectorShiftLeftOctet<4>(S[G]); + S[B] = VecShiftLeftOctet<4>(S[A]); + S[F] = VecShiftLeftOctet<4>(S[E]); + S[C] = VecShiftLeftOctet<4>(S[B]); + S[G] = VecShiftLeftOctet<4>(S[F]); + S[D] = VecShiftLeftOctet<4>(S[C]); + S[H] = VecShiftLeftOctet<4>(S[G]); // Rounds 0-16 - vk = VectorLoad32x4u(k, offset); - vm = VectorLoadMsg32x4(m, offset); + vk = VecLoad32x4u(k, offset); + vm = VecLoadMsg32x4(m, offset); SHA256_ROUND1<0>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<1>(W,S, vk,vm); - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<2>(W,S, vk,vm); - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<3>(W,S, vk,vm); - vk = VectorLoad32x4u(k, offset); - vm = VectorLoadMsg32x4(m, offset); + vk = VecLoad32x4u(k, offset); + vm = VecLoadMsg32x4(m, offset); SHA256_ROUND1<4>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<5>(W,S, vk,vm); - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<6>(W,S, vk,vm); - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = 
VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<7>(W,S, vk,vm); - vk = VectorLoad32x4u(k, offset); - vm = VectorLoadMsg32x4(m, offset); + vk = VecLoad32x4u(k, offset); + vm = VecLoadMsg32x4(m, offset); SHA256_ROUND1<8>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<9>(W,S, vk,vm); - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<10>(W,S, vk,vm); - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<11>(W,S, vk,vm); - vk = VectorLoad32x4u(k, offset); - vm = VectorLoadMsg32x4(m, offset); + vk = VecLoad32x4u(k, offset); + vm = VecLoadMsg32x4(m, offset); SHA256_ROUND1<12>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<13>(W,S, vk,vm); - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<14>(W,S, vk,vm); - vk = VectorShiftLeftOctet<4>(vk); - vm = VectorShiftLeftOctet<4>(vm); + vk = VecShiftLeftOctet<4>(vk); + vm = VecShiftLeftOctet<4>(vm); SHA256_ROUND1<15>(W,S, vk,vm); m += 16; // 32-bit words, not bytes @@ -1322,32 +1322,32 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t // Rounds 16-64 for (unsigned int i=16; i<64; i+=16) { - vk = VectorLoad32x4u(k, offset); + vk = VecLoad32x4u(k, offset); SHA256_ROUND2<0>(W,S, vk); - SHA256_ROUND2<1>(W,S, VectorShiftLeftOctet<4>(vk)); - SHA256_ROUND2<2>(W,S, VectorShiftLeftOctet<8>(vk)); - SHA256_ROUND2<3>(W,S, VectorShiftLeftOctet<12>(vk)); + SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk)); + SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk)); + SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk)); offset+=16; - vk = VectorLoad32x4u(k, offset); + vk = VecLoad32x4u(k, offset); SHA256_ROUND2<4>(W,S, vk); - SHA256_ROUND2<5>(W,S, VectorShiftLeftOctet<4>(vk)); - SHA256_ROUND2<6>(W,S, VectorShiftLeftOctet<8>(vk)); - SHA256_ROUND2<7>(W,S, VectorShiftLeftOctet<12>(vk)); + SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk)); + SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk)); + SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk)); offset+=16; - vk = VectorLoad32x4u(k, offset); + vk = VecLoad32x4u(k, offset); SHA256_ROUND2<8>(W,S, vk); - SHA256_ROUND2<9>(W,S, VectorShiftLeftOctet<4>(vk)); - SHA256_ROUND2<10>(W,S, VectorShiftLeftOctet<8>(vk)); - SHA256_ROUND2<11>(W,S, VectorShiftLeftOctet<12>(vk)); + SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk)); + SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk)); + SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk)); offset+=16; - vk = VectorLoad32x4u(k, offset); + vk = VecLoad32x4u(k, offset); SHA256_ROUND2<12>(W,S, vk); - SHA256_ROUND2<13>(W,S, VectorShiftLeftOctet<4>(vk)); - SHA256_ROUND2<14>(W,S, VectorShiftLeftOctet<8>(vk)); - SHA256_ROUND2<15>(W,S, VectorShiftLeftOctet<12>(vk)); + SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk)); + SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk)); + SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk)); offset+=16; } @@ -1355,19 +1355,19 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t efgh += VectorPack(S[E],S[F],S[G],S[H]); } - VectorStore32x4u(abcd, state+0, 0); - 
VectorStore32x4u(efgh, state+4, 0); + VecStore32x4u(abcd, state+0, 0); + VecStore32x4u(efgh, state+4, 0); } static inline -uint64x2_p8 VectorPermute64x2(const uint64x2_p8 val, const uint8x16_p8 mask) +uint64x2_p8 VecPermute64x2(const uint64x2_p8 val, const uint8x16_p8 mask) { - return (uint64x2_p8)vec_perm(val, val, mask); + return (uint64x2_p8)VecPermute(val, val, mask); } // Unaligned load template static inline -uint64x2_p8 VectorLoad64x2u(const T* data, int offset) +uint64x2_p8 VecLoad64x2u(const T* data, int offset) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (uint64x2_p8)vec_xl(offset, (uint8_t*)data); @@ -1378,7 +1378,7 @@ uint64x2_p8 VectorLoad64x2u(const T* data, int offset) // Unaligned store template static inline -void VectorStore64x2u(const uint64x2_p8 val, T* data, int offset) +void VecStore64x2u(const uint64x2_p8 val, T* data, int offset) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) vec_xst((uint8x16_p8)val, offset, (uint8_t*)data); @@ -1390,13 +1390,13 @@ void VectorStore64x2u(const uint64x2_p8 val, T* data, int offset) // Unaligned load of a user message. The load is big-endian, // and then the message is permuted for 32-bit words. template static inline -uint64x2_p8 VectorLoadMsg64x2(const T* data, int offset) +uint64x2_p8 VecLoadMsg64x2(const T* data, int offset) { #if (CRYPTOPP_LITTLE_ENDIAN) const uint8x16_p8 mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}; - return VectorPermute64x2(VectorLoad64x2u(data, offset), mask); + return VecPermute64x2(VecLoad64x2u(data, offset), mask); #else - return VectorLoad64x2u(data, offset); + return VecLoad64x2u(data, offset); #endif } @@ -1411,7 +1411,7 @@ static inline uint64x2_p8 VectorMaj(const uint64x2_p8 x, const uint64x2_p8 y, const uint64x2_p8 z) { // The trick below is due to Andy Polyakov and Jack Lloyd - return vec_sel(y, z, vec_xor(x, y)); + return vec_sel(y, z, VecXor(x, y)); } static inline @@ -1458,7 +1458,7 @@ static inline uint64x2_p8 VectorPack(const uint64x2_p8 x, const uint64x2_p8 y) { const uint8x16_p8 m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; - return vec_perm(x,y,m); + return VecPermute(x,y,m); } template static inline @@ -1504,10 +1504,10 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t const uint64_t* k = reinterpret_cast(SHA512_K); const uint64_t* m = reinterpret_cast(data); - uint64x2_p8 ab = VectorLoad64x2u(state+0, 0); - uint64x2_p8 cd = VectorLoad64x2u(state+2, 0); - uint64x2_p8 ef = VectorLoad64x2u(state+4, 0); - uint64x2_p8 gh = VectorLoad64x2u(state+6, 0); + uint64x2_p8 ab = VecLoad64x2u(state+0, 0); + uint64x2_p8 cd = VecLoad64x2u(state+2, 0); + uint64x2_p8 ef = VecLoad64x2u(state+4, 0); + uint64x2_p8 gh = VecLoad64x2u(state+6, 0); uint64x2_p8 W[16], S[8], vm, vk; size_t blocks = length / SHA512::BLOCKSIZE; @@ -1517,82 +1517,82 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t S[A] = ab; S[C] = cd; S[E] = ef; S[G] = gh; - S[B] = VectorShiftLeftOctet<8>(S[A]); - S[D] = VectorShiftLeftOctet<8>(S[C]); - S[F] = VectorShiftLeftOctet<8>(S[E]); - S[H] = VectorShiftLeftOctet<8>(S[G]); + S[B] = VecShiftLeftOctet<8>(S[A]); + S[D] = VecShiftLeftOctet<8>(S[C]); + S[F] = VecShiftLeftOctet<8>(S[E]); + S[H] = VecShiftLeftOctet<8>(S[G]); // Rounds 0-16 - vk = VectorLoad64x2u(k, offset); - vm = VectorLoadMsg64x2(m, offset); + vk = VecLoad64x2u(k, offset); + vm = VecLoadMsg64x2(m, offset); SHA512_ROUND1<0>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<8>(vk); - vm = 
VectorShiftLeftOctet<8>(vm); + vk = VecShiftLeftOctet<8>(vk); + vm = VecShiftLeftOctet<8>(vm); SHA512_ROUND1<1>(W,S, vk,vm); - vk = VectorLoad64x2u(k, offset); - vm = VectorLoadMsg64x2(m, offset); + vk = VecLoad64x2u(k, offset); + vm = VecLoadMsg64x2(m, offset); SHA512_ROUND1<2>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<8>(vk); - vm = VectorShiftLeftOctet<8>(vm); + vk = VecShiftLeftOctet<8>(vk); + vm = VecShiftLeftOctet<8>(vm); SHA512_ROUND1<3>(W,S, vk,vm); - vk = VectorLoad64x2u(k, offset); - vm = VectorLoadMsg64x2(m, offset); + vk = VecLoad64x2u(k, offset); + vm = VecLoadMsg64x2(m, offset); SHA512_ROUND1<4>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<8>(vk); - vm = VectorShiftLeftOctet<8>(vm); + vk = VecShiftLeftOctet<8>(vk); + vm = VecShiftLeftOctet<8>(vm); SHA512_ROUND1<5>(W,S, vk,vm); - vk = VectorLoad64x2u(k, offset); - vm = VectorLoadMsg64x2(m, offset); + vk = VecLoad64x2u(k, offset); + vm = VecLoadMsg64x2(m, offset); SHA512_ROUND1<6>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<8>(vk); - vm = VectorShiftLeftOctet<8>(vm); + vk = VecShiftLeftOctet<8>(vk); + vm = VecShiftLeftOctet<8>(vm); SHA512_ROUND1<7>(W,S, vk,vm); - vk = VectorLoad64x2u(k, offset); - vm = VectorLoadMsg64x2(m, offset); + vk = VecLoad64x2u(k, offset); + vm = VecLoadMsg64x2(m, offset); SHA512_ROUND1<8>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<8>(vk); - vm = VectorShiftLeftOctet<8>(vm); + vk = VecShiftLeftOctet<8>(vk); + vm = VecShiftLeftOctet<8>(vm); SHA512_ROUND1<9>(W,S, vk,vm); - vk = VectorLoad64x2u(k, offset); - vm = VectorLoadMsg64x2(m, offset); + vk = VecLoad64x2u(k, offset); + vm = VecLoadMsg64x2(m, offset); SHA512_ROUND1<10>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<8>(vk); - vm = VectorShiftLeftOctet<8>(vm); + vk = VecShiftLeftOctet<8>(vk); + vm = VecShiftLeftOctet<8>(vm); SHA512_ROUND1<11>(W,S, vk,vm); - vk = VectorLoad64x2u(k, offset); - vm = VectorLoadMsg64x2(m, offset); + vk = VecLoad64x2u(k, offset); + vm = VecLoadMsg64x2(m, offset); SHA512_ROUND1<12>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<8>(vk); - vm = VectorShiftLeftOctet<8>(vm); + vk = VecShiftLeftOctet<8>(vk); + vm = VecShiftLeftOctet<8>(vm); SHA512_ROUND1<13>(W,S, vk,vm); - vk = VectorLoad64x2u(k, offset); - vm = VectorLoadMsg64x2(m, offset); + vk = VecLoad64x2u(k, offset); + vm = VecLoadMsg64x2(m, offset); SHA512_ROUND1<14>(W,S, vk,vm); offset+=16; - vk = VectorShiftLeftOctet<8>(vk); - vm = VectorShiftLeftOctet<8>(vm); + vk = VecShiftLeftOctet<8>(vk); + vm = VecShiftLeftOctet<8>(vm); SHA512_ROUND1<15>(W,S, vk,vm); m += 16; // 64-bit words, not bytes @@ -1600,44 +1600,44 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t // Rounds 16-80 for (unsigned int i=16; i<80; i+=16) { - vk = VectorLoad64x2u(k, offset); + vk = VecLoad64x2u(k, offset); SHA512_ROUND2<0>(W,S, vk); - SHA512_ROUND2<1>(W,S, VectorShiftLeftOctet<8>(vk)); + SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk)); offset+=16; - vk = VectorLoad64x2u(k, offset); + vk = VecLoad64x2u(k, offset); SHA512_ROUND2<2>(W,S, vk); - SHA512_ROUND2<3>(W,S, VectorShiftLeftOctet<8>(vk)); + SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk)); offset+=16; - vk = VectorLoad64x2u(k, offset); + vk = VecLoad64x2u(k, offset); SHA512_ROUND2<4>(W,S, vk); - SHA512_ROUND2<5>(W,S, VectorShiftLeftOctet<8>(vk)); + SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk)); offset+=16; - vk = VectorLoad64x2u(k, offset); + vk = VecLoad64x2u(k, offset); SHA512_ROUND2<6>(W,S, vk); - SHA512_ROUND2<7>(W,S, VectorShiftLeftOctet<8>(vk)); + 
SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk)); offset+=16; - vk = VectorLoad64x2u(k, offset); + vk = VecLoad64x2u(k, offset); SHA512_ROUND2<8>(W,S, vk); - SHA512_ROUND2<9>(W,S, VectorShiftLeftOctet<8>(vk)); + SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk)); offset+=16; - vk = VectorLoad64x2u(k, offset); + vk = VecLoad64x2u(k, offset); SHA512_ROUND2<10>(W,S, vk); - SHA512_ROUND2<11>(W,S, VectorShiftLeftOctet<8>(vk)); + SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk)); offset+=16; - vk = VectorLoad64x2u(k, offset); + vk = VecLoad64x2u(k, offset); SHA512_ROUND2<12>(W,S, vk); - SHA512_ROUND2<13>(W,S, VectorShiftLeftOctet<8>(vk)); + SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk)); offset+=16; - vk = VectorLoad64x2u(k, offset); + vk = VecLoad64x2u(k, offset); SHA512_ROUND2<14>(W,S, vk); - SHA512_ROUND2<15>(W,S, VectorShiftLeftOctet<8>(vk)); + SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk)); offset+=16; } @@ -1647,10 +1647,10 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t gh += VectorPack(S[G],S[H]); } - VectorStore64x2u(ab, state+0, 0); - VectorStore64x2u(cd, state+2, 0); - VectorStore64x2u(ef, state+4, 0); - VectorStore64x2u(gh, state+6, 0); + VecStore64x2u(ab, state+0, 0); + VecStore64x2u(cd, state+2, 0); + VecStore64x2u(ef, state+4, 0); + VecStore64x2u(gh, state+6, 0); } #endif // CRYPTOPP_POWER8_SHA_AVAILABLE diff --git a/simon128_simd.cpp b/simon128_simd.cpp index c1a80546..1277a234 100644 --- a/simon128_simd.cpp +++ b/simon128_simd.cpp @@ -548,8 +548,9 @@ using CryptoPP::uint8x16_p; using CryptoPP::uint32x4_p; using CryptoPP::uint64x2_p; -using CryptoPP::VectorAnd; -using CryptoPP::VectorXor; +using CryptoPP::VecAnd; +using CryptoPP::VecXor; +using CryptoPP::VecPermute; // Rotate left by bit count template @@ -569,8 +570,8 @@ CRYPTOPP_INLINE uint64x2_p RotateRight64(const uint64x2_p val) CRYPTOPP_INLINE uint64x2_p SIMON128_f(const uint64x2_p val) { - return VectorXor(RotateLeft64<2>(val), - VectorAnd(RotateLeft64<1>(val), RotateLeft64<8>(val))); + return VecXor(RotateLeft64<2>(val), + VecAnd(RotateLeft64<1>(val), RotateLeft64<8>(val))); } CRYPTOPP_INLINE void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds) @@ -584,22 +585,22 @@ CRYPTOPP_INLINE void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys #endif // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1); - uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2); + uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1); + uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2); for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) { const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]); - y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk1); + y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1); const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]); - x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk2); + x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2); } if (rounds & 1) { const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]); - y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk); + y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk); std::swap(x1, y1); } @@ -612,7 +613,7 @@ CRYPTOPP_INLINE void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys #endif // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
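// [Annotation, not part of the patch] The hunks in this file swap direct
// vec_perm/vec_xor intrinsic calls and the old VectorXxx names for the renamed
// VecXxx helpers. A minimal sketch of what those helpers in ppc_simd.h are
// assumed to look like follows; the exact templates and casts may differ, so
// treat this as orientation only.
#include <altivec.h>

typedef __vector unsigned char uint8x16_p;

template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
    return (T1)vec_xor(vec1, (T1)vec2);      // bitwise XOR, any lane width
}

template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
    return (T1)vec_and(vec1, (T1)vec2);      // bitwise AND, any lane width
}

template <class T1, class T2>
inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
{
    return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask);  // byte-level shuffle
}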
- block = (uint32x4_p)vec_perm(x1, y1, m3); + block = (uint32x4_p)VecPermute(x1, y1, m3); } CRYPTOPP_INLINE void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds) @@ -626,24 +627,24 @@ CRYPTOPP_INLINE void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys #endif // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1); - uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2); + uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1); + uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2); if (rounds & 1) { std::swap(x1, y1); const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]); - y1 = VectorXor(VectorXor(y1, rk), SIMON128_f(x1)); + y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1)); rounds--; } for (int i = static_cast(rounds-2); i >= 0; i -= 2) { const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]); - x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk1); + x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1); const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]); - y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk2); + y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2); } #if (CRYPTOPP_BIG_ENDIAN) @@ -655,7 +656,7 @@ CRYPTOPP_INLINE void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys #endif // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... - block = (uint32x4_p)vec_perm(x1, y1, m3); + block = (uint32x4_p)VecPermute(x1, y1, m3); } CRYPTOPP_INLINE void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, @@ -671,32 +672,32 @@ CRYPTOPP_INLINE void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block #endif // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1); - uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2); - uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1); - uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2); - uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1); - uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2); + uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1); + uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2); + uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1); + uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2); + uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1); + uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2); for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) { const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]); - y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk1); - y2 = VectorXor(VectorXor(y2, SIMON128_f(x2)), rk1); - y3 = VectorXor(VectorXor(y3, SIMON128_f(x3)), rk1); + y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1); + y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk1); + y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk1); const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]); - x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk2); - x2 = VectorXor(VectorXor(x2, SIMON128_f(y2)), rk2); - x3 = VectorXor(VectorXor(x3, SIMON128_f(y3)), rk2); + x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2); + x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk2); + x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk2); } if (rounds & 1) { const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]); - y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk); - y2 = VectorXor(VectorXor(y2, SIMON128_f(x2)), rk); - y3 = VectorXor(VectorXor(y3, SIMON128_f(x3)), rk); + y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk); + y2 = 
VecXor(VecXor(y2, SIMON128_f(x2)), rk); + y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk); std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); } @@ -709,12 +710,12 @@ CRYPTOPP_INLINE void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block #endif // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... - block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); - block2 = (uint32x4_p)vec_perm(x2, y2, m3); - block3 = (uint32x4_p)vec_perm(x2, y2, m4); - block4 = (uint32x4_p)vec_perm(x3, y3, m3); - block5 = (uint32x4_p)vec_perm(x3, y3, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); + block2 = (uint32x4_p)VecPermute(x2, y2, m3); + block3 = (uint32x4_p)VecPermute(x2, y2, m4); + block4 = (uint32x4_p)VecPermute(x3, y3, m3); + block5 = (uint32x4_p)VecPermute(x3, y3, m4); } CRYPTOPP_INLINE void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, @@ -730,34 +731,34 @@ CRYPTOPP_INLINE void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block #endif // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1); - uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2); - uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1); - uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2); - uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1); - uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2); + uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1); + uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2); + uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1); + uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2); + uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1); + uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2); if (rounds & 1) { std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]); - y1 = VectorXor(VectorXor(y1, rk), SIMON128_f(x1)); - y2 = VectorXor(VectorXor(y2, rk), SIMON128_f(x2)); - y3 = VectorXor(VectorXor(y3, rk), SIMON128_f(x3)); + y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1)); + y2 = VecXor(VecXor(y2, rk), SIMON128_f(x2)); + y3 = VecXor(VecXor(y3, rk), SIMON128_f(x3)); rounds--; } for (int i = static_cast(rounds-2); i >= 0; i -= 2) { const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]); - x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk1); - x2 = VectorXor(VectorXor(x2, SIMON128_f(y2)), rk1); - x3 = VectorXor(VectorXor(x3, SIMON128_f(y3)), rk1); + x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1); + x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk1); + x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk1); const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]); - y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk2); - y2 = VectorXor(VectorXor(y2, SIMON128_f(x2)), rk2); - y3 = VectorXor(VectorXor(y3, SIMON128_f(x3)), rk2); + y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2); + y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk2); + y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk2); } #if (CRYPTOPP_BIG_ENDIAN) @@ -769,12 +770,12 @@ CRYPTOPP_INLINE void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block #endif // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
- block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); - block2 = (uint32x4_p)vec_perm(x2, y2, m3); - block3 = (uint32x4_p)vec_perm(x2, y2, m4); - block4 = (uint32x4_p)vec_perm(x3, y3, m3); - block5 = (uint32x4_p)vec_perm(x3, y3, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); + block2 = (uint32x4_p)VecPermute(x2, y2, m3); + block3 = (uint32x4_p)VecPermute(x2, y2, m4); + block4 = (uint32x4_p)VecPermute(x3, y3, m3); + block5 = (uint32x4_p)VecPermute(x3, y3, m4); } #endif // CRYPTOPP_POWER8_AVAILABLE diff --git a/simon64_simd.cpp b/simon64_simd.cpp index 6b1a1db6..74bcf22b 100644 --- a/simon64_simd.cpp +++ b/simon64_simd.cpp @@ -538,10 +538,11 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1, using CryptoPP::uint8x16_p; using CryptoPP::uint32x4_p; -using CryptoPP::VectorAnd; -using CryptoPP::VectorXor; -using CryptoPP::VectorLoad; -using CryptoPP::VectorLoadBE; +using CryptoPP::VecAnd; +using CryptoPP::VecXor; +using CryptoPP::VecLoad; +using CryptoPP::VecLoadBE; +using CryptoPP::VecPermute; // Rotate left by bit count template @@ -561,8 +562,8 @@ CRYPTOPP_INLINE uint32x4_p RotateRight32(const uint32x4_p val) CRYPTOPP_INLINE uint32x4_p SIMON64_f(const uint32x4_p val) { - return VectorXor(RotateLeft32<2>(val), - VectorAnd(RotateLeft32<1>(val), RotateLeft32<8>(val))); + return VecXor(RotateLeft32<2>(val), + VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val))); } CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1, @@ -577,8 +578,8 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... - uint32x4_p x1 = vec_perm(block0, block1, m1); - uint32x4_p y1 = vec_perm(block0, block1, m2); + uint32x4_p x1 = VecPermute(block0, block1, m1); + uint32x4_p y1 = VecPermute(block0, block1, m2); for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) { @@ -587,13 +588,13 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1, const uint32x4_p rk2 = vec_splats(subkeys[i+1]); #else const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk1 = VectorLoad(subkeys+i); - uint32x4_p rk2 = VectorLoad(subkeys+i+1); - rk1 = vec_perm(rk1, rk1, m); - rk2 = vec_perm(rk2, rk2, m); + uint32x4_p rk1 = VecLoad(subkeys+i); + uint32x4_p rk2 = VecLoad(subkeys+i+1); + rk1 = VecPermute(rk1, rk1, m); + rk2 = VecPermute(rk2, rk2, m); #endif - y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1); - x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2); + y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1); + x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2); } if (rounds & 1) @@ -602,10 +603,10 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1, const uint32x4_p rk = vec_splats(subkeys[rounds-1]); #else const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk = VectorLoad(subkeys+rounds-1); - rk = vec_perm(rk, rk, m); + uint32x4_p rk = VecLoad(subkeys+rounds-1); + rk = VecPermute(rk, rk, m); #endif - y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk); + y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk); std::swap(x1, y1); } @@ -618,8 +619,8 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] - block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = 
(uint32x4_p)VecPermute(x1, y1, m4); } CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, @@ -634,8 +635,8 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... - uint32x4_p x1 = vec_perm(block0, block1, m1); - uint32x4_p y1 = vec_perm(block0, block1, m2); + uint32x4_p x1 = VecPermute(block0, block1, m1); + uint32x4_p y1 = VecPermute(block0, block1, m2); if (rounds & 1) { @@ -644,10 +645,10 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, const uint32x4_p rk = vec_splats(subkeys[rounds-1]); #else const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk = VectorLoad(subkeys+rounds-1); - rk = vec_perm(rk, rk, m); + uint32x4_p rk = VecLoad(subkeys+rounds-1); + rk = VecPermute(rk, rk, m); #endif - y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1)); + y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1)); rounds--; } @@ -658,13 +659,13 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, const uint32x4_p rk2 = vec_splats(subkeys[i]); #else const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk1 = VectorLoad(subkeys+i+1); - uint32x4_p rk2 = VectorLoad(subkeys+i); - rk1 = vec_perm(rk1, rk1, m); - rk2 = vec_perm(rk2, rk2, m); + uint32x4_p rk1 = VecLoad(subkeys+i+1); + uint32x4_p rk2 = VecLoad(subkeys+i); + rk1 = VecPermute(rk1, rk1, m); + rk2 = VecPermute(rk2, rk2, m); #endif - x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1); - y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2); + x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1); + y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2); } #if (CRYPTOPP_BIG_ENDIAN) @@ -676,8 +677,8 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] - block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); } CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, @@ -693,12 +694,12 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1 #endif // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
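// [Annotation, not part of the patch] Scalar reference for the SIMON64 round
// logic that the vector code in this file expresses with VecXor, VecAnd and
// the RotateLeft32 templates. The names below are illustrative only.
#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, unsigned int r)
{
    return (uint32_t)((v << r) | (v >> (32 - r)));   // r in 1..31
}

// SIMON f(): (x <<< 1 AND x <<< 8) XOR (x <<< 2), matching SIMON64_f above.
static inline uint32_t simon64_f(uint32_t x)
{
    return (rotl32(x, 1) & rotl32(x, 8)) ^ rotl32(x, 2);
}

// One unrolled pair of encryption rounds, as in the vector loop:
// y ^= f(x) ^ rk[i]; then x ^= f(y) ^ rk[i+1].
static inline void simon64_enc_round_pair(uint32_t &x, uint32_t &y,
                                          uint32_t rk1, uint32_t rk2)
{
    y ^= simon64_f(x) ^ rk1;
    x ^= simon64_f(y) ^ rk2;
}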
- uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1); - uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2); - uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1); - uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2); - uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1); - uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2); + uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1); + uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2); + uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1); + uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2); + uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1); + uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2); for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) { @@ -707,18 +708,18 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1 const uint32x4_p rk2 = vec_splats(subkeys[i+1]); #else const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk1 = VectorLoad(subkeys+i); - uint32x4_p rk2 = VectorLoad(subkeys+i+1); - rk1 = vec_perm(rk1, rk1, m); - rk2 = vec_perm(rk2, rk2, m); + uint32x4_p rk1 = VecLoad(subkeys+i); + uint32x4_p rk2 = VecLoad(subkeys+i+1); + rk1 = VecPermute(rk1, rk1, m); + rk2 = VecPermute(rk2, rk2, m); #endif - y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1); - y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk1); - y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk1); + y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1); + y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1); + y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1); - x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2); - x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk2); - x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk2); + x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2); + x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2); + x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2); } if (rounds & 1) @@ -727,12 +728,12 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1 const uint32x4_p rk = vec_splats(subkeys[rounds-1]); #else const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk = VectorLoad(subkeys+rounds-1); - rk = vec_perm(rk, rk, m); + uint32x4_p rk = VecLoad(subkeys+rounds-1); + rk = VecPermute(rk, rk, m); #endif - y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk); - y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk); - y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk); + y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk); + y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk); + y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk); std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); } @@ -745,12 +746,12 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1 #endif // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
- block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); - block2 = (uint32x4_p)vec_perm(x2, y2, m3); - block3 = (uint32x4_p)vec_perm(x2, y2, m4); - block4 = (uint32x4_p)vec_perm(x3, y3, m3); - block5 = (uint32x4_p)vec_perm(x3, y3, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); + block2 = (uint32x4_p)VecPermute(x2, y2, m3); + block3 = (uint32x4_p)VecPermute(x2, y2, m4); + block4 = (uint32x4_p)VecPermute(x3, y3, m3); + block5 = (uint32x4_p)VecPermute(x3, y3, m4); } CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, @@ -766,12 +767,12 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1 #endif // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1); - uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2); - uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1); - uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2); - uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1); - uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2); + uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1); + uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2); + uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1); + uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2); + uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1); + uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2); if (rounds & 1) { @@ -781,12 +782,12 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1 const uint32x4_p rk = vec_splats(subkeys[rounds-1]); #else const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk = VectorLoad(subkeys+rounds-1); - rk = vec_perm(rk, rk, m); + uint32x4_p rk = VecLoad(subkeys+rounds-1); + rk = VecPermute(rk, rk, m); #endif - y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1)); - y2 = VectorXor(VectorXor(y2, rk), SIMON64_f(x2)); - y3 = VectorXor(VectorXor(y3, rk), SIMON64_f(x3)); + y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1)); + y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2)); + y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3)); rounds--; } @@ -797,18 +798,18 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1 const uint32x4_p rk2 = vec_splats(subkeys[i]); #else const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk1 = VectorLoad(subkeys+i+1); - uint32x4_p rk2 = VectorLoad(subkeys+i); - rk1 = vec_perm(rk1, rk1, m); - rk2 = vec_perm(rk2, rk2, m); + uint32x4_p rk1 = VecLoad(subkeys+i+1); + uint32x4_p rk2 = VecLoad(subkeys+i); + rk1 = VecPermute(rk1, rk1, m); + rk2 = VecPermute(rk2, rk2, m); #endif - x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1); - x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk1); - x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk1); + x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1); + x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1); + x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1); - y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2); - y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk2); - y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk2); + y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2); + y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2); + y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2); } #if (CRYPTOPP_BIG_ENDIAN) @@ -820,12 +821,12 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1 #endif // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
- block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); - block2 = (uint32x4_p)vec_perm(x2, y2, m3); - block3 = (uint32x4_p)vec_perm(x2, y2, m4); - block4 = (uint32x4_p)vec_perm(x3, y3, m3); - block5 = (uint32x4_p)vec_perm(x3, y3, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); + block2 = (uint32x4_p)VecPermute(x2, y2, m3); + block3 = (uint32x4_p)VecPermute(x2, y2, m4); + block4 = (uint32x4_p)VecPermute(x3, y3, m3); + block5 = (uint32x4_p)VecPermute(x3, y3, m4); } #endif // CRYPTOPP_ALTIVEC_AVAILABLE diff --git a/speck128_simd.cpp b/speck128_simd.cpp index dd1ef08c..d59b1b1e 100644 --- a/speck128_simd.cpp +++ b/speck128_simd.cpp @@ -479,9 +479,10 @@ using CryptoPP::uint8x16_p; using CryptoPP::uint32x4_p; using CryptoPP::uint64x2_p; -using CryptoPP::VectorAdd; -using CryptoPP::VectorSub; -using CryptoPP::VectorXor; +using CryptoPP::VecAdd; +using CryptoPP::VecSub; +using CryptoPP::VecXor; +using CryptoPP::VecPermute; // Rotate left by bit count template @@ -510,19 +511,19 @@ void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r #endif // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1); - uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2); + uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1); + uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2); for (int i=0; i < static_cast(rounds); ++i) { const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]); x1 = RotateRight64<8>(x1); - x1 = VectorAdd(x1, y1); - x1 = VectorXor(x1, rk); + x1 = VecAdd(x1, y1); + x1 = VecXor(x1, rk); y1 = RotateLeft64<3>(y1); - y1 = VectorXor(y1, x1); + y1 = VecXor(y1, x1); } #if (CRYPTOPP_BIG_ENDIAN) @@ -534,7 +535,7 @@ void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r #endif // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... - block = (uint32x4_p)vec_perm(x1, y1, m3); + block = (uint32x4_p)VecPermute(x1, y1, m3); } void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds) @@ -548,17 +549,17 @@ void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r #endif // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1); - uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2); + uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1); + uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2); for (int i = static_cast(rounds-1); i >= 0; --i) { const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]); - y1 = VectorXor(y1, x1); + y1 = VecXor(y1, x1); y1 = RotateRight64<3>(y1); - x1 = VectorXor(x1, rk); - x1 = VectorSub(x1, y1); + x1 = VecXor(x1, rk); + x1 = VecSub(x1, y1); x1 = RotateLeft64<8>(x1); } @@ -571,7 +572,7 @@ void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r #endif // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... - block = (uint32x4_p)vec_perm(x1, y1, m3); + block = (uint32x4_p)VecPermute(x1, y1, m3); } void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, @@ -587,12 +588,12 @@ void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
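// [Annotation, not part of the patch] Scalar reference for the SPECK128 round
// that SPECK128_Enc_Block/SPECK128_Dec_Block express with VecAdd, VecSub,
// VecXor and the 64-bit rotate templates. Illustrative only.
#include <stdint.h>

static inline uint64_t rotl64(uint64_t v, unsigned int r) { return (v << r) | (v >> (64 - r)); }
static inline uint64_t rotr64(uint64_t v, unsigned int r) { return (v >> r) | (v << (64 - r)); }

// Encryption round: x = ((x >>> 8) + y) ^ k; y = (y <<< 3) ^ x.
static inline void speck128_enc_round(uint64_t &x, uint64_t &y, uint64_t k)
{
    x = rotr64(x, 8);
    x += y;
    x ^= k;
    y = rotl64(y, 3);
    y ^= x;
}

// Matching decryption round, run with the subkeys in reverse order.
static inline void speck128_dec_round(uint64_t &x, uint64_t &y, uint64_t k)
{
    y ^= x;
    y = rotr64(y, 3);
    x ^= k;
    x -= y;
    x = rotl64(x, 8);
}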
- uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1); - uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2); - uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1); - uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2); - uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1); - uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2); + uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1); + uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2); + uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1); + uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2); + uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1); + uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2); for (int i=0; i < static_cast(rounds); ++i) { @@ -601,19 +602,19 @@ void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, x1 = RotateRight64<8>(x1); x2 = RotateRight64<8>(x2); x3 = RotateRight64<8>(x3); - x1 = VectorAdd(x1, y1); - x2 = VectorAdd(x2, y2); - x3 = VectorAdd(x3, y3); - x1 = VectorXor(x1, rk); - x2 = VectorXor(x2, rk); - x3 = VectorXor(x3, rk); + x1 = VecAdd(x1, y1); + x2 = VecAdd(x2, y2); + x3 = VecAdd(x3, y3); + x1 = VecXor(x1, rk); + x2 = VecXor(x2, rk); + x3 = VecXor(x3, rk); y1 = RotateLeft64<3>(y1); y2 = RotateLeft64<3>(y2); y3 = RotateLeft64<3>(y3); - y1 = VectorXor(y1, x1); - y2 = VectorXor(y2, x2); - y3 = VectorXor(y3, x3); + y1 = VecXor(y1, x1); + y2 = VecXor(y2, x2); + y3 = VecXor(y3, x3); } #if (CRYPTOPP_BIG_ENDIAN) @@ -625,12 +626,12 @@ void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... - block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); - block2 = (uint32x4_p)vec_perm(x2, y2, m3); - block3 = (uint32x4_p)vec_perm(x2, y2, m4); - block4 = (uint32x4_p)vec_perm(x3, y3, m3); - block5 = (uint32x4_p)vec_perm(x3, y3, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); + block2 = (uint32x4_p)VecPermute(x2, y2, m3); + block3 = (uint32x4_p)VecPermute(x2, y2, m4); + block4 = (uint32x4_p)VecPermute(x3, y3, m3); + block5 = (uint32x4_p)VecPermute(x3, y3, m4); } void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, @@ -646,30 +647,30 @@ void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
- uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1); - uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2); - uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1); - uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2); - uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1); - uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2); + uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1); + uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2); + uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1); + uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2); + uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1); + uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2); for (int i = static_cast(rounds-1); i >= 0; --i) { const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]); - y1 = VectorXor(y1, x1); - y2 = VectorXor(y2, x2); - y3 = VectorXor(y3, x3); + y1 = VecXor(y1, x1); + y2 = VecXor(y2, x2); + y3 = VecXor(y3, x3); y1 = RotateRight64<3>(y1); y2 = RotateRight64<3>(y2); y3 = RotateRight64<3>(y3); - x1 = VectorXor(x1, rk); - x2 = VectorXor(x2, rk); - x3 = VectorXor(x3, rk); - x1 = VectorSub(x1, y1); - x2 = VectorSub(x2, y2); - x3 = VectorSub(x3, y3); + x1 = VecXor(x1, rk); + x2 = VecXor(x2, rk); + x3 = VecXor(x3, rk); + x1 = VecSub(x1, y1); + x2 = VecSub(x2, y2); + x3 = VecSub(x3, y3); x1 = RotateLeft64<8>(x1); x2 = RotateLeft64<8>(x2); x3 = RotateLeft64<8>(x3); @@ -684,12 +685,12 @@ void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... - block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); - block2 = (uint32x4_p)vec_perm(x2, y2, m3); - block3 = (uint32x4_p)vec_perm(x2, y2, m4); - block4 = (uint32x4_p)vec_perm(x3, y3, m3); - block5 = (uint32x4_p)vec_perm(x3, y3, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); + block2 = (uint32x4_p)VecPermute(x2, y2, m3); + block3 = (uint32x4_p)VecPermute(x2, y2, m4); + block4 = (uint32x4_p)VecPermute(x3, y3, m3); + block5 = (uint32x4_p)VecPermute(x3, y3, m4); } #endif // CRYPTOPP_POWER8_AVAILABLE diff --git a/speck64_simd.cpp b/speck64_simd.cpp index 0ed4f8d3..b8f8b2b2 100644 --- a/speck64_simd.cpp +++ b/speck64_simd.cpp @@ -483,10 +483,11 @@ CRYPTOPP_INLINE void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1, using CryptoPP::uint8x16_p; using CryptoPP::uint32x4_p; -using CryptoPP::VectorAdd; -using CryptoPP::VectorSub; -using CryptoPP::VectorXor; -using CryptoPP::VectorLoad; +using CryptoPP::VecAdd; +using CryptoPP::VecSub; +using CryptoPP::VecXor; +using CryptoPP::VecLoad; +using CryptoPP::VecPermute; // Rotate left by bit count template @@ -516,8 +517,8 @@ void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
- uint32x4_p x1 = vec_perm(block0, block1, m1); - uint32x4_p y1 = vec_perm(block0, block1, m2); + uint32x4_p x1 = VecPermute(block0, block1, m1); + uint32x4_p y1 = VecPermute(block0, block1, m2); for (int i=0; i < static_cast(rounds); ++i) { @@ -526,16 +527,16 @@ void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1, #else // subkeys has extra elements so memory backs the last subkey const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk = VectorLoad(subkeys+i); - rk = vec_perm(rk, rk, m); + uint32x4_p rk = VecLoad(subkeys+i); + rk = VecPermute(rk, rk, m); #endif x1 = RotateRight32<8>(x1); - x1 = VectorAdd(x1, y1); - x1 = VectorXor(x1, rk); + x1 = VecAdd(x1, y1); + x1 = VecXor(x1, rk); y1 = RotateLeft32<3>(y1); - y1 = VectorXor(y1, x1); + y1 = VecXor(y1, x1); } #if (CRYPTOPP_BIG_ENDIAN) @@ -547,8 +548,8 @@ void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] - block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); } void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, @@ -563,8 +564,8 @@ void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... - uint32x4_p x1 = vec_perm(block0, block1, m1); - uint32x4_p y1 = vec_perm(block0, block1, m2); + uint32x4_p x1 = VecPermute(block0, block1, m1); + uint32x4_p y1 = VecPermute(block0, block1, m2); for (int i = static_cast(rounds-1); i >= 0; --i) { @@ -573,15 +574,15 @@ void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, #else // subkeys has extra elements so memory backs the last subkey const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk = VectorLoad(subkeys+i); - rk = vec_perm(rk, rk, m); + uint32x4_p rk = VecLoad(subkeys+i); + rk = VecPermute(rk, rk, m); #endif - y1 = VectorXor(y1, x1); + y1 = VecXor(y1, x1); y1 = RotateRight32<3>(y1); - x1 = VectorXor(x1, rk); - x1 = VectorSub(x1, y1); + x1 = VecXor(x1, rk); + x1 = VecSub(x1, y1); x1 = RotateLeft32<8>(x1); } @@ -594,8 +595,8 @@ void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] - block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); } void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, @@ -611,12 +612,12 @@ void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
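// [Annotation, not part of the patch] On the fallback path in these kernels
// (the #if condition is not visible in the hunk) the 32-bit subkey is loaded
// with VecLoad and then broadcast by repeating its byte indices {0,1,2,3} in a
// permute mask, instead of using vec_splats. A standalone sketch of that
// pattern, with a hypothetical helper name:
#include <altivec.h>

typedef __vector unsigned char uint8x16_p;
typedef __vector unsigned int  uint32x4_p;

static inline uint32x4_p SplatWord0(const uint32x4_p rk)
{
    // Replicate bytes 0..3 (the first 32-bit word) into every lane.
    const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
    return (uint32x4_p)vec_perm(rk, rk, m);
}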
- uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1); - uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2); - uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1); - uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2); - uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1); - uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2); + uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1); + uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2); + uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1); + uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2); + uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1); + uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2); for (int i=0; i < static_cast(rounds); ++i) { @@ -625,29 +626,29 @@ void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, #else // subkeys has extra elements so memory backs the last subkey const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk = VectorLoad(subkeys+i); - rk = vec_perm(rk, rk, m); + uint32x4_p rk = VecLoad(subkeys+i); + rk = VecPermute(rk, rk, m); #endif x1 = RotateRight32<8>(x1); x2 = RotateRight32<8>(x2); x3 = RotateRight32<8>(x3); - x1 = VectorAdd(x1, y1); - x2 = VectorAdd(x2, y2); - x3 = VectorAdd(x3, y3); + x1 = VecAdd(x1, y1); + x2 = VecAdd(x2, y2); + x3 = VecAdd(x3, y3); - x1 = VectorXor(x1, rk); - x2 = VectorXor(x2, rk); - x3 = VectorXor(x3, rk); + x1 = VecXor(x1, rk); + x2 = VecXor(x2, rk); + x3 = VecXor(x3, rk); y1 = RotateLeft32<3>(y1); y2 = RotateLeft32<3>(y2); y3 = RotateLeft32<3>(y3); - y1 = VectorXor(y1, x1); - y2 = VectorXor(y2, x2); - y3 = VectorXor(y3, x3); + y1 = VecXor(y1, x1); + y2 = VecXor(y2, x2); + y3 = VecXor(y3, x3); } #if (CRYPTOPP_BIG_ENDIAN) @@ -659,12 +660,12 @@ void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] - block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); - block2 = (uint32x4_p)vec_perm(x2, y2, m3); - block3 = (uint32x4_p)vec_perm(x2, y2, m4); - block4 = (uint32x4_p)vec_perm(x3, y3, m3); - block5 = (uint32x4_p)vec_perm(x3, y3, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); + block2 = (uint32x4_p)VecPermute(x2, y2, m3); + block3 = (uint32x4_p)VecPermute(x2, y2, m4); + block4 = (uint32x4_p)VecPermute(x3, y3, m3); + block5 = (uint32x4_p)VecPermute(x3, y3, m4); } void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, @@ -680,12 +681,12 @@ void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
- uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1); - uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2); - uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1); - uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2); - uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1); - uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2); + uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1); + uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2); + uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1); + uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2); + uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1); + uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2); for (int i = static_cast(rounds-1); i >= 0; --i) { @@ -694,25 +695,25 @@ void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, #else // subkeys has extra elements so memory backs the last subkey const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; - uint32x4_p rk = VectorLoad(subkeys+i); - rk = vec_perm(rk, rk, m); + uint32x4_p rk = VecLoad(subkeys+i); + rk = VecPermute(rk, rk, m); #endif - y1 = VectorXor(y1, x1); - y2 = VectorXor(y2, x2); - y3 = VectorXor(y3, x3); + y1 = VecXor(y1, x1); + y2 = VecXor(y2, x2); + y3 = VecXor(y3, x3); y1 = RotateRight32<3>(y1); y2 = RotateRight32<3>(y2); y3 = RotateRight32<3>(y3); - x1 = VectorXor(x1, rk); - x2 = VectorXor(x2, rk); - x3 = VectorXor(x3, rk); + x1 = VecXor(x1, rk); + x2 = VecXor(x2, rk); + x3 = VecXor(x3, rk); - x1 = VectorSub(x1, y1); - x2 = VectorSub(x2, y2); - x3 = VectorSub(x3, y3); + x1 = VecSub(x1, y1); + x2 = VecSub(x2, y2); + x3 = VecSub(x3, y3); x1 = RotateLeft32<8>(x1); x2 = RotateLeft32<8>(x2); @@ -728,12 +729,12 @@ void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, #endif // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] - block0 = (uint32x4_p)vec_perm(x1, y1, m3); - block1 = (uint32x4_p)vec_perm(x1, y1, m4); - block2 = (uint32x4_p)vec_perm(x2, y2, m3); - block3 = (uint32x4_p)vec_perm(x2, y2, m4); - block4 = (uint32x4_p)vec_perm(x3, y3, m3); - block5 = (uint32x4_p)vec_perm(x3, y3, m4); + block0 = (uint32x4_p)VecPermute(x1, y1, m3); + block1 = (uint32x4_p)VecPermute(x1, y1, m4); + block2 = (uint32x4_p)VecPermute(x2, y2, m3); + block3 = (uint32x4_p)VecPermute(x2, y2, m4); + block4 = (uint32x4_p)VecPermute(x3, y3, m3); + block5 = (uint32x4_p)VecPermute(x3, y3, m4); } #endif // CRYPTOPP_ALTIVEC_AVAILABLE diff --git a/validat1.cpp b/validat1.cpp index 4432680f..39392b94 100644 --- a/validat1.cpp +++ b/validat1.cpp @@ -1089,44 +1089,44 @@ bool TestAltivecOps() const byte st2[16] ={21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6}; const byte st3[16] ={20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5}; - VectorStore(VectorLoad(src), dest); + VecStore(VecLoad(src), dest); pass1 = (0 == std::memcmp(src, dest, 16)) && pass1; CRYPTOPP_ASSERT(pass1); - VectorStore(VectorLoad(src+1), dest+1); + VecStore(VecLoad(src+1), dest+1); pass1 = (0 == std::memcmp(st1, dest+1, 16)) && pass1; CRYPTOPP_ASSERT(pass1); - VectorStore(VectorLoad(src+2), dest+2); + VecStore(VecLoad(src+2), dest+2); pass1 = (0 == std::memcmp(st2, dest+2, 16)) && pass1; CRYPTOPP_ASSERT(pass1); - VectorStore(VectorLoad(src+3), dest+3); + VecStore(VecLoad(src+3), dest+3); pass1 = (0 == std::memcmp(st3, dest+3, 16)) && pass1; CRYPTOPP_ASSERT(pass1); - VectorStoreBE(VectorLoadBE(src), dest); + VecStoreBE(VecLoadBE(src), dest); pass1 = (0 == std::memcmp(src, dest, 16)) && pass1; CRYPTOPP_ASSERT(pass1); - 
VectorStoreBE(VectorLoadBE(src+1), dest+1); + VecStoreBE(VecLoadBE(src+1), dest+1); pass1 = (0 == std::memcmp(st1, dest+1, 16)) && pass1; CRYPTOPP_ASSERT(pass1); - VectorStoreBE(VectorLoadBE(src+2), dest+2); + VecStoreBE(VecLoadBE(src+2), dest+2); pass1 = (0 == std::memcmp(st2, dest+2, 16)) && pass1; CRYPTOPP_ASSERT(pass1); - VectorStoreBE(VectorLoadBE(src+3), dest+3); + VecStoreBE(VecLoadBE(src+3), dest+3); pass1 = (0 == std::memcmp(st3, dest+3, 16)) && pass1; CRYPTOPP_ASSERT(pass1); #if (CRYPTOPP_LITTLE_ENDIAN) - VectorStore(VectorLoadBE(src), dest); + VecStore(VecLoadBE(src), dest); pass1 = (0 != std::memcmp(src, dest, 16)) && pass1; CRYPTOPP_ASSERT(pass1); - VectorStoreBE(VectorLoad(src), dest); + VecStoreBE(VecLoad(src), dest); pass1 = (0 != std::memcmp(src, dest, 16)) && pass1; CRYPTOPP_ASSERT(pass1); #endif @@ -1143,9 +1143,9 @@ bool TestAltivecOps() uint8x16_p val = {0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff}; - pass2 = (VectorEqual(val, VectorShiftLeftOctet<0>(val))) && pass2; + pass2 = (VecEqual(val, VecShiftLeftOctet<0>(val))) && pass2; CRYPTOPP_ASSERT(pass2); - pass2 = (VectorEqual(val, VectorShiftRightOctet<0>(val))) && pass2; + pass2 = (VecEqual(val, VecShiftRightOctet<0>(val))) && pass2; CRYPTOPP_ASSERT(pass2); uint8x16_p lsh1 = {0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, @@ -1153,9 +1153,9 @@ bool TestAltivecOps() uint8x16_p rsh1 = {0x00,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff}; - pass2 = (VectorEqual(lsh1, VectorShiftLeftOctet<1>(val))) && pass2; + pass2 = (VecEqual(lsh1, VecShiftLeftOctet<1>(val))) && pass2; CRYPTOPP_ASSERT(pass2); - pass2 = (VectorEqual(rsh1, VectorShiftRightOctet<1>(val))) && pass2; + pass2 = (VecEqual(rsh1, VecShiftRightOctet<1>(val))) && pass2; CRYPTOPP_ASSERT(pass2); uint8x16_p lsh15 = {0xff,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, @@ -1163,9 +1163,9 @@ bool TestAltivecOps() uint8x16_p rsh15 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xff}; - pass2 = (VectorEqual(lsh15, VectorShiftLeftOctet<15>(val))) && pass2; + pass2 = (VecEqual(lsh15, VecShiftLeftOctet<15>(val))) && pass2; CRYPTOPP_ASSERT(pass2); - pass2 = (VectorEqual(rsh15, VectorShiftRightOctet<15>(val))) && pass2; + pass2 = (VecEqual(rsh15, VecShiftRightOctet<15>(val))) && pass2; CRYPTOPP_ASSERT(pass2); uint8x16_p lsh16 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, @@ -1173,9 +1173,9 @@ bool TestAltivecOps() uint8x16_p rsh16 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}; - pass2 = (VectorEqual(lsh16, VectorShiftLeftOctet<16>(val))) && pass2; + pass2 = (VecEqual(lsh16, VecShiftLeftOctet<16>(val))) && pass2; CRYPTOPP_ASSERT(pass2); - pass2 = (VectorEqual(rsh16, VectorShiftRightOctet<16>(val))) && pass2; + pass2 = (VecEqual(rsh16, VecShiftRightOctet<16>(val))) && pass2; CRYPTOPP_ASSERT(pass2); if (!pass2) @@ -1194,16 +1194,16 @@ bool TestAltivecOps() uint8x16_p ex3 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x1f,0x1e,0x1d,0x1c, 0x1b,0x1a,0x19,0x18}; - pass3 = VectorEqual(ex2, VectorGetLow(ex1)) && pass3; + pass3 = VecEqual(ex2, VecGetLow(ex1)) && pass3; CRYPTOPP_ASSERT(pass3); - pass3 = VectorEqual(ex3, VectorGetHigh(ex1)) && pass3; + pass3 = VecEqual(ex3, VecGetHigh(ex1)) && pass3; CRYPTOPP_ASSERT(pass3); - uint8x16_p ex4 = VectorShiftRightOctet<8>(VectorShiftLeftOctet<8>(ex1)); - pass3 = VectorEqual(ex4, VectorGetLow(ex1)) && pass3; + uint8x16_p ex4 = VecShiftRightOctet<8>(VecShiftLeftOctet<8>(ex1)); + pass3 = VecEqual(ex4, VecGetLow(ex1)) && 
pass3; CRYPTOPP_ASSERT(pass3); - uint8x16_p ex5 = VectorShiftRightOctet<8>(ex1); - pass3 = VectorEqual(ex5, VectorGetHigh(ex1)) && pass3; + uint8x16_p ex5 = VecShiftRightOctet<8>(ex1); + pass3 = VecEqual(ex5, VecGetHigh(ex1)) && pass3; CRYPTOPP_ASSERT(pass3); if (!pass3)
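// [Annotation, not part of the patch] TestAltivecOps above checks, among other
// things, that the octet shifts agree with the low/high extractors. A minimal
// standalone restatement of those two identities, assuming ppc_simd.h is
// available; the helper name is illustrative.
#include "ppc_simd.h"

static inline bool CheckGetLowHighIdentities(const CryptoPP::uint8x16_p x)
{
    using CryptoPP::uint8x16_p;
    using CryptoPP::VecEqual;
    using CryptoPP::VecGetLow;
    using CryptoPP::VecGetHigh;
    using CryptoPP::VecShiftLeftOctet;
    using CryptoPP::VecShiftRightOctet;

    // Shifting left then right by 8 octets clears the high half -> VecGetLow.
    const uint8x16_p lo = VecShiftRightOctet<8>(VecShiftLeftOctet<8>(x));
    // Shifting right by 8 octets moves the high half down -> VecGetHigh.
    const uint8x16_p hi = VecShiftRightOctet<8>(x);

    return VecEqual(lo, VecGetLow(x)) && VecEqual(hi, VecGetHigh(x));
}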