Rename PPC vector functions from VectorFunc to VecFunc

Jeffrey Walton 2018-11-15 15:17:49 -05:00
parent 8e5cd3637e
commit f6e04e5f33
GPG Key ID: B36AB348921B1838
15 changed files with 1140 additions and 1135 deletions

View File

@ -1857,54 +1857,54 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
// even harder without POWER8 due to lack of 64-bit elements.
std::memcpy(temp+LowOffset, inBlocks, 8);
std::memcpy(temp+HighOffset, inBlocks, 8);
uint32x4_p ctr = (uint32x4_p)VectorLoadBE(temp);
uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);
// For 64-bit block ciphers we need to load the CTR block,
// which is 8 bytes. After the dup load we have two counters
// in the Altivec word. Then we need to increment the low ctr
// by 0 and the high ctr by 1.
block0 = VectorAdd(s_one, ctr);
block0 = VecAdd(s_one, ctr);
// After initial increment of {0,1} remaining counters
// increment by {2,2}.
block1 = VectorAdd(s_two, block0);
block2 = VectorAdd(s_two, block1);
block3 = VectorAdd(s_two, block2);
block4 = VectorAdd(s_two, block3);
block5 = VectorAdd(s_two, block4);
block1 = VecAdd(s_two, block0);
block2 = VecAdd(s_two, block1);
block3 = VecAdd(s_two, block2);
block4 = VecAdd(s_two, block3);
block5 = VecAdd(s_two, block4);
// Update the counter in the caller.
const_cast<byte*>(inBlocks)[7] += 12;
}
else
{
block0 = VectorLoadBE(inBlocks);
block0 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block1 = VectorLoadBE(inBlocks);
block1 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block2 = VectorLoadBE(inBlocks);
block2 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block3 = VectorLoadBE(inBlocks);
block3 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block4 = VectorLoadBE(inBlocks);
block4 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block5 = VectorLoadBE(inBlocks);
block5 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
}
if (xorInput)
{
block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
block0 = VecXor(block0, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
block1 = VecXor(block1, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
block2 = VecXor(block2, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
block3 = VecXor(block3, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
block4 = VecXor(block4, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
block5 = VecXor(block5, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
}
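As background for the counter handling above: for a 64-bit block cipher the 16-byte vector holds two 8-byte counter blocks, so s_one bumps the pair by {0,1} once, s_two advances each subsequent pair by {2,2}, and the caller's counter finally moves forward by 12. A scalar model of that schedule (an illustrative sketch only; it ignores the endian handling done by VecLoadBE, and none of the names below come from the library):

#include <cassert>
#include <cstdint>

int main()
{
    uint64_t ctr = 0;                    // the caller's 8-byte CTR block (hypothetical value)
    uint64_t lo = ctr, hi = ctr;         // dup load: both halves start out equal

    hi += 1;                             // s_one adds {0,1} for block0
    uint64_t blocks[12] = {lo, hi};

    for (int i = 2; i < 12; i += 2)      // s_two adds {2,2} for block1..block5
    {
        lo += 2; hi += 2;
        blocks[i] = lo; blocks[i + 1] = hi;
    }

    assert(blocks[11] == ctr + 11);      // twelve consecutive counter values
    // The caller's counter then advances by 12, matching inBlocks[7] += 12 above.
    return 0;
}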
@ -1912,31 +1912,31 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
if (xorOutput)
{
block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
block0 = VecXor(block0, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
block1 = VecXor(block1, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
block2 = VecXor(block2, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
block3 = VecXor(block3, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
block4 = VecXor(block4, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
block5 = VecXor(block5, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
}
VectorStoreBE(block0, outBlocks);
VecStoreBE(block0, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block1, outBlocks);
VecStoreBE(block1, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block2, outBlocks);
VecStoreBE(block2, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block3, outBlocks);
VecStoreBE(block3, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block4, outBlocks);
VecStoreBE(block4, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block5, outBlocks);
VecStoreBE(block5, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
length -= 6*vsxBlockSize;
@ -1951,34 +1951,34 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
// even harder without POWER8 due to lack of 64-bit elements.
std::memcpy(temp+LowOffset, inBlocks, 8);
std::memcpy(temp+HighOffset, inBlocks, 8);
uint32x4_p ctr = (uint32x4_p)VectorLoadBE(temp);
uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);
// For 64-bit block ciphers we need to load the CTR block,
// which is 8 bytes. After the dup load we have two counters
// in the Altivec word. Then we need to increment the low ctr
// by 0 and the high ctr by 1.
block0 = VectorAdd(s_one, ctr);
block0 = VecAdd(s_one, ctr);
// After initial increment of {0,1} remaining counters
// increment by {2,2}.
block1 = VectorAdd(s_two, block0);
block1 = VecAdd(s_two, block0);
// Update the counter in the caller.
const_cast<byte*>(inBlocks)[7] += 4;
}
else
{
block0 = VectorLoadBE(inBlocks);
block0 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block1 = VectorLoadBE(inBlocks);
block1 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
}
if (xorInput)
{
block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
block0 = VecXor(block0, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
block1 = VecXor(block1, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
}
@ -1986,15 +1986,15 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
if (xorOutput)
{
block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
block0 = VecXor(block0, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
block1 = VecXor(block1, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
}
VectorStoreBE(block0, outBlocks);
VecStoreBE(block0, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block1, outBlocks);
VecStoreBE(block1, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
length -= 2*vsxBlockSize;
@ -2030,14 +2030,14 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
// initialize the block then it generates warnings.
std::memcpy(temp+LowOffset, inBlocks, 8);
std::memcpy(temp+HighOffset, inBlocks, 8); // don't care
block = (uint32x4_p)VectorLoadBE(temp);
block = (uint32x4_p)VecLoadBE(temp);
if (xorInput)
{
std::memcpy(temp+LowOffset, xorBlocks, 8);
std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
uint32x4_p x = (uint32x4_p)VectorLoadBE(temp);
block = VectorXor(block, x);
uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
block = VecXor(block, x);
}
// Update the counter in the caller.
@ -2050,11 +2050,11 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
{
std::memcpy(temp+LowOffset, xorBlocks, 8);
std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
uint32x4_p x = (uint32x4_p)VectorLoadBE(temp);
block = VectorXor(block, x);
uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
block = VecXor(block, x);
}
VectorStoreBE(block, temp);
VecStoreBE(block, temp);
std::memcpy(outBlocks, temp+LowOffset, 8);
inBlocks = PtrAdd(inBlocks, inIncrement);
@ -2120,10 +2120,10 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
if (flags & BT_InBlockIsCounter)
{
block0 = VectorLoadBE(inBlocks);
block1 = VectorAdd(block0, s_one);
block2 = VectorAdd(block1, s_one);
block3 = VectorAdd(block2, s_one);
block0 = VecLoadBE(inBlocks);
block1 = VecAdd(block0, s_one);
block2 = VecAdd(block1, s_one);
block3 = VecAdd(block2, s_one);
// Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
// CTR_ModePolicy::OperateKeystream is wired such that after
@ -2137,25 +2137,25 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
}
else
{
block0 = VectorLoadBE(inBlocks);
block0 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block1 = VectorLoadBE(inBlocks);
block1 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block2 = VectorLoadBE(inBlocks);
block2 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block3 = VectorLoadBE(inBlocks);
block3 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
}
if (xorInput)
{
block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
block0 = VecXor(block0, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
block1 = VecXor(block1, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
block2 = VecXor(block2, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
block3 = VecXor(block3, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
}
@ -2163,23 +2163,23 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
if (xorOutput)
{
block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
block0 = VecXor(block0, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
block1 = VecXor(block1, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
block2 = VecXor(block2, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
block3 = VecXor(block3, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
}
VectorStoreBE(block0, outBlocks);
VecStoreBE(block0, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block1, outBlocks);
VecStoreBE(block1, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block2, outBlocks);
VecStoreBE(block2, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block3, outBlocks);
VecStoreBE(block3, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
length -= 4*blockSize;
@ -2188,10 +2188,10 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
while (length >= blockSize)
{
uint32x4_p block = VectorLoadBE(inBlocks);
uint32x4_p block = VecLoadBE(inBlocks);
if (xorInput)
block = VectorXor(block, VectorLoadBE(xorBlocks));
block = VecXor(block, VecLoadBE(xorBlocks));
if (flags & BT_InBlockIsCounter)
const_cast<byte *>(inBlocks)[15]++;
@ -2199,9 +2199,9 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
func1(block, subKeys, rounds);
if (xorOutput)
block = VectorXor(block, VectorLoadBE(xorBlocks));
block = VecXor(block, VecLoadBE(xorBlocks));
VectorStoreBE(block, outBlocks);
VecStoreBE(block, outBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
outBlocks = PtrAdd(outBlocks, outIncrement);
@ -2265,12 +2265,12 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
if (flags & BT_InBlockIsCounter)
{
block0 = VectorLoadBE(inBlocks);
block1 = VectorAdd(block0, s_one);
block2 = VectorAdd(block1, s_one);
block3 = VectorAdd(block2, s_one);
block4 = VectorAdd(block3, s_one);
block5 = VectorAdd(block4, s_one);
block0 = VecLoadBE(inBlocks);
block1 = VecAdd(block0, s_one);
block2 = VecAdd(block1, s_one);
block3 = VecAdd(block2, s_one);
block4 = VecAdd(block3, s_one);
block5 = VecAdd(block4, s_one);
// Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
// CTR_ModePolicy::OperateKeystream is wired such that after
@ -2286,38 +2286,38 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
// the issue. If the last octet was 0xFC then 4 would trigger it.
// We dumb-lucked into the test with SPECK-128. The test case of
// interest is the one with IV 348ECA9766C09F04 826520DE47A212FA.
uint8x16_p temp = VectorAdd((uint8x16_p)block5, (uint8x16_p)s_one);
VectorStoreBE(temp, const_cast<byte*>(inBlocks));
uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
VecStoreBE(temp, const_cast<byte*>(inBlocks));
}
else
{
block0 = VectorLoadBE(inBlocks);
block0 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block1 = VectorLoadBE(inBlocks);
block1 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block2 = VectorLoadBE(inBlocks);
block2 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block3 = VectorLoadBE(inBlocks);
block3 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block4 = VectorLoadBE(inBlocks);
block4 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
block5 = VectorLoadBE(inBlocks);
block5 = VecLoadBE(inBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
}
if (xorInput)
{
block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
block0 = VecXor(block0, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
block1 = VecXor(block1, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
block2 = VecXor(block2, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
block3 = VecXor(block3, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
block4 = VecXor(block4, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
block5 = VecXor(block5, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
}
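The 0xFC remark above is about losing a carry when only the last octet of a big-endian counter is touched. As a general illustration of the hazard (a hypothetical helper, not the library's code path, which writes the whole incremented vector back with VecStoreBE): advancing a 16-byte big-endian counter safely means propagating the carry across octets.

#include <cstdint>

// Hypothetical helper: add n to a 16-byte big-endian counter, octet by octet,
// so an overflow of ctr[15] (for example 0xFC + 4, as in the comment above)
// carries into ctr[14].
static void IncrementBE128(uint8_t ctr[16], unsigned int n)
{
    for (int i = 15; i >= 0 && n != 0; --i)
    {
        const unsigned int sum = ctr[i] + (n & 0xFF);
        ctr[i] = static_cast<uint8_t>(sum);
        n = (n >> 8) + (sum >> 8);       // carry into the next more-significant octet
    }
}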
@ -2325,31 +2325,31 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
if (xorOutput)
{
block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
block0 = VecXor(block0, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
block1 = VecXor(block1, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
block2 = VecXor(block2, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
block3 = VecXor(block3, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
block4 = VecXor(block4, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
block5 = VecXor(block5, VecLoadBE(xorBlocks));
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
}
VectorStoreBE(block0, outBlocks);
VecStoreBE(block0, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block1, outBlocks);
VecStoreBE(block1, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block2, outBlocks);
VecStoreBE(block2, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block3, outBlocks);
VecStoreBE(block3, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block4, outBlocks);
VecStoreBE(block4, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
VectorStoreBE(block5, outBlocks);
VecStoreBE(block5, outBlocks);
outBlocks = PtrAdd(outBlocks, outIncrement);
length -= 6*blockSize;
@ -2358,10 +2358,10 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
while (length >= blockSize)
{
uint32x4_p block = VectorLoadBE(inBlocks);
uint32x4_p block = VecLoadBE(inBlocks);
if (xorInput)
block = VectorXor(block, VectorLoadBE(xorBlocks));
block = VecXor(block, VecLoadBE(xorBlocks));
if (flags & BT_InBlockIsCounter)
const_cast<byte *>(inBlocks)[15]++;
@ -2369,9 +2369,9 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
func1(block, subKeys, rounds);
if (xorOutput)
block = VectorXor(block, VectorLoadBE(xorBlocks));
block = VecXor(block, VecLoadBE(xorBlocks));
VectorStoreBE(block, outBlocks);
VecStoreBE(block, outBlocks);
inBlocks = PtrAdd(inBlocks, inIncrement);
outBlocks = PtrAdd(outBlocks, outIncrement);

View File

@ -742,7 +742,7 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)
#if (CRYPTOPP_POWER8_AVAILABLE)
inline uint64x2_p VectorLoad64(const void* p)
inline uint64x2_p VecLoad64(const void* p)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint64x2_p)vec_xl(0, (uint8_t*)p);
@ -751,18 +751,18 @@ inline uint64x2_p VectorLoad64(const void* p)
#endif
}
inline uint64x2_p VectorLoad64LE(const void* p)
inline uint64x2_p VecLoad64LE(const void* p)
{
#if __BIG_ENDIAN__
const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
const uint64x2_p v = VectorLoad64(p);
return vec_perm(v, v, m);
const uint64x2_p v = VecLoad64(p);
return VecPermute(v, v, m);
#else
return VectorLoad64(p);
return VecLoad64(p);
#endif
}
inline void VectorStore64(void* p, const uint64x2_p x)
inline void VecStore64(void* p, const uint64x2_p x)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst((uint8x16_p)x,0,(uint8_t*)p);
@ -771,18 +771,18 @@ inline void VectorStore64(void* p, const uint64x2_p x)
#endif
}
inline void VectorStore64LE(void* p, const uint64x2_p x)
inline void VecStore64LE(void* p, const uint64x2_p x)
{
#if __BIG_ENDIAN__
const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
VectorStore64(p, vec_perm(x, x, m));
VecStore64(p, VecPermute(x, x, m));
#else
VectorStore64(p, x);
VecStore64(p, x);
#endif
}
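On big-endian targets the mask {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8} used by VecLoad64LE and VecStore64LE reverses the bytes within each 64-bit lane, so BLAKE2b's little-endian message and state words are interpreted correctly. A standalone scalar illustration of that per-lane reversal (hypothetical code, not from the library):

#include <cstdint>
#include <cstdio>

int main()
{
    // Little-endian byte image of the 64-bit word 0x0123456789ABCDEF.
    const uint8_t bytes[8] = {0xEF,0xCD,0xAB,0x89, 0x67,0x45,0x23,0x01};

    uint64_t w = 0;
    for (int i = 7; i >= 0; --i)         // mask order 7,6,5,...,0 reverses the lane
        w = (w << 8) | bytes[i];

    printf("%016llx\n", (unsigned long long)w);  // prints 0123456789abcdef
    return 0;
}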
template <unsigned int C>
inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
inline uint64x2_p VecShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
{
#if __BIG_ENDIAN__
return (uint64x2_p)vec_sld((uint8x16_p)a, (uint8x16_p)b, C);
@ -791,18 +791,18 @@ inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
#endif
}
#define vec_shl_octet(a,b,c) VectorShiftLeftOctet<c*8>(a, b)
#define vec_shl_octet(a,b,c) VecShiftLeftOctet<c*8>(a, b)
// vec_mergeh(a,b) is equivalent to vec_perm(a,b,HH_MASK); and
// vec_mergel(a,b) is equivalent vec_perm(a,b,LL_MASK). Benchmarks
// vec_mergeh(a,b) is equivalent to VecPermute(a,b,HH_MASK); and
// vec_mergel(a,b) is equivalent VecPermute(a,b,LL_MASK). Benchmarks
// show vec_mergeh and vec_mergel is faster on little-endian
// machines by 0.4 cpb. Benchmarks show vec_perm is faster on
// machines by 0.4 cpb. Benchmarks show VecPermute is faster on
// big-endian machines by 1.5 cpb. The code that uses
// vec_mergeh and vec_mergel is about 880 bytes shorter.
#if defined(__GNUC__) && (__BIG_ENDIAN__)
# define vec_merge_hi(a,b) vec_perm(a,b, HH_MASK)
# define vec_merge_lo(a,b) vec_perm(a,b, LL_MASK)
# define vec_merge_hi(a,b) VecPermute(a,b, HH_MASK)
# define vec_merge_lo(a,b) VecPermute(a,b, LL_MASK)
#else
# define vec_merge_hi(a,b) vec_mergeh(a,b)
# define vec_merge_lo(a,b) vec_mergel(a,b)
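A hedged sketch of the equivalence the comment above describes, assuming big-endian element numbering; the HH_MASK/LL_MASK values shown here are hypothetical stand-ins for the masks defined elsewhere in this file, and the fragment relies on the same ppc_simd.h types and helpers used above:

inline void MergePermuteSketch()
{
    const uint8x16_p hh = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
    const uint8x16_p ll = {8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
    const uint64x2_p a = {1, 2}, b = {3, 4};

    const uint64x2_p m1 = vec_mergeh(a, b);      // {1, 3} on a big-endian target
    const uint64x2_p p1 = VecPermute(a, b, hh);  // also {1, 3}
    const uint64x2_p m2 = vec_mergel(a, b);      // {2, 4} on a big-endian target
    const uint64x2_p p2 = VecPermute(a, b, ll);  // also {2, 4}
    (void)m1; (void)p1; (void)m2; (void)p2;
}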
@ -878,12 +878,12 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
#define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
do { \
b0 = vec_merge_hi(m4, m0); \
b1 = vec_perm(m1, m6, HL_MASK); \
b1 = VecPermute(m1, m6, HL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
do { \
b0 = vec_perm(m5, m1, HL_MASK); \
b0 = VecPermute(m5, m1, HL_MASK); \
b1 = vec_merge_lo(m3, m4); \
} while(0)
@ -907,8 +907,8 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
#define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
do { \
b0 = vec_perm(m1, m2, HL_MASK); \
b1 = vec_perm(m2, m7, HL_MASK); \
b0 = VecPermute(m1, m2, HL_MASK); \
b1 = VecPermute(m2, m7, HL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
@ -925,20 +925,20 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
#define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
do { \
b0 = vec_perm(m0, m3, HL_MASK); \
b1 = vec_perm(m2, m7, HL_MASK); \
b0 = VecPermute(m0, m3, HL_MASK); \
b1 = VecPermute(m2, m7, HL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
do { \
b0 = vec_perm(m7, m5, HL_MASK); \
b1 = vec_perm(m3, m1, HL_MASK); \
b0 = VecPermute(m7, m5, HL_MASK); \
b1 = VecPermute(m3, m1, HL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
do { \
b0 = vec_shl_octet(m0, m6, 1); \
b1 = vec_perm(m4, m6, HL_MASK); \
b1 = VecPermute(m4, m6, HL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
@ -955,19 +955,19 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
#define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
do { \
b0 = vec_perm(m2, m3, HL_MASK); \
b0 = VecPermute(m2, m3, HL_MASK); \
b1 = vec_merge_lo(m7, m0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
do { \
b0 = vec_merge_lo(m6, m2); \
b1 = vec_perm(m7, m4, HL_MASK); \
b1 = VecPermute(m7, m4, HL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
do { \
b0 = vec_perm(m6, m0, HL_MASK); \
b0 = VecPermute(m6, m0, HL_MASK); \
b1 = vec_merge_hi(m7, m2); \
} while(0)
@ -986,13 +986,13 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
#define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
do { \
b0 = vec_merge_lo(m3, m1); \
b1 = vec_perm(m1, m5, HL_MASK); \
b1 = VecPermute(m1, m5, HL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
do { \
b0 = vec_merge_lo(m6, m3); \
b1 = vec_perm(m6, m1, HL_MASK); \
b1 = VecPermute(m6, m1, HL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
@ -1033,7 +1033,7 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
#define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
do { \
b0 = vec_perm(m1, m3, HL_MASK); \
b0 = VecPermute(m1, m3, HL_MASK); \
b1 = m2; \
} while(0)
@ -1046,7 +1046,7 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
#define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
do { \
b0 = vec_merge_hi(m1, m2); \
b1 = vec_perm(m3, m2, HL_MASK); \
b1 = VecPermute(m3, m2, HL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
@ -1122,23 +1122,23 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
do { \
row1l = vec_add(vec_add(row1l, b0), row2l); \
row1h = vec_add(vec_add(row1h, b1), row2h); \
row4l = vec_xor(row4l, row1l); row4h = vec_xor(row4h, row1h); \
row1l = VecAdd(VecAdd(row1l, b0), row2l); \
row1h = VecAdd(VecAdd(row1h, b1), row2h); \
row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \
row3l = vec_add(row3l, row4l); row3h = vec_add(row3h, row4h); \
row2l = vec_xor(row2l, row3l); row2h = vec_xor(row2h, row3h); \
row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \
} while(0)
#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
do { \
row1l = vec_add(vec_add(row1l, b0), row2l); \
row1h = vec_add(vec_add(row1h, b1), row2h); \
row4l = vec_xor(row4l, row1l); row4h = vec_xor(row4h, row1h); \
row1l = VecAdd(VecAdd(row1l, b0), row2l); \
row1h = VecAdd(VecAdd(row1h, b1), row2h); \
row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \
row3l = vec_add(row3l, row4l); row3h = vec_add(row3h, row4h); \
row2l = vec_xor(row2l, row3l); row2h = vec_xor(row2h, row3h); \
row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \
} while(0)
@ -1175,27 +1175,27 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
} while(0)
const uint64x2_p m0 = VectorLoad64LE(input + 00);
const uint64x2_p m1 = VectorLoad64LE(input + 16);
const uint64x2_p m2 = VectorLoad64LE(input + 32);
const uint64x2_p m3 = VectorLoad64LE(input + 48);
const uint64x2_p m4 = VectorLoad64LE(input + 64);
const uint64x2_p m5 = VectorLoad64LE(input + 80);
const uint64x2_p m6 = VectorLoad64LE(input + 96);
const uint64x2_p m7 = VectorLoad64LE(input + 112);
const uint64x2_p m0 = VecLoad64LE(input + 00);
const uint64x2_p m1 = VecLoad64LE(input + 16);
const uint64x2_p m2 = VecLoad64LE(input + 32);
const uint64x2_p m3 = VecLoad64LE(input + 48);
const uint64x2_p m4 = VecLoad64LE(input + 64);
const uint64x2_p m5 = VecLoad64LE(input + 80);
const uint64x2_p m6 = VecLoad64LE(input + 96);
const uint64x2_p m7 = VecLoad64LE(input + 112);
uint64x2_p row1l, row1h, row2l, row2h;
uint64x2_p row3l, row3h, row4l, row4h;
const uint64x2_p h0 = row1l = VectorLoad64LE(&state.h[0]);
const uint64x2_p h1 = row1h = VectorLoad64LE(&state.h[2]);
const uint64x2_p h2 = row2l = VectorLoad64LE(&state.h[4]);
const uint64x2_p h3 = row2h = VectorLoad64LE(&state.h[6]);
const uint64x2_p h0 = row1l = VecLoad64LE(&state.h[0]);
const uint64x2_p h1 = row1h = VecLoad64LE(&state.h[2]);
const uint64x2_p h2 = row2l = VecLoad64LE(&state.h[4]);
const uint64x2_p h3 = row2h = VecLoad64LE(&state.h[6]);
row3l = VectorLoad64(&BLAKE2B_IV[0]);
row3h = VectorLoad64(&BLAKE2B_IV[2]);
row4l = vec_xor(VectorLoad64(&BLAKE2B_IV[4]), VectorLoad64(&state.tf[0]));
row4h = vec_xor(VectorLoad64(&BLAKE2B_IV[6]), VectorLoad64(&state.tf[2]));
row3l = VecLoad64(&BLAKE2B_IV[0]);
row3h = VecLoad64(&BLAKE2B_IV[2]);
row4l = VecXor(VecLoad64(&BLAKE2B_IV[4]), VecLoad64(&state.tf[0]));
row4h = VecXor(VecLoad64(&BLAKE2B_IV[6]), VecLoad64(&state.tf[2]));
BLAKE2B_ROUND(0);
BLAKE2B_ROUND(1);
@ -1210,10 +1210,10 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
BLAKE2B_ROUND(10);
BLAKE2B_ROUND(11);
VectorStore64LE(&state.h[0], vec_xor(h0, vec_xor(row1l, row3l)));
VectorStore64LE(&state.h[2], vec_xor(h1, vec_xor(row1h, row3h)));
VectorStore64LE(&state.h[4], vec_xor(h2, vec_xor(row2l, row4l)));
VectorStore64LE(&state.h[6], vec_xor(h3, vec_xor(row2h, row4h)));
VecStore64LE(&state.h[0], VecXor(h0, VecXor(row1l, row3l)));
VecStore64LE(&state.h[2], VecXor(h1, VecXor(row1h, row3h)));
VecStore64LE(&state.h[4], VecXor(h2, VecXor(row2l, row4l)));
VecStore64LE(&state.h[6], VecXor(h3, VecXor(row2h, row4h)));
}
#endif // CRYPTOPP_POWER8_AVAILABLE

View File

@ -683,34 +683,34 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
inline uint32x4_p VectorLoad32(const void* p)
inline uint32x4_p VecLoad32(const void* p)
{
return VectorLoad((const word32*)p);
return VecLoad((const word32*)p);
}
inline uint32x4_p VectorLoad32LE(const void* p)
inline uint32x4_p VecLoad32LE(const void* p)
{
#if __BIG_ENDIAN__
const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
const uint32x4_p v = VectorLoad((const word32*)p);
return vec_perm(v, v, m);
const uint32x4_p v = VecLoad((const word32*)p);
return VecPermute(v, v, m);
#else
return VectorLoad((const word32*)p);
return VecLoad((const word32*)p);
#endif
}
inline void VectorStore32(void* p, const uint32x4_p x)
inline void VecStore32(void* p, const uint32x4_p x)
{
VectorStore(x, (word32*)p);
VecStore(x, (word32*)p);
}
inline void VectorStore32LE(void* p, const uint32x4_p x)
inline void VecStore32LE(void* p, const uint32x4_p x)
{
#if __BIG_ENDIAN__
const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
VectorStore(vec_perm(x, x, m), (word32*)p);
VecStore(VecPermute(x, x, m), (word32*)p);
#else
VectorStore(x, (word32*)p);
VecStore(x, (word32*)p);
#endif
}
@ -718,7 +718,7 @@ template <unsigned int E1, unsigned int E2>
inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b)
{
// Re-index. I'd like to use something like Z=Y*4 and then
// VectorShiftLeftOctet<Z>(b) but it crashes early Red Hat
// VecShiftLeftOctet<Z>(b) but it crashes early Red Hat
// GCC compilers.
enum {X=E1&3, Y=E2&3};
@ -729,88 +729,88 @@ inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b)
if (X == 0 && Y == 0)
{
const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, b, mask);
return VecPermute(a, b, mask);
}
else if (X == 0 && Y == 1)
{
const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
}
else if (X == 0 && Y == 2)
{
const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
}
else if (X == 0 && Y == 3)
{
const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
}
// Element 1 combinations
else if (X == 1 && Y == 0)
{
const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, b, mask);
return VecPermute(a, b, mask);
}
else if (X == 1 && Y == 1)
{
const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
}
else if (X == 1 && Y == 2)
{
const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
}
else if (X == 1 && Y == 3)
{
const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
}
// Element 2 combinations
else if (X == 2 && Y == 0)
{
const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, b, mask);
return VecPermute(a, b, mask);
}
else if (X == 2 && Y == 1)
{
const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
}
else if (X == 2 && Y == 2)
{
const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
}
else if (X == 2 && Y == 3)
{
const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
}
// Element 3 combinations
else if (X == 3 && Y == 0)
{
const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, b, mask);
return VecPermute(a, b, mask);
}
else if (X == 3 && Y == 1)
{
const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
}
else if (X == 3 && Y == 2)
{
const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
}
else if (X == 3 && Y == 3)
{
const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
}
}
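The "Re-index" comment near the top of VectorSet32 mentions the compact form that was avoided because it crashed early Red Hat GCC releases. For reference, that form would look roughly like the sketch below (hypothetical, assuming VecShiftLeftOctet<0> acts as an identity; the shipped code keeps the unrolled if/else chain above):

template <unsigned int E1, unsigned int E2>
inline uint32x4_p VectorSet32Compact(const uint32x4_p a, const uint32x4_p b)
{
    enum {DC=31, X=E1&3, Y=E2&3, Z=Y*4};
    const uint8x16_p mask = {X*4+0, X*4+1, X*4+2, X*4+3,
                             16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
    // Shift the selected word of 'b' to the front, then pick a[X] and b[Y].
    return VecPermute(a, VecShiftLeftOctet<Z>(b), mask);
}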
@ -826,7 +826,7 @@ inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b,
// Power7 follows SSE2's implementation, and this is _mm_set_epi32.
const uint8x16_p mask = {20,21,22,23, 16,17,18,19, 4,5,6,7, 0,1,2,3};
return vec_perm(t0, t1, mask);
return VecPermute(t0, t1, mask);
}
template<>
@ -835,7 +835,7 @@ uint32x4_p VectorSet32<2,0,2,0>(const uint32x4_p a, const uint32x4_p b,
{
// a=b, c=d, mask is {2,0, 2,0}
const uint8x16_p mask = {16,17,18,19, 24,25,26,27, 0,1,2,3, 8,9,10,11};
return vec_perm(a, c, mask);
return VecPermute(a, c, mask);
}
template<>
@ -844,7 +844,7 @@ uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b,
{
// a=b, c=d, mask is {3,1, 3,1}
const uint8x16_p mask = {20,21,22,23, 28,29,30,31, 4,5,6,7, 12,13,14,15};
return vec_perm(a, c, mask);
return VecPermute(a, c, mask);
}
void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
@ -919,25 +919,25 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
#define BLAKE2S_LOAD_MSG_9_3(buf) buf = VectorSet32<13,3,9,15>(m13,m3,m9,m15)
#define BLAKE2S_LOAD_MSG_9_4(buf) buf = VectorSet32<0,12,14,11>(m0,m12,m14,m11)
#define vec_ror_16(x) VectorRotateRight<16>(x)
#define vec_ror_12(x) VectorRotateRight<12>(x)
#define vec_ror_8(x) VectorRotateRight<8>(x)
#define vec_ror_7(x) VectorRotateRight<7>(x)
#define vec_ror_16(x) VecRotateRight<16>(x)
#define vec_ror_12(x) VecRotateRight<12>(x)
#define vec_ror_8(x) VecRotateRight<8>(x)
#define vec_ror_7(x) VecRotateRight<7>(x)
#define BLAKE2S_G1(row1,row2,row3,row4,buf) \
row1 = vec_add(vec_add(row1, buf), row2); \
row4 = vec_xor(row4, row1); \
row1 = VecAdd(VecAdd(row1, buf), row2); \
row4 = VecXor(row4, row1); \
row4 = vec_ror_16(row4); \
row3 = vec_add(row3, row4); \
row2 = vec_xor(row2, row3); \
row3 = VecAdd(row3, row4); \
row2 = VecXor(row2, row3); \
row2 = vec_ror_12(row2);
#define BLAKE2S_G2(row1,row2,row3,row4,buf) \
row1 = vec_add(vec_add(row1, buf), row2); \
row4 = vec_xor(row4, row1); \
row1 = VecAdd(VecAdd(row1, buf), row2); \
row4 = VecXor(row4, row1); \
row4 = vec_ror_8(row4); \
row3 = vec_add(row3, row4); \
row2 = vec_xor(row2, row3); \
row3 = VecAdd(row3, row4); \
row2 = VecXor(row2, row3); \
row2 = vec_ror_7(row2);
const uint8x16_p D2103_MASK = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
@ -945,14 +945,14 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
const uint8x16_p D0321_MASK = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
#define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
row4 = vec_perm(row4, row4, D2103_MASK); \
row3 = vec_perm(row3, row3, D1032_MASK); \
row2 = vec_perm(row2, row2, D0321_MASK);
row4 = VecPermute(row4, row4, D2103_MASK); \
row3 = VecPermute(row3, row3, D1032_MASK); \
row2 = VecPermute(row2, row2, D0321_MASK);
#define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
row4 = vec_perm(row4, row4, D0321_MASK); \
row3 = vec_perm(row3, row3, D1032_MASK); \
row2 = vec_perm(row2, row2, D2103_MASK);
row4 = VecPermute(row4, row4, D0321_MASK); \
row3 = VecPermute(row3, row3, D1032_MASK); \
row2 = VecPermute(row2, row2, D2103_MASK);
#define BLAKE2S_ROUND(r) \
BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
@ -970,15 +970,15 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
uint32x4_p buf1, buf2, buf3, buf4;
uint32x4_p ff0, ff1;
const uint32x4_p m0 = VectorLoad32LE(input + 0);
const uint32x4_p m4 = VectorLoad32LE(input + 16);
const uint32x4_p m8 = VectorLoad32LE(input + 32);
const uint32x4_p m12 = VectorLoad32LE(input + 48);
const uint32x4_p m0 = VecLoad32LE(input + 0);
const uint32x4_p m4 = VecLoad32LE(input + 16);
const uint32x4_p m8 = VecLoad32LE(input + 32);
const uint32x4_p m12 = VecLoad32LE(input + 48);
row1 = ff0 = VectorLoad32LE(&state.h[0]);
row2 = ff1 = VectorLoad32LE(&state.h[4]);
row3 = VectorLoad32(&BLAKE2S_IV[0]);
row4 = vec_xor(VectorLoad32(&BLAKE2S_IV[4]), VectorLoad32(&state.tf[0]));
row1 = ff0 = VecLoad32LE(&state.h[0]);
row2 = ff1 = VecLoad32LE(&state.h[4]);
row3 = VecLoad32(&BLAKE2S_IV[0]);
row4 = VecXor(VecLoad32(&BLAKE2S_IV[4]), VecLoad32(&state.tf[0]));
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
@ -991,8 +991,8 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
VectorStore32LE(&state.h[0], vec_xor(ff0, vec_xor(row1, row3)));
VectorStore32LE(&state.h[4], vec_xor(ff1, vec_xor(row2, row4)));
VecStore32LE(&state.h[0], VecXor(ff0, VecXor(row1, row3)));
VecStore32LE(&state.h[4], VecXor(ff1, VecXor(row2, row4)));
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE

View File

@ -206,7 +206,7 @@ inline __m128i RotateLeft<16>(const __m128i val)
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
// ChaCha_OperateKeystream_POWER7 is optimized for POWER7. However, Altivec
// is supported by using vec_ld and vec_st, and using a composite vec_add
// is supported by using vec_ld and vec_st, and using a composite VecAdd
// that supports 64-bit element adds. vec_ld and vec_st add significant
// overhead when memory is not aligned. Despite the drawbacks Altivec
// is profitable. The numbers for ChaCha8 are:
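The composite 64-bit add mentioned above (VecAdd64 in the code below) has to carry across the two 32-bit words that hold the block counter: in this kernel's layout the counter occupies the first two words of the fourth state row. A scalar model of that behaviour (illustrative only, not the library's VecAdd64):

#include <cassert>
#include <cstdint>

static void Add64(uint32_t row[4], uint64_t increment)
{
    uint64_t ctr = (uint64_t(row[1]) << 32) | row[0];
    ctr += increment;                        // a carry out of word 0 lands in word 1
    row[0] = static_cast<uint32_t>(ctr);
    row[1] = static_cast<uint32_t>(ctr >> 32);
}

int main()
{
    uint32_t row[4] = {0xFFFFFFFFu, 0, 0x12345678u, 0x9ABCDEF0u};
    Add64(row, 1);                           // like adding CTRS[0] = {1,0,0,0}
    assert(row[0] == 0 && row[1] == 1);      // low word wrapped, high word carried
    return 0;
}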
@ -216,33 +216,34 @@ inline __m128i RotateLeft<16>(const __m128i val)
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::VectorLoad;
using CryptoPP::VectorStore;
using CryptoPP::VecLoad;
using CryptoPP::VecStore;
using CryptoPP::VecPermute;
// Permutes bytes in packed 32-bit words to little endian.
// State is already in proper endian order. Input and
// output must be permuted during load and save.
inline uint32x4_p VectorLoad32LE(const uint8_t src[16])
inline uint32x4_p VecLoad32LE(const uint8_t src[16])
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
const uint32x4_p val = VectorLoad(src);
return vec_perm(val, val, mask);
const uint32x4_p val = VecLoad(src);
return VecPermute(val, val, mask);
#else
return VectorLoad(src);
return VecLoad(src);
#endif
}
// Permutes bytes in packed 32-bit words to little endian.
// State is already in proper endian order. Input and
// output must be permuted during load and save.
inline void VectorStore32LE(uint8_t dest[16], const uint32x4_p& val)
inline void VecStore32LE(uint8_t dest[16], const uint32x4_p& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
VectorStore(vec_perm(val, val, mask), dest);
VecStore(VecPermute(val, val, mask), dest);
#else
return VectorStore(val, dest);
return VecStore(val, dest);
#endif
}
@ -262,21 +263,21 @@ template <>
inline uint32x4_p Shuffle<1>(const uint32x4_p& val)
{
const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
return vec_perm(val, val, mask);
return VecPermute(val, val, mask);
}
template <>
inline uint32x4_p Shuffle<2>(const uint32x4_p& val)
{
const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
return vec_perm(val, val, mask);
return VecPermute(val, val, mask);
}
template <>
inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
{
const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
return vec_perm(val, val, mask);
return VecPermute(val, val, mask);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
@ -825,10 +826,10 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
const uint32x4_p state0 = VectorLoad(state + 0*4);
const uint32x4_p state1 = VectorLoad(state + 1*4);
const uint32x4_p state2 = VectorLoad(state + 2*4);
const uint32x4_p state3 = VectorLoad(state + 3*4);
const uint32x4_p state0 = VecLoad(state + 0*4);
const uint32x4_p state1 = VecLoad(state + 1*4);
const uint32x4_p state2 = VecLoad(state + 2*4);
const uint32x4_p state3 = VecLoad(state + 3*4);
const uint32x4_p CTRS[3] = {
{1,0,0,0}, {2,0,0,0}, {3,0,0,0}
@ -842,79 +843,79 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
uint32x4_p r1_0 = state0;
uint32x4_p r1_1 = state1;
uint32x4_p r1_2 = state2;
uint32x4_p r1_3 = VectorAdd64(r0_3, CTRS[0]);
uint32x4_p r1_3 = VecAdd64(r0_3, CTRS[0]);
uint32x4_p r2_0 = state0;
uint32x4_p r2_1 = state1;
uint32x4_p r2_2 = state2;
uint32x4_p r2_3 = VectorAdd64(r0_3, CTRS[1]);
uint32x4_p r2_3 = VecAdd64(r0_3, CTRS[1]);
uint32x4_p r3_0 = state0;
uint32x4_p r3_1 = state1;
uint32x4_p r3_2 = state2;
uint32x4_p r3_3 = VectorAdd64(r0_3, CTRS[2]);
uint32x4_p r3_3 = VecAdd64(r0_3, CTRS[2]);
for (int i = static_cast<int>(rounds); i > 0; i -= 2)
{
r0_0 = VectorAdd(r0_0, r0_1);
r1_0 = VectorAdd(r1_0, r1_1);
r2_0 = VectorAdd(r2_0, r2_1);
r3_0 = VectorAdd(r3_0, r3_1);
r0_0 = VecAdd(r0_0, r0_1);
r1_0 = VecAdd(r1_0, r1_1);
r2_0 = VecAdd(r2_0, r2_1);
r3_0 = VecAdd(r3_0, r3_1);
r0_3 = VectorXor(r0_3, r0_0);
r1_3 = VectorXor(r1_3, r1_0);
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = VecXor(r0_3, r0_0);
r1_3 = VecXor(r1_3, r1_0);
r2_3 = VecXor(r2_3, r2_0);
r3_3 = VecXor(r3_3, r3_0);
r0_3 = VectorRotateLeft<16>(r0_3);
r1_3 = VectorRotateLeft<16>(r1_3);
r2_3 = VectorRotateLeft<16>(r2_3);
r3_3 = VectorRotateLeft<16>(r3_3);
r0_3 = VecRotateLeft<16>(r0_3);
r1_3 = VecRotateLeft<16>(r1_3);
r2_3 = VecRotateLeft<16>(r2_3);
r3_3 = VecRotateLeft<16>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
r2_2 = VectorAdd(r2_2, r2_3);
r3_2 = VectorAdd(r3_2, r3_3);
r0_2 = VecAdd(r0_2, r0_3);
r1_2 = VecAdd(r1_2, r1_3);
r2_2 = VecAdd(r2_2, r2_3);
r3_2 = VecAdd(r3_2, r3_3);
r0_1 = VectorXor(r0_1, r0_2);
r1_1 = VectorXor(r1_1, r1_2);
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = VecXor(r0_1, r0_2);
r1_1 = VecXor(r1_1, r1_2);
r2_1 = VecXor(r2_1, r2_2);
r3_1 = VecXor(r3_1, r3_2);
r0_1 = VectorRotateLeft<12>(r0_1);
r1_1 = VectorRotateLeft<12>(r1_1);
r2_1 = VectorRotateLeft<12>(r2_1);
r3_1 = VectorRotateLeft<12>(r3_1);
r0_1 = VecRotateLeft<12>(r0_1);
r1_1 = VecRotateLeft<12>(r1_1);
r2_1 = VecRotateLeft<12>(r2_1);
r3_1 = VecRotateLeft<12>(r3_1);
r0_0 = VectorAdd(r0_0, r0_1);
r1_0 = VectorAdd(r1_0, r1_1);
r2_0 = VectorAdd(r2_0, r2_1);
r3_0 = VectorAdd(r3_0, r3_1);
r0_0 = VecAdd(r0_0, r0_1);
r1_0 = VecAdd(r1_0, r1_1);
r2_0 = VecAdd(r2_0, r2_1);
r3_0 = VecAdd(r3_0, r3_1);
r0_3 = VectorXor(r0_3, r0_0);
r1_3 = VectorXor(r1_3, r1_0);
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = VecXor(r0_3, r0_0);
r1_3 = VecXor(r1_3, r1_0);
r2_3 = VecXor(r2_3, r2_0);
r3_3 = VecXor(r3_3, r3_0);
r0_3 = VectorRotateLeft<8>(r0_3);
r1_3 = VectorRotateLeft<8>(r1_3);
r2_3 = VectorRotateLeft<8>(r2_3);
r3_3 = VectorRotateLeft<8>(r3_3);
r0_3 = VecRotateLeft<8>(r0_3);
r1_3 = VecRotateLeft<8>(r1_3);
r2_3 = VecRotateLeft<8>(r2_3);
r3_3 = VecRotateLeft<8>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
r2_2 = VectorAdd(r2_2, r2_3);
r3_2 = VectorAdd(r3_2, r3_3);
r0_2 = VecAdd(r0_2, r0_3);
r1_2 = VecAdd(r1_2, r1_3);
r2_2 = VecAdd(r2_2, r2_3);
r3_2 = VecAdd(r3_2, r3_3);
r0_1 = VectorXor(r0_1, r0_2);
r1_1 = VectorXor(r1_1, r1_2);
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = VecXor(r0_1, r0_2);
r1_1 = VecXor(r1_1, r1_2);
r2_1 = VecXor(r2_1, r2_2);
r3_1 = VecXor(r3_1, r3_2);
r0_1 = VectorRotateLeft<7>(r0_1);
r1_1 = VectorRotateLeft<7>(r1_1);
r2_1 = VectorRotateLeft<7>(r2_1);
r3_1 = VectorRotateLeft<7>(r3_1);
r0_1 = VecRotateLeft<7>(r0_1);
r1_1 = VecRotateLeft<7>(r1_1);
r2_1 = VecRotateLeft<7>(r2_1);
r3_1 = VecRotateLeft<7>(r3_1);
r0_1 = Shuffle<1>(r0_1);
r0_2 = Shuffle<2>(r0_2);
@ -932,65 +933,65 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
r3_2 = Shuffle<2>(r3_2);
r3_3 = Shuffle<3>(r3_3);
r0_0 = VectorAdd(r0_0, r0_1);
r1_0 = VectorAdd(r1_0, r1_1);
r2_0 = VectorAdd(r2_0, r2_1);
r3_0 = VectorAdd(r3_0, r3_1);
r0_0 = VecAdd(r0_0, r0_1);
r1_0 = VecAdd(r1_0, r1_1);
r2_0 = VecAdd(r2_0, r2_1);
r3_0 = VecAdd(r3_0, r3_1);
r0_3 = VectorXor(r0_3, r0_0);
r1_3 = VectorXor(r1_3, r1_0);
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = VecXor(r0_3, r0_0);
r1_3 = VecXor(r1_3, r1_0);
r2_3 = VecXor(r2_3, r2_0);
r3_3 = VecXor(r3_3, r3_0);
r0_3 = VectorRotateLeft<16>(r0_3);
r1_3 = VectorRotateLeft<16>(r1_3);
r2_3 = VectorRotateLeft<16>(r2_3);
r3_3 = VectorRotateLeft<16>(r3_3);
r0_3 = VecRotateLeft<16>(r0_3);
r1_3 = VecRotateLeft<16>(r1_3);
r2_3 = VecRotateLeft<16>(r2_3);
r3_3 = VecRotateLeft<16>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
r2_2 = VectorAdd(r2_2, r2_3);
r3_2 = VectorAdd(r3_2, r3_3);
r0_2 = VecAdd(r0_2, r0_3);
r1_2 = VecAdd(r1_2, r1_3);
r2_2 = VecAdd(r2_2, r2_3);
r3_2 = VecAdd(r3_2, r3_3);
r0_1 = VectorXor(r0_1, r0_2);
r1_1 = VectorXor(r1_1, r1_2);
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = VecXor(r0_1, r0_2);
r1_1 = VecXor(r1_1, r1_2);
r2_1 = VecXor(r2_1, r2_2);
r3_1 = VecXor(r3_1, r3_2);
r0_1 = VectorRotateLeft<12>(r0_1);
r1_1 = VectorRotateLeft<12>(r1_1);
r2_1 = VectorRotateLeft<12>(r2_1);
r3_1 = VectorRotateLeft<12>(r3_1);
r0_1 = VecRotateLeft<12>(r0_1);
r1_1 = VecRotateLeft<12>(r1_1);
r2_1 = VecRotateLeft<12>(r2_1);
r3_1 = VecRotateLeft<12>(r3_1);
r0_0 = VectorAdd(r0_0, r0_1);
r1_0 = VectorAdd(r1_0, r1_1);
r2_0 = VectorAdd(r2_0, r2_1);
r3_0 = VectorAdd(r3_0, r3_1);
r0_0 = VecAdd(r0_0, r0_1);
r1_0 = VecAdd(r1_0, r1_1);
r2_0 = VecAdd(r2_0, r2_1);
r3_0 = VecAdd(r3_0, r3_1);
r0_3 = VectorXor(r0_3, r0_0);
r1_3 = VectorXor(r1_3, r1_0);
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = VecXor(r0_3, r0_0);
r1_3 = VecXor(r1_3, r1_0);
r2_3 = VecXor(r2_3, r2_0);
r3_3 = VecXor(r3_3, r3_0);
r0_3 = VectorRotateLeft<8>(r0_3);
r1_3 = VectorRotateLeft<8>(r1_3);
r2_3 = VectorRotateLeft<8>(r2_3);
r3_3 = VectorRotateLeft<8>(r3_3);
r0_3 = VecRotateLeft<8>(r0_3);
r1_3 = VecRotateLeft<8>(r1_3);
r2_3 = VecRotateLeft<8>(r2_3);
r3_3 = VecRotateLeft<8>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
r2_2 = VectorAdd(r2_2, r2_3);
r3_2 = VectorAdd(r3_2, r3_3);
r0_2 = VecAdd(r0_2, r0_3);
r1_2 = VecAdd(r1_2, r1_3);
r2_2 = VecAdd(r2_2, r2_3);
r3_2 = VecAdd(r3_2, r3_3);
r0_1 = VectorXor(r0_1, r0_2);
r1_1 = VectorXor(r1_1, r1_2);
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = VecXor(r0_1, r0_2);
r1_1 = VecXor(r1_1, r1_2);
r2_1 = VecXor(r2_1, r2_2);
r3_1 = VecXor(r3_1, r3_2);
r0_1 = VectorRotateLeft<7>(r0_1);
r1_1 = VectorRotateLeft<7>(r1_1);
r2_1 = VectorRotateLeft<7>(r2_1);
r3_1 = VectorRotateLeft<7>(r3_1);
r0_1 = VecRotateLeft<7>(r0_1);
r1_1 = VecRotateLeft<7>(r1_1);
r2_1 = VecRotateLeft<7>(r2_1);
r3_1 = VecRotateLeft<7>(r3_1);
r0_1 = Shuffle<3>(r0_1);
r0_2 = Shuffle<2>(r0_2);
@ -1009,80 +1010,80 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
r3_3 = Shuffle<1>(r3_3);
}
r0_0 = VectorAdd(r0_0, state0);
r0_1 = VectorAdd(r0_1, state1);
r0_2 = VectorAdd(r0_2, state2);
r0_3 = VectorAdd(r0_3, state3);
r0_0 = VecAdd(r0_0, state0);
r0_1 = VecAdd(r0_1, state1);
r0_2 = VecAdd(r0_2, state2);
r0_3 = VecAdd(r0_3, state3);
r1_0 = VectorAdd(r1_0, state0);
r1_1 = VectorAdd(r1_1, state1);
r1_2 = VectorAdd(r1_2, state2);
r1_3 = VectorAdd(r1_3, state3);
r1_3 = VectorAdd64(r1_3, CTRS[0]);
r1_0 = VecAdd(r1_0, state0);
r1_1 = VecAdd(r1_1, state1);
r1_2 = VecAdd(r1_2, state2);
r1_3 = VecAdd(r1_3, state3);
r1_3 = VecAdd64(r1_3, CTRS[0]);
r2_0 = VectorAdd(r2_0, state0);
r2_1 = VectorAdd(r2_1, state1);
r2_2 = VectorAdd(r2_2, state2);
r2_3 = VectorAdd(r2_3, state3);
r2_3 = VectorAdd64(r2_3, CTRS[1]);
r2_0 = VecAdd(r2_0, state0);
r2_1 = VecAdd(r2_1, state1);
r2_2 = VecAdd(r2_2, state2);
r2_3 = VecAdd(r2_3, state3);
r2_3 = VecAdd64(r2_3, CTRS[1]);
r3_0 = VectorAdd(r3_0, state0);
r3_1 = VectorAdd(r3_1, state1);
r3_2 = VectorAdd(r3_2, state2);
r3_3 = VectorAdd(r3_3, state3);
r3_3 = VectorAdd64(r3_3, CTRS[2]);
r3_0 = VecAdd(r3_0, state0);
r3_1 = VecAdd(r3_1, state1);
r3_2 = VecAdd(r3_2, state2);
r3_3 = VecAdd(r3_3, state3);
r3_3 = VecAdd64(r3_3, CTRS[2]);
if (input)
{
r0_0 = VectorXor(VectorLoad32LE(input + 0*16), r0_0);
r0_1 = VectorXor(VectorLoad32LE(input + 1*16), r0_1);
r0_2 = VectorXor(VectorLoad32LE(input + 2*16), r0_2);
r0_3 = VectorXor(VectorLoad32LE(input + 3*16), r0_3);
r0_0 = VecXor(VecLoad32LE(input + 0*16), r0_0);
r0_1 = VecXor(VecLoad32LE(input + 1*16), r0_1);
r0_2 = VecXor(VecLoad32LE(input + 2*16), r0_2);
r0_3 = VecXor(VecLoad32LE(input + 3*16), r0_3);
}
VectorStore32LE(output + 0*16, r0_0);
VectorStore32LE(output + 1*16, r0_1);
VectorStore32LE(output + 2*16, r0_2);
VectorStore32LE(output + 3*16, r0_3);
VecStore32LE(output + 0*16, r0_0);
VecStore32LE(output + 1*16, r0_1);
VecStore32LE(output + 2*16, r0_2);
VecStore32LE(output + 3*16, r0_3);
if (input)
{
r1_0 = VectorXor(VectorLoad32LE(input + 4*16), r1_0);
r1_1 = VectorXor(VectorLoad32LE(input + 5*16), r1_1);
r1_2 = VectorXor(VectorLoad32LE(input + 6*16), r1_2);
r1_3 = VectorXor(VectorLoad32LE(input + 7*16), r1_3);
r1_0 = VecXor(VecLoad32LE(input + 4*16), r1_0);
r1_1 = VecXor(VecLoad32LE(input + 5*16), r1_1);
r1_2 = VecXor(VecLoad32LE(input + 6*16), r1_2);
r1_3 = VecXor(VecLoad32LE(input + 7*16), r1_3);
}
VectorStore32LE(output + 4*16, r1_0);
VectorStore32LE(output + 5*16, r1_1);
VectorStore32LE(output + 6*16, r1_2);
VectorStore32LE(output + 7*16, r1_3);
VecStore32LE(output + 4*16, r1_0);
VecStore32LE(output + 5*16, r1_1);
VecStore32LE(output + 6*16, r1_2);
VecStore32LE(output + 7*16, r1_3);
if (input)
{
r2_0 = VectorXor(VectorLoad32LE(input + 8*16), r2_0);
r2_1 = VectorXor(VectorLoad32LE(input + 9*16), r2_1);
r2_2 = VectorXor(VectorLoad32LE(input + 10*16), r2_2);
r2_3 = VectorXor(VectorLoad32LE(input + 11*16), r2_3);
r2_0 = VecXor(VecLoad32LE(input + 8*16), r2_0);
r2_1 = VecXor(VecLoad32LE(input + 9*16), r2_1);
r2_2 = VecXor(VecLoad32LE(input + 10*16), r2_2);
r2_3 = VecXor(VecLoad32LE(input + 11*16), r2_3);
}
VectorStore32LE(output + 8*16, r2_0);
VectorStore32LE(output + 9*16, r2_1);
VectorStore32LE(output + 10*16, r2_2);
VectorStore32LE(output + 11*16, r2_3);
VecStore32LE(output + 8*16, r2_0);
VecStore32LE(output + 9*16, r2_1);
VecStore32LE(output + 10*16, r2_2);
VecStore32LE(output + 11*16, r2_3);
if (input)
{
r3_0 = VectorXor(VectorLoad32LE(input + 12*16), r3_0);
r3_1 = VectorXor(VectorLoad32LE(input + 13*16), r3_1);
r3_2 = VectorXor(VectorLoad32LE(input + 14*16), r3_2);
r3_3 = VectorXor(VectorLoad32LE(input + 15*16), r3_3);
r3_0 = VecXor(VecLoad32LE(input + 12*16), r3_0);
r3_1 = VecXor(VecLoad32LE(input + 13*16), r3_1);
r3_2 = VecXor(VecLoad32LE(input + 14*16), r3_2);
r3_3 = VecXor(VecLoad32LE(input + 15*16), r3_3);
}
VectorStore32LE(output + 12*16, r3_0);
VectorStore32LE(output + 13*16, r3_1);
VectorStore32LE(output + 14*16, r3_2);
VectorStore32LE(output + 15*16, r3_3);
VecStore32LE(output + 12*16, r3_0);
VecStore32LE(output + 13*16, r3_1);
VecStore32LE(output + 14*16, r3_2);
VecStore32LE(output + 15*16, r3_3);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE

View File

@ -171,16 +171,16 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
#if CRYPTOPP_POWER8_VMULL_AVAILABLE
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;
using CryptoPP::VectorGetLow;
using CryptoPP::VectorGetHigh;
using CryptoPP::VectorRotateLeftOctet;
using CryptoPP::VecGetLow;
using CryptoPP::VecGetHigh;
using CryptoPP::VecRotateLeftOctet;
// POWER8 GCM mode is confusing. The algorithm is reflected so
// nearly everything we do is reversed for a little-endian system,
// including on big-endian machines. VMULL2LE swaps dwords for a
// little endian machine; VMULL_00LE, VMULL_01LE, VMULL_10LE and
// VMULL_11LE are backwards and (1) read low words with
// VectorGetHigh, (2) read high words with VectorGetLow, and
// VecGetHigh, (2) read high words with VecGetLow, and
// (3) yields a product that is endian swapped. The steps ensures
// GCM parameters are presented in the correct order for the
// algorithm on both big and little-endian systems, but it is
@ -192,7 +192,7 @@ using CryptoPP::VectorRotateLeftOctet;
inline uint64x2_p VMULL2LE(const uint64x2_p& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
return VectorRotateLeftOctet<8>(val);
return VecRotateLeftOctet<8>(val);
#else
return val;
#endif
@ -202,48 +202,48 @@ inline uint64x2_p VMULL2LE(const uint64x2_p& val)
inline uint64x2_p VMULL_00LE(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return VMULL2LE(__vpmsumd (VectorGetHigh(a), VectorGetHigh(b)));
return VMULL2LE(__vpmsumd (VecGetHigh(a), VecGetHigh(b)));
#else
return VMULL2LE(__builtin_crypto_vpmsumd (VectorGetHigh(a), VectorGetHigh(b)));
return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
#endif
}
// _mm_clmulepi64_si128(a, b, 0x01)
inline uint64x2_p VMULL_01LE(const uint64x2_p& a, const uint64x2_p& b)
{
// Small speedup. VectorGetHigh(b) ensures the high dword of 'b' is 0.
// Small speedup. VecGetHigh(b) ensures the high dword of 'b' is 0.
// The 0 used in the vmull yields 0 for the high product, so the high
// dword of 'a' is "don't care".
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return VMULL2LE(__vpmsumd (a, VectorGetHigh(b)));
return VMULL2LE(__vpmsumd (a, VecGetHigh(b)));
#else
return VMULL2LE(__builtin_crypto_vpmsumd (a, VectorGetHigh(b)));
return VMULL2LE(__builtin_crypto_vpmsumd (a, VecGetHigh(b)));
#endif
}
// _mm_clmulepi64_si128(a, b, 0x10)
inline uint64x2_p VMULL_10LE(const uint64x2_p& a, const uint64x2_p& b)
{
// Small speedup. VectorGetHigh(a) ensures the high dword of 'a' is 0.
// Small speedup. VecGetHigh(a) ensures the high dword of 'a' is 0.
// The 0 used in the vmull yields 0 for the high product, so the high
// dword of 'b' is "don't care".
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return VMULL2LE(__vpmsumd (VectorGetHigh(a), b));
return VMULL2LE(__vpmsumd (VecGetHigh(a), b));
#else
return VMULL2LE(__builtin_crypto_vpmsumd (VectorGetHigh(a), b));
return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), b));
#endif
}
// _mm_clmulepi64_si128(a, b, 0x11)
inline uint64x2_p VMULL_11LE(const uint64x2_p& a, const uint64x2_p& b)
{
// Small speedup. VectorGetLow(a) ensures the high dword of 'a' is 0.
// Small speedup. VecGetLow(a) ensures the high dword of 'a' is 0.
// The 0 used in the vmull yields 0 for the high product, so the high
// dword of 'b' is "don't care".
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return VMULL2LE(__vpmsumd (VectorGetLow(a), b));
return VMULL2LE(__vpmsumd (VecGetLow(a), b));
#else
return VMULL2LE(__builtin_crypto_vpmsumd (VectorGetLow(a), b));
return VMULL2LE(__builtin_crypto_vpmsumd (VecGetLow(a), b));
#endif
}
#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
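The VMULL_xxLE helpers above wrap __vpmsumd (or __builtin_crypto_vpmsumd), which XORs together the carry-less products of both 64-bit lanes; that is why the helpers zero one lane with VecGetHigh/VecGetLow so only the wanted product survives. As background, here is a plain scalar model of a single 64x64 -> 128 bit carry-less multiply (illustrative only, not the library's code):

#include <cstdint>

static void CarrylessMul64(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo)
{
    hi = lo = 0;
    for (unsigned int i = 0; i < 64; ++i)
    {
        if ((b >> i) & 1)                    // XOR in 'a' shifted by each set bit of 'b'
        {
            lo ^= a << i;
            if (i > 0)
                hi ^= a >> (64 - i);         // bits that spill into the high half
        }
    }
}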
@ -373,7 +373,7 @@ bool CPU_ProbePMULL()
const uint64x2_p r3 = VMULL_10LE((uint64x2_p)(a), (uint64x2_p)(b));
const uint64x2_p r4 = VMULL_11LE((uint64x2_p)(a), (uint64x2_p)(b));
result = VectorNotEqual(r1, r2) && VectorNotEqual(r3, r4);
result = VecNotEqual(r1, r2) && VecNotEqual(r3, r4);
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
@ -743,7 +743,7 @@ void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer)
#if CRYPTOPP_ALTIVEC_AVAILABLE
void GCM_Xor16_ALTIVEC(byte *a, const byte *b, const byte *c)
{
VectorStore(VectorXor(VectorLoad(b), VectorLoad(c)), a);
VecStore(VecXor(VecLoad(b), VecLoad(c)), a);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
@ -753,22 +753,22 @@ uint64x2_p GCM_Reduce_VMULL(uint64x2_p c0, uint64x2_p c1, uint64x2_p c2, uint64x
{
const uint64x2_p m1 = {1,1}, m63 = {63,63};
c1 = VectorXor(c1, VectorShiftRightOctet<8>(c0));
c1 = VectorXor(c1, VMULL_10LE(c0, r));
c0 = VectorXor(c1, VectorShiftLeftOctet<8>(c0));
c1 = VecXor(c1, VecShiftRightOctet<8>(c0));
c1 = VecXor(c1, VMULL_10LE(c0, r));
c0 = VecXor(c1, VecShiftLeftOctet<8>(c0));
c0 = VMULL_00LE(vec_sl(c0, m1), r);
c2 = VectorXor(c2, c0);
c2 = VectorXor(c2, VectorShiftLeftOctet<8>(c1));
c2 = VecXor(c2, c0);
c2 = VecXor(c2, VecShiftLeftOctet<8>(c1));
c1 = vec_sr(vec_mergeh(c1, c2), m63);
c2 = vec_sl(c2, m1);
return VectorXor(c2, c1);
return VecXor(c2, c1);
}
inline uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
{
const uint64x2_p c0 = VMULL_00LE(x, h);
const uint64x2_p c1 = VectorXor(VMULL_01LE(x, h), VMULL_10LE(x, h));
const uint64x2_p c1 = VecXor(VMULL_01LE(x, h), VMULL_10LE(x, h));
const uint64x2_p c2 = VMULL_11LE(x, h);
return GCM_Reduce_VMULL(c0, c1, c2, r);
@ -777,13 +777,13 @@ inline uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
inline uint64x2_p LoadHashKey(const byte *hashKey)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey);
const uint64x2_p key = (uint64x2_p)VecLoad(hashKey);
const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
return vec_perm(key, key, mask);
return VecPermute(key, key, mask);
#else
const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey);
const uint64x2_p key = (uint64x2_p)VecLoad(hashKey);
const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
return vec_perm(key, key, mask);
return VecPermute(key, key, mask);
#endif
}
@ -798,21 +798,21 @@ void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned
for (i=0; i<tableSize-32; i+=32)
{
const uint64x2_p h1 = GCM_Multiply_VMULL(h, h0, r);
VectorStore(h, (byte*)temp);
VecStore(h, (byte*)temp);
std::memcpy(mulTable+i, temp+0, 8);
VectorStore(h1, mulTable+i+16);
VectorStore(h, mulTable+i+8);
VectorStore(h1, (byte*)temp);
VecStore(h1, mulTable+i+16);
VecStore(h, mulTable+i+8);
VecStore(h1, (byte*)temp);
std::memcpy(mulTable+i+8, temp+0, 8);
h = GCM_Multiply_VMULL(h1, h0, r);
}
const uint64x2_p h1 = GCM_Multiply_VMULL(h, h0, r);
VectorStore(h, (byte*)temp);
VecStore(h, (byte*)temp);
std::memcpy(mulTable+i, temp+0, 8);
VectorStore(h1, mulTable+i+16);
VectorStore(h, mulTable+i+8);
VectorStore(h1, (byte*)temp);
VecStore(h1, mulTable+i+16);
VecStore(h, mulTable+i+8);
VecStore(h1, (byte*)temp);
std::memcpy(mulTable+i+8, temp+0, 8);
}
@ -820,33 +820,33 @@ void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned
template <class T>
inline T SwapWords(const T& data)
{
return (T)VectorRotateLeftOctet<8>(data);
return (T)VecRotateLeftOctet<8>(data);
}
inline uint64x2_p LoadBuffer1(const byte *dataBuffer)
{
#if (CRYPTOPP_BIG_ENDIAN)
return (uint64x2_p)VectorLoad(dataBuffer);
return (uint64x2_p)VecLoad(dataBuffer);
#else
const uint64x2_p data = (uint64x2_p)VectorLoad(dataBuffer);
const uint64x2_p data = (uint64x2_p)VecLoad(dataBuffer);
const uint8x16_p mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
return vec_perm(data, data, mask);
return VecPermute(data, data, mask);
#endif
}
inline uint64x2_p LoadBuffer2(const byte *dataBuffer)
{
#if (CRYPTOPP_BIG_ENDIAN)
return (uint64x2_p)SwapWords(VectorLoadBE(dataBuffer));
return (uint64x2_p)SwapWords(VecLoadBE(dataBuffer));
#else
return (uint64x2_p)VectorLoadBE(dataBuffer);
return (uint64x2_p)VecLoadBE(dataBuffer);
#endif
}
size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
{
const uint64x2_p r = {0xe100000000000000ull, 0xc200000000000000ull};
uint64x2_p x = (uint64x2_p)VectorLoad(hbuffer);
uint64x2_p x = (uint64x2_p)VecLoad(hbuffer);
while (len >= 16)
{
@ -856,59 +856,59 @@ size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mt
while (true)
{
const uint64x2_p h0 = (uint64x2_p)VectorLoad(mtable+(i+0)*16);
const uint64x2_p h1 = (uint64x2_p)VectorLoad(mtable+(i+1)*16);
const uint64x2_p h2 = (uint64x2_p)VectorXor(h0, h1);
const uint64x2_p h0 = (uint64x2_p)VecLoad(mtable+(i+0)*16);
const uint64x2_p h1 = (uint64x2_p)VecLoad(mtable+(i+1)*16);
const uint64x2_p h2 = (uint64x2_p)VecXor(h0, h1);
if (++i == s)
{
d1 = LoadBuffer2(data);
d1 = VectorXor(d1, x);
c0 = VectorXor(c0, VMULL_00LE(d1, h0));
c2 = VectorXor(c2, VMULL_01LE(d1, h1));
d1 = VectorXor(d1, SwapWords(d1));
c1 = VectorXor(c1, VMULL_00LE(d1, h2));
d1 = VecXor(d1, x);
c0 = VecXor(c0, VMULL_00LE(d1, h0));
c2 = VecXor(c2, VMULL_01LE(d1, h1));
d1 = VecXor(d1, SwapWords(d1));
c1 = VecXor(c1, VMULL_00LE(d1, h2));
break;
}
d1 = LoadBuffer1(data+(s-i)*16-8);
c0 = VectorXor(c0, VMULL_01LE(d2, h0));
c2 = VectorXor(c2, VMULL_01LE(d1, h1));
d2 = VectorXor(d2, d1);
c1 = VectorXor(c1, VMULL_01LE(d2, h2));
c0 = VecXor(c0, VMULL_01LE(d2, h0));
c2 = VecXor(c2, VMULL_01LE(d1, h1));
d2 = VecXor(d2, d1);
c1 = VecXor(c1, VMULL_01LE(d2, h2));
if (++i == s)
{
d1 = LoadBuffer2(data);
d1 = VectorXor(d1, x);
c0 = VectorXor(c0, VMULL_10LE(d1, h0));
c2 = VectorXor(c2, VMULL_11LE(d1, h1));
d1 = VectorXor(d1, SwapWords(d1));
c1 = VectorXor(c1, VMULL_10LE(d1, h2));
d1 = VecXor(d1, x);
c0 = VecXor(c0, VMULL_10LE(d1, h0));
c2 = VecXor(c2, VMULL_11LE(d1, h1));
d1 = VecXor(d1, SwapWords(d1));
c1 = VecXor(c1, VMULL_10LE(d1, h2));
break;
}
d2 = LoadBuffer2(data+(s-i)*16-8);
c0 = VectorXor(c0, VMULL_10LE(d1, h0));
c2 = VectorXor(c2, VMULL_10LE(d2, h1));
d1 = VectorXor(d1, d2);
c1 = VectorXor(c1, VMULL_10LE(d1, h2));
c0 = VecXor(c0, VMULL_10LE(d1, h0));
c2 = VecXor(c2, VMULL_10LE(d2, h1));
d1 = VecXor(d1, d2);
c1 = VecXor(c1, VMULL_10LE(d1, h2));
}
data += s*16;
len -= s*16;
c1 = VectorXor(VectorXor(c1, c0), c2);
c1 = VecXor(VecXor(c1, c0), c2);
x = GCM_Reduce_VMULL(c0, c1, c2, r);
}
VectorStore(x, hbuffer);
VecStore(x, hbuffer);
return len;
}
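// A minimal caller sketch (hypothetical name): whole 16-byte blocks are folded into
// the GHASH accumulator held in 'hbuffer', and the return value is the unprocessed
// tail, always less than 16 bytes, which the caller carries over to the next call.
inline size_t GCM_AuthenticateBlocks_Sketch(const byte* data, size_t len,
                                            const byte* mtable, byte* hbuffer)
{
    return GCM_AuthenticateBlocks_VMULL(data, len, mtable, hbuffer);
}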
void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer)
{
const uint64x2_p mask = {0x08090a0b0c0d0e0full, 0x0001020304050607ull};
VectorStore(VectorPermute(VectorLoad(hashBuffer), mask), hashBuffer);
VecStore(VecPermute(VecLoad(hashBuffer), mask), hashBuffer);
}
#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
View File
@ -439,17 +439,17 @@ using CryptoPP::uint64x2_p;
inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b)
{
return vec_xor(a, b);
return VecXor(a, b);
}
inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b)
{
return vec_add(a, b);
return VecAdd(a, b);
}
inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b)
{
return vec_sub(a, b);
return VecSub(a, b);
}
template <unsigned int R>
@ -479,7 +479,7 @@ inline uint32x4_p UnpackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uin
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
CRYPTOPP_ASSERT(0);
return vec_xor(a, a);
return VecXor(a, a);
}
template <>
@ -519,7 +519,7 @@ inline uint32x4_p UnpackSIMD(const uint32x4_p& v)
{
// Should not be instantiated
CRYPTOPP_ASSERT(0);
return vec_xor(v, v);
return VecXor(v, v);
}
template <>
@ -527,7 +527,7 @@ inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v)
{
// Splat to all lanes
const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0};
return (uint32x4_p)vec_perm(v, v, m);
return (uint32x4_p)VecPermute(v, v, m);
}
template <>
@ -535,7 +535,7 @@ inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v)
{
// Splat to all lanes
const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4};
return (uint32x4_p)vec_perm(v, v, m);
return (uint32x4_p)VecPermute(v, v, m);
}
template <>
@ -543,7 +543,7 @@ inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v)
{
// Splat to all lanes
const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8};
return (uint32x4_p)vec_perm(v, v, m);
return (uint32x4_p)VecPermute(v, v, m);
}
template <>
@ -551,7 +551,7 @@ inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v)
{
// Splat to all lanes
const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12};
return (uint32x4_p)vec_perm(v, v, m);
return (uint32x4_p)VecPermute(v, v, m);
}
template <unsigned int IDX>
View File
@ -73,7 +73,7 @@ bool CPU_ProbeAltivec()
// Specifically call the Altivec loads and stores
const uint8x16_p v1 = (uint8x16_p)vec_ld(0, (byte*)b1);
const uint8x16_p v2 = (uint8x16_p)vec_ld(0, (byte*)b2);
const uint8x16_p v3 = (uint8x16_p)vec_xor(v1, v2);
const uint8x16_p v3 = (uint8x16_p)VecXor(v1, v2);
vec_st(v3, 0, b3);
result = (0 == std::memcmp(b2, b3, 16));
View File
@ -29,7 +29,7 @@
# undef bool
#endif
// VectorLoad_ALTIVEC and VectorStore_ALTIVEC are
// VecLoad_ALTIVEC and VecStore_ALTIVEC are
// too noisy on modern compilers
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic push
@ -49,14 +49,14 @@ typedef __vector unsigned int uint32x4_p;
typedef __vector unsigned long long uint64x2_p;
#endif // _ARCH_PWR8
/// \brief Reverse a vector
/// \brief Reverse bytes in a vector
/// \tparam T vector type
/// \param src the vector
/// \returns vector
/// \details Reverse() endian swaps the bytes in a vector
/// \details VecReverse() reverses the bytes in a vector
/// \since Crypto++ 6.0
template <class T>
inline T Reverse(const T src)
inline T VecReverse(const T src)
{
const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
return (T)vec_perm(src, src, mask);
@ -67,16 +67,16 @@ inline T Reverse(const T src)
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \details VectorLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
/// of <tt>src</tt> is aligned, and uses <tt>vec_lvsl</tt> and <tt>vec_perm</tt>
/// otherwise.
/// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so you should
/// provide aligned memory addresses.
/// \details VectorLoad_ALTIVEC() is used automatically when POWER7 or above
/// \details VecLoad_ALTIVEC() is used automatically when POWER7 or above
/// and unaligned loads are not available.
/// \note VectorLoad does not require an aligned array.
/// \note VecLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad_ALTIVEC(const byte src[16])
inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
{
if (IsAlignedOn(src, 16))
{
@ -96,14 +96,14 @@ inline uint32x4_p VectorLoad_ALTIVEC(const byte src[16])
/// \param src the byte array
/// \param off offset into the src byte array
/// \details Loads a vector in native endian format from a byte array.
/// \details VectorLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
/// of <tt>src</tt> is aligned, and uses <tt>vec_lvsl</tt> and <tt>vec_perm</tt>
/// otherwise.
/// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so you should
/// provide aligned memory addresses.
/// \note VectorLoad does not require an aligned array.
/// \note VecLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad_ALTIVEC(int off, const byte src[16])
inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
{
if (IsAlignedOn(src, 16))
{
@ -122,14 +122,14 @@ inline uint32x4_p VectorLoad_ALTIVEC(int off, const byte src[16])
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \details VectorLoad uses POWER7's <tt>vec_xl</tt> or
/// \details VecLoad uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorLoad_ALTIVEC() is used if POWER7 or above
/// is not available. VectorLoad_ALTIVEC() is relatively expensive.
/// \note VectorLoad does not require an aligned array.
/// \details VecLoad_ALTIVEC() is used if POWER7 or above
/// is not available. VecLoad_ALTIVEC() is relatively expensive.
/// \note VecLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(const byte src[16])
inline uint32x4_p VecLoad(const byte src[16])
{
#if defined(_ARCH_PWR7)
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
@ -138,7 +138,7 @@ inline uint32x4_p VectorLoad(const byte src[16])
return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
# endif
#else
return VectorLoad_ALTIVEC(src);
return VecLoad_ALTIVEC(src);
#endif
}
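// A short usage sketch (hypothetical name): on POWER7 and later the load compiles to
// vec_xl/vec_vsx_ld and works from any address; without POWER7 it falls back to the
// slower VecLoad_ALTIVEC() path shown earlier.
inline uint32x4_p LoadMessageSketch(const byte* msg)   // msg need not be 16-byte aligned
{
    return VecLoad(msg);
}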
@ -146,14 +146,14 @@ inline uint32x4_p VectorLoad(const byte src[16])
/// \param src the byte array
/// \param off offset into the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \details VectorLoad uses POWER7's <tt>vec_xl</tt> or
/// \details VecLoad uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorLoad_ALTIVEC() is used if POWER7 or above
/// is not available. VectorLoad_ALTIVEC() is relatively expensive.
/// \note VectorLoad does not require an aligned array.
/// \details VecLoad_ALTIVEC() is used if POWER7 or above
/// is not available. VecLoad_ALTIVEC() is relatively expensive.
/// \note VecLoad does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoad(int off, const byte src[16])
inline uint32x4_p VecLoad(int off, const byte src[16])
{
#if defined(_ARCH_PWR7)
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
@ -162,48 +162,48 @@ inline uint32x4_p VectorLoad(int off, const byte src[16])
return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
# endif
#else
return VectorLoad_ALTIVEC(off, src);
return VecLoad_ALTIVEC(off, src);
#endif
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \details VectorLoad uses POWER7's <tt>vec_xl</tt> or
/// \details VecLoad uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorLoad_ALTIVEC() is used if POWER7 or above
/// is not available. VectorLoad_ALTIVEC() is relatively expensive.
/// \note VectorLoad does not require an aligned array.
/// \details VecLoad_ALTIVEC() is used if POWER7 or above
/// is not available. VecLoad_ALTIVEC() is relatively expensive.
/// \note VecLoad does not require an aligned array.
/// \since Crypto++ 8.0
inline uint32x4_p VectorLoad(const word32 src[4])
inline uint32x4_p VecLoad(const word32 src[4])
{
return VectorLoad((const byte*)src);
return VecLoad((const byte*)src);
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \note VectorLoad does not require an aligned array.
/// \note VecLoad does not require an aligned array.
/// \since Crypto++ 8.0
inline uint32x4_p VectorLoad(int off, const word32 src[4])
inline uint32x4_p VecLoad(int off, const word32 src[4])
{
return VectorLoad(off, (const byte*)src);
return VecLoad(off, (const byte*)src);
}
/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in big endian format from a byte array.
/// VectorLoadBE will swap all bytes on little endian systems.
/// \details VectorLoadBE uses POWER7's <tt>vec_xl</tt> or
/// VecLoadBE will swap all bytes on little endian systems.
/// \details VecLoadBE uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorLoad_ALTIVEC() is used if POWER7 or above
/// is not available. VectorLoad_ALTIVEC() is relatively expensive.
/// \note VectorLoadBE() does not require an aligned array.
/// \details VecLoad_ALTIVEC() is used if POWER7 or above
/// is not available. VecLoad_ALTIVEC() is relatively expensive.
/// \note VecLoadBE() does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoadBE(const byte src[16])
inline uint32x4_p VecLoadBE(const byte src[16])
{
#if defined(_ARCH_PWR7)
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
@ -212,14 +212,14 @@ inline uint32x4_p VectorLoadBE(const byte src[16])
# if (CRYPTOPP_BIG_ENDIAN)
return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
# else
return (uint32x4_p)Reverse(vec_vsx_ld(0, (byte*)src));
return (uint32x4_p)VecReverse(vec_vsx_ld(0, (byte*)src));
# endif
# endif
#else // _ARCH_PWR7
# if (CRYPTOPP_BIG_ENDIAN)
return (uint32x4_p)VectorLoad((const byte*)src);
return (uint32x4_p)VecLoad((const byte*)src);
# else
return (uint32x4_p)Reverse(VectorLoad((const byte*)src));
return (uint32x4_p)VecReverse(VecLoad((const byte*)src));
# endif
#endif // _ARCH_PWR7
}
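// A small sketch (hypothetical name): VecLoadBE() yields the same logical value on
// big- and little-endian hosts, which is why the block cipher code loads counters
// and input blocks with it rather than with the native-endian VecLoad().
inline uint32x4_p LoadCounterSketch(const byte ctr[16])
{
    return VecLoadBE(ctr);   // bytes are reversed on little-endian targets
}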
@ -228,15 +228,15 @@ inline uint32x4_p VectorLoadBE(const byte src[16])
/// \param src the byte array
/// \param off offset into the src byte array
/// \details Loads a vector in big endian format from a byte array.
/// VectorLoadBE will swap all bytes on little endian systems.
/// \details VectorLoadBE uses POWER7's <tt>vec_xl</tt> or
/// VecLoadBE will swap all bytes on little endian systems.
/// \details VecLoadBE uses POWER7's <tt>vec_xl</tt> or
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorLoad_ALTIVEC() is used if POWER7 or above
/// is not available. VectorLoad_ALTIVEC() is relatively expensive.
/// \note VectorLoadBE does not require an aligned array.
/// \details VecLoad_ALTIVEC() is used if POWER7 or above
/// is not available. VecLoad_ALTIVEC() is relatively expensive.
/// \note VecLoadBE does not require an aligned array.
/// \since Crypto++ 6.0
inline uint32x4_p VectorLoadBE(int off, const byte src[16])
inline uint32x4_p VecLoadBE(int off, const byte src[16])
{
#if defined(_ARCH_PWR7)
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
@ -245,14 +245,14 @@ inline uint32x4_p VectorLoadBE(int off, const byte src[16])
# if (CRYPTOPP_BIG_ENDIAN)
return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
# else
return (uint32x4_p)Reverse(vec_vsx_ld(off, (byte*)src));
return (uint32x4_p)VecReverse(vec_vsx_ld(off, (byte*)src));
# endif
# endif
#else // _ARCH_PWR7
# if (CRYPTOPP_BIG_ENDIAN)
return (uint32x4_p)VectorLoad(off, (const byte*)src);
return (uint32x4_p)VecLoad(off, (const byte*)src);
# else
return (uint32x4_p)Reverse(VectorLoad(off, (const byte*)src));
return (uint32x4_p)VecReverse(VecLoad(off, (const byte*)src));
# endif
#endif // _ARCH_PWR7
}
@ -264,16 +264,16 @@ inline uint32x4_p VectorLoadBE(int off, const byte src[16])
/// \param data the vector
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array.
/// \details VectorStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
/// memory addresses.
/// \details VectorStore_ALTIVEC() is used automatically when POWER7 or above
/// \details VecStore_ALTIVEC() is used automatically when POWER7 or above
/// and unaligned loads are not available.
/// \note VectorStore does not require an aligned array.
/// \note VecStore does not require an aligned array.
/// \since Crypto++ 8.0
template<class T>
inline void VectorStore_ALTIVEC(const T data, byte dest[16])
inline void VecStore_ALTIVEC(const T data, byte dest[16])
{
if (IsAlignedOn(dest, 16))
{
@ -300,16 +300,16 @@ inline void VectorStore_ALTIVEC(const T data, byte dest[16])
/// \param off the byte offset into the array
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array.
/// \details VectorStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
/// memory addresses.
/// \details VectorStore_ALTIVEC() is used automatically when POWER7 or above
/// \details VecStore_ALTIVEC() is used automatically when POWER7 or above
/// and unaligned loads are not available.
/// \note VectorStore does not require an aligned array.
/// \note VecStore does not require an aligned array.
/// \since Crypto++ 8.0
template<class T>
inline void VectorStore_ALTIVEC(const T data, int off, byte dest[16])
inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
{
if (IsAlignedOn(dest, 16))
{
@ -335,15 +335,15 @@ inline void VectorStore_ALTIVEC(const T data, int off, byte dest[16])
/// \param data the vector
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array.
/// \details VectorStore uses POWER7's <tt>vec_xst</tt> or
/// \details VecStore uses POWER7's <tt>vec_xst</tt> or
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
/// \note VectorStore does not require an aligned array.
/// \details VecStore_ALTIVEC() is used if POWER7 or above
/// is not available. VecStore_ALTIVEC() is relatively expensive.
/// \note VecStore does not require an aligned array.
/// \since Crypto++ 6.0
template<class T>
inline void VectorStore(const T data, byte dest[16])
inline void VecStore(const T data, byte dest[16])
{
#if defined(_ARCH_PWR7)
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
@ -352,7 +352,7 @@ inline void VectorStore(const T data, byte dest[16])
vec_vsx_st((uint8x16_p)data, 0, (byte*)dest);
# endif
#else
return VectorStore_ALTIVEC(data, 0, dest);
return VecStore_ALTIVEC(data, 0, dest);
#endif
}
@ -362,15 +362,15 @@ inline void VectorStore(const T data, byte dest[16])
/// \param off the byte offset into the array
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array.
/// \details VectorStore uses POWER7's <tt>vec_xst</tt> or
/// \details VecStore uses POWER7's <tt>vec_xst</tt> or
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
/// \note VectorStore does not require an aligned array.
/// \details VecStore_ALTIVEC() is used if POWER7 or above
/// is not available. VecStore_ALTIVEC() is relatively expensive.
/// \note VecStore does not require an aligned array.
/// \since Crypto++ 6.0
template<class T>
inline void VectorStore(const T data, int off, byte dest[16])
inline void VecStore(const T data, int off, byte dest[16])
{
#if defined(_ARCH_PWR7)
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
@ -379,7 +379,7 @@ inline void VectorStore(const T data, int off, byte dest[16])
vec_vsx_st((uint8x16_p)data, off, (byte*)dest);
# endif
#else
return VectorStore_ALTIVEC(data, off, dest);
return VecStore_ALTIVEC(data, off, dest);
#endif
}
@ -388,17 +388,17 @@ inline void VectorStore(const T data, int off, byte dest[16])
/// \param data the vector
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array.
/// \details VectorStore uses POWER7's <tt>vec_xst</tt> or
/// \details VecStore uses POWER7's <tt>vec_xst</tt> or
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
/// \note VectorStore does not require an aligned array.
/// \details VecStore_ALTIVEC() is used if POWER7 or above
/// is not available. VecStore_ALTIVEC() is relatively expensive.
/// \note VecStore does not require an aligned array.
/// \since Crypto++ 8.0
template<class T>
inline void VectorStore(const T data, word32 dest[4])
inline void VecStore(const T data, word32 dest[4])
{
VectorStore((uint8x16_p)data, 0, (byte*)dest);
VecStore((uint8x16_p)data, 0, (byte*)dest);
}
/// \brief Stores a vector to a word array
@ -407,17 +407,17 @@ inline void VectorStore(const T data, word32 dest[4])
/// \param off the byte offset into the array
/// \param dest the byte array
/// \details Stores a vector in native endian format to a byte array.
/// \details VectorStore uses POWER7's <tt>vec_xst</tt> or
/// \details VecStore uses POWER7's <tt>vec_xst</tt> or
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
/// \note VectorStore does not require an aligned array.
/// \details VecStore_ALTIVEC() is used if POWER7 or above
/// is not available. VecStore_ALTIVEC() is relatively expensive.
/// \note VecStore does not require an aligned array.
/// \since Crypto++ 8.0
template<class T>
inline void VectorStore(const T data, int off, word32 dest[4])
inline void VecStore(const T data, int off, word32 dest[4])
{
VectorStore((uint8x16_p)data, off, (byte*)dest);
VecStore((uint8x16_p)data, off, (byte*)dest);
}
/// \brief Stores a vector to a byte array
@ -425,16 +425,16 @@ inline void VectorStore(const T data, int off, word32 dest[4])
/// \param src the vector
/// \param dest the byte array
/// \details Stores a vector in big endian format to a byte array.
/// VectorStoreBE will swap all bytes on little endian systems.
/// \details VectorStoreBE uses POWER7's <tt>vec_xst</tt> or
/// VecStoreBE will swap all bytes on little endian systems.
/// \details VecStoreBE uses POWER7's <tt>vec_xst</tt> or
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
/// \note VectorStoreBE does not require an aligned array.
/// \details VecStore_ALTIVEC() is used if POWER7 or above
/// is not available. VecStore_ALTIVEC() is relatively expensive.
/// \note VecStoreBE does not require an aligned array.
/// \since Crypto++ 6.0
template <class T>
inline void VectorStoreBE(const T src, byte dest[16])
inline void VecStoreBE(const T src, byte dest[16])
{
#if defined(_ARCH_PWR7)
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
@ -443,14 +443,14 @@ inline void VectorStoreBE(const T src, byte dest[16])
# if (CRYPTOPP_BIG_ENDIAN)
vec_vsx_st((uint8x16_p)src, 0, (byte*)dest);
# else
vec_vsx_st((uint8x16_p)Reverse(src), 0, (byte*)dest);
vec_vsx_st((uint8x16_p)VecReverse(src), 0, (byte*)dest);
# endif
# endif
#else // _ARCH_PWR7
# if (CRYPTOPP_BIG_ENDIAN)
VectorStore((uint8x16_p)src, (byte*)dest);
VecStore((uint8x16_p)src, (byte*)dest);
# else
VectorStore((uint8x16_p)Reverse(src), (byte*)dest);
VecStore((uint8x16_p)VecReverse(src), (byte*)dest);
# endif
#endif // _ARCH_PWR7
}
@ -461,16 +461,16 @@ inline void VectorStoreBE(const T src, byte dest[16])
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details Stores a vector in big endian format to a byte array.
/// VectorStoreBE will swap all bytes on little endian systems.
/// \details VectorStoreBE uses POWER7's <tt>vec_xst</tt> or
/// VecStoreBE will swap all bytes on little endian systems.
/// \details VecStoreBE uses POWER7's <tt>vec_xst</tt> or
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
/// an aligned memory address.
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
/// \note VectorStoreBE does not require an aligned array.
/// \details VecStore_ALTIVEC() is used if POWER7 or above
/// is not available. VecStore_ALTIVEC() is relatively expensive.
/// \note VecStoreBE does not require an aligned array.
/// \since Crypto++ 6.0
template <class T>
inline void VectorStoreBE(const T src, int off, byte dest[16])
inline void VecStoreBE(const T src, int off, byte dest[16])
{
#if defined(_ARCH_PWR7)
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
@ -479,14 +479,14 @@ inline void VectorStoreBE(const T src, int off, byte dest[16])
# if (CRYPTOPP_BIG_ENDIAN)
vec_vsx_st((uint8x16_p)src, off, (byte*)dest);
# else
vec_vsx_st((uint8x16_p)Reverse(src), off, (byte*)dest);
vec_vsx_st((uint8x16_p)VecReverse(src), off, (byte*)dest);
# endif
# endif
#else // _ARCH_PWR7
# if (CRYPTOPP_BIG_ENDIAN)
VectorStore((uint8x16_p)src, off, (byte*)dest);
VecStore((uint8x16_p)src, off, (byte*)dest);
# else
VectorStore((uint8x16_p)Reverse(src), off, (byte*)dest);
VecStore((uint8x16_p)VecReverse(src), off, (byte*)dest);
# endif
#endif // _ARCH_PWR7
}
@ -498,12 +498,12 @@ inline void VectorStoreBE(const T src, int off, byte dest[16])
/// \param vec the vector
/// \param mask vector mask
/// \returns vector
/// \details VectorPermute returns a new vector from vec based on
/// \details VecPermute returns a new vector from vec based on
/// mask. mask is an uint8x16_p type vector. The return
/// vector is the same type as vec.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorPermute(const T1 vec, const T2 mask)
inline T1 VecPermute(const T1 vec, const T2 mask)
{
return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
}
@ -515,12 +515,12 @@ inline T1 VectorPermute(const T1 vec, const T2 mask)
/// \param vec2 the second vector
/// \param mask vector mask
/// \returns vector
/// \details VectorPermute returns a new vector from vec1 and vec2
/// \details VecPermute returns a new vector from vec1 and vec2
/// based on mask. mask is an uint8x16_p type vector. The return
/// vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorPermute(const T1 vec1, const T1 vec2, const T2 mask)
inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
{
return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask);
}
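// A permute sketch (hypothetical name and mask): each mask byte selects one byte
// from the 32-byte concatenation of the two sources, indices 0-15 from vec1 and
// 16-31 from vec2. The mask below interleaves the first eight bytes of each input.
inline uint8x16_p InterleaveSketch(const uint8x16_p v1, const uint8x16_p v2)
{
    const uint8x16_p m = {0,16, 1,17, 2,18, 3,19, 4,20, 5,21, 6,22, 7,23};
    return VecPermute(v1, v2, m);
}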
@ -531,11 +531,11 @@ inline T1 VectorPermute(const T1 vec1, const T1 vec2, const T2 mask)
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VectorAnd returns a new vector from vec1 and vec2. The return
/// \details VecAnd returns a new vector from vec1 and vec2. The return
/// vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorAnd(const T1 vec1, const T2 vec2)
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
return (T1)vec_and(vec1, (T1)vec2);
}
@ -546,11 +546,11 @@ inline T1 VectorAnd(const T1 vec1, const T2 vec2)
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VectorOr returns a new vector from vec1 and vec2. The return
/// \details VecOr returns a new vector from vec1 and vec2. The return
/// vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorOr(const T1 vec1, const T2 vec2)
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
return (T1)vec_or(vec1, (T1)vec2);
}
@ -561,11 +561,11 @@ inline T1 VectorOr(const T1 vec1, const T2 vec2)
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VectorXor returns a new vector from vec1 and vec2. The return
/// \details VecXor returns a new vector from vec1 and vec2. The return
/// vector is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorXor(const T1 vec1, const T2 vec2)
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
return (T1)vec_xor(vec1, (T1)vec2);
}
@ -576,12 +576,12 @@ inline T1 VectorXor(const T1 vec1, const T2 vec2)
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VectorAdd returns a new vector from vec1 and vec2.
/// \details VecAdd returns a new vector from vec1 and vec2.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorAdd(const T1 vec1, const T2 vec2)
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
return (T1)vec_add(vec1, (T1)vec2);
}
@ -591,12 +591,12 @@ inline T1 VectorAdd(const T1 vec1, const T2 vec2)
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \details VectorSub returns a new vector from vec1 and vec2.
/// \details VecSub returns a new vector from vec1 and vec2.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorSub(const T1 vec1, const T2 vec2)
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
return (T1)vec_sub(vec1, (T1)vec2);
}
@ -607,10 +607,10 @@ inline T1 VectorSub(const T1 vec1, const T2 vec2)
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VectorAdd64 returns a new vector from vec1 and vec2.
/// \details VecAdd64 returns a new vector from vec1 and vec2.
/// vec1 and vec2 are added as uint64x2_p quantities.
/// \since Crypto++ 8.0
inline uint32x4_p VectorAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
{
#if defined(_ARCH_PWR8)
return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
@ -632,22 +632,22 @@ inline uint32x4_p VectorAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
/// \tparam T vector type
/// \param vec the vector
/// \returns vector
/// \details VectorShiftLeftOctet() returns a new vector after shifting the
/// \details VecShiftLeftOctet() returns a new vector after shifting the
/// concatenation of the zero vector and the source vector by the specified
/// number of bytes. The return vector is the same type as vec.
/// \details On big endian machines VectorShiftLeftOctet() is <tt>vec_sld(a, z,
/// c)</tt>. On little endian machines VectorShiftLeftOctet() is translated to
/// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
/// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
/// if on a big endian machine as shown below.
/// <pre>
/// uint8x16_p x = VectorLoad(ptr);
/// uint8x16_p y = VectorShiftLeftOctet<12>(x);
/// uint8x16_p x = VecLoad(ptr);
/// uint8x16_p y = VecShiftLeftOctet<12>(x);
/// </pre>
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VectorShiftLeftOctet(const T vec)
inline T VecShiftLeftOctet(const T vec)
{
const T zero = {0};
if (C >= 16)
@ -675,22 +675,22 @@ inline T VectorShiftLeftOctet(const T vec)
/// \tparam T vector type
/// \param vec the vector
/// \returns vector
/// \details VectorShiftRightOctet() returns a new vector after shifting the
/// \details VecShiftRightOctet() returns a new vector after shifting the
/// concatenation of the zero vector and the source vector by the specified
/// number of bytes. The return vector is the same type as vec.
/// \details On big endian machines VectorShiftRightOctet() is <tt>vec_sld(a, z,
/// c)</tt>. On little endian machines VectorShiftRightOctet() is translated to
/// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(a, z,
/// c)</tt>. On little endian machines VecShiftRightOctet() is translated to
/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
/// if on a big endian machine as shown below.
/// <pre>
/// uint8x16_p x = VectorLoad(ptr);
/// uint8x16_p y = VectorShiftRightOctet<12>(y);
/// uint8x16_p x = VecLoad(ptr);
/// uint8x16_p y = VecShiftRightOctet<12>(x);
/// </pre>
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VectorShiftRightOctet(const T vec)
inline T VecShiftRightOctet(const T vec)
{
const T zero = {0};
if (C >= 16)
@ -718,14 +718,14 @@ inline T VectorShiftRightOctet(const T vec)
/// \tparam T vector type
/// \param vec the vector
/// \returns vector
/// \details VectorRotateLeftOctet() returns a new vector after rotating the
/// \details VecRotateLeftOctet() returns a new vector after rotating the
/// concatenation of the source vector with itself by the specified
/// number of bytes. The return vector is the same type as vec.
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VectorRotateLeftOctet(const T vec)
inline T VecRotateLeftOctet(const T vec)
{
enum { R = C&0xf };
#if (CRYPTOPP_BIG_ENDIAN)
@ -740,14 +740,14 @@ inline T VectorRotateLeftOctet(const T vec)
/// \tparam T vector type
/// \param vec the vector
/// \returns vector
/// \details VectorRotateRightOctet() returns a new vector after rotating the
/// \details VecRotateRightOctet() returns a new vector after rotating the
/// concatenation of the source vector with itself by the specified
/// number of bytes. The return vector is the same type as vec.
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VectorRotateRightOctet(const T vec)
inline T VecRotateRightOctet(const T vec)
{
enum { R = C&0xf };
#if (CRYPTOPP_BIG_ENDIAN)
@ -761,9 +761,9 @@ inline T VectorRotateRightOctet(const T vec)
/// \tparam C shift bit count
/// \param vec the vector
/// \returns vector
/// \details VectorRotateLeft rotates each element in a packed vector by bit count.
/// \details VecRotateLeft rotates each element in a packed vector by bit count.
template<unsigned int C>
inline uint32x4_p VectorRotateLeft(const uint32x4_p vec)
inline uint32x4_p VecRotateLeft(const uint32x4_p vec)
{
const uint32x4_p m = {C, C, C, C};
return vec_rl(vec, m);
@ -773,9 +773,9 @@ inline uint32x4_p VectorRotateLeft(const uint32x4_p vec)
/// \tparam C shift bit count
/// \param vec the vector
/// \returns vector
/// \details VectorRotateRight rotates each element in a packed vector by bit count.
/// \details VecRotateRight rotates each element in a packed vector by bit count.
template<unsigned int C>
inline uint32x4_p VectorRotateRight(const uint32x4_p vec)
inline uint32x4_p VecRotateRight(const uint32x4_p vec)
{
const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
return vec_rl(vec, m);
@ -787,7 +787,7 @@ inline uint32x4_p VectorRotateRight(const uint32x4_p vec)
/// \returns vector
/// \since Crypto++ 7.0
template <class T>
inline T VectorSwapWords(const T vec)
inline T VecSwapWords(const T vec)
{
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
}
@ -796,34 +796,34 @@ inline T VectorSwapWords(const T vec)
/// \tparam T vector type
/// \param val the vector
/// \returns vector created from low dword
/// \details VectorGetLow() extracts the low dword from a vector. The low dword
/// \details VecGetLow() extracts the low dword from a vector. The low dword
/// is composed of the least significant bits and occupies bytes 8 through 15
/// when viewed as a big endian array. The return vector is the same type as
/// the original vector and padded with 0's in the most significant bit positions.
template <class T>
inline T VectorGetLow(const T val)
inline T VecGetLow(const T val)
{
//const T zero = {0};
//const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
//return (T)vec_perm(zero, val, mask);
return VectorShiftRightOctet<8>(VectorShiftLeftOctet<8>(val));
return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
}
/// \brief Extract a dword from a vector
/// \tparam T vector type
/// \param val the vector
/// \returns vector created from high dword
/// \details VectorGetHigh() extracts the high dword from a vector. The high dword
/// \details VecGetHigh() extracts the high dword from a vector. The high dword
/// is composed of the most significant bits and occupies bytes 0 through 7
/// when viewed as a big endian array. The return vector is the same type as
/// the original vector and padded with 0's in the most significant bit positions.
template <class T>
inline T VectorGetHigh(const T val)
inline T VecGetHigh(const T val)
{
//const T zero = {0};
//const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
//return (T)vec_perm(zero, val, mask);
return VectorShiftRightOctet<8>(val);
return VecShiftRightOctet<8>(val);
}
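// Viewed as big-endian dwords {hi, lo}: VecGetHigh({hi, lo}) == {0, hi} and
// VecGetLow({hi, lo}) == {0, lo}. That is how the VMULL code isolates a single
// 64-bit operand for vpmsumd while keeping the unused lane zero.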
/// \brief Compare two vectors
@ -833,7 +833,7 @@ inline T VectorGetHigh(const T val)
/// \param vec2 the second vector
/// \returns true if vec1 equals vec2, false otherwise
template <class T1, class T2>
inline bool VectorEqual(const T1 vec1, const T2 vec2)
inline bool VecEqual(const T1 vec1, const T2 vec2)
{
return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
}
@ -845,7 +845,7 @@ inline bool VectorEqual(const T1 vec1, const T2 vec2)
/// \param vec2 the second vector
/// \returns true if vec1 does not equal vec2, false otherwise
template <class T1, class T2>
inline bool VectorNotEqual(const T1 vec1, const T2 vec2)
inline bool VecNotEqual(const T1 vec1, const T2 vec2)
{
return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
}
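// Note: vec_all_eq() compares all four 32-bit lanes, so VecEqual() is true only when
// every lane matches and VecNotEqual() is true as soon as any lane differs.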
@ -859,11 +859,11 @@ inline bool VectorNotEqual(const T1 vec1, const T2 vec2)
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VectorEncrypt performs one round of AES encryption of state
/// \details VecEncrypt performs one round of AES encryption of state
/// using subkey key. The return vector is the same type as state.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorEncrypt(const T1 state, const T2 key)
inline T1 VecEncrypt(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
@ -879,11 +879,11 @@ inline T1 VectorEncrypt(const T1 state, const T2 key)
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VectorEncryptLast performs the final round of AES encryption
/// \details VecEncryptLast performs the final round of AES encryption
/// of state using subkey key. The return vector is the same type as state.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorEncryptLast(const T1 state, const T2 key)
inline T1 VecEncryptLast(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
@ -899,11 +899,11 @@ inline T1 VectorEncryptLast(const T1 state, const T2 key)
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VectorDecrypt performs one round of AES decryption of state
/// \details VecDecrypt performs one round of AES decryption of state
/// using subkey key. The return vector is the same type as state.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorDecrypt(const T1 state, const T2 key)
inline T1 VecDecrypt(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
@ -919,11 +919,11 @@ inline T1 VectorDecrypt(const T1 state, const T2 key)
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VectorDecryptLast performs the final round of AES decryption
/// \details VecDecryptLast performs the final round of AES decryption
/// of state using subkey key. The return vector is the same type as state.
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VectorDecryptLast(const T1 state, const T2 key)
inline T1 VecDecryptLast(const T1 state, const T2 key)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
@ -939,11 +939,11 @@ inline T1 VectorDecryptLast(const T1 state, const T2 key)
/// \tparam subfunc sub-function
/// \tparam T vector type
/// \param vec the block to transform
/// \details VectorSHA256 selects sigma0, sigma1, Sigma0, Sigma1 based on
/// \details VecSHA256 selects sigma0, sigma1, Sigma0, Sigma1 based on
/// func and subfunc. The return vector is the same type as vec.
/// \since Crypto++ 6.0
template <int func, int subfunc, class T>
inline T VectorSHA256(const T vec)
inline T VecSHA256(const T vec)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T)__vshasigmaw((uint32x4_p)vec, func, subfunc);
@ -959,11 +959,11 @@ inline T VectorSHA256(const T vec)
/// \tparam subfunc sub-function
/// \tparam T vector type
/// \param vec the block to transform
/// \details VectorSHA512 selects sigma0, sigma1, Sigma0, Sigma1 based on
/// \details VecSHA512 selects sigma0, sigma1, Sigma0, Sigma1 based on
/// func and subfunc. The return vector is the same type as vec.
/// \since Crypto++ 6.0
template <int func, int subfunc, class T>
inline T VectorSHA512(const T vec)
inline T VecSHA512(const T vec)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (T)__vshasigmad((uint64x2_p)vec, func, subfunc);
View File
@ -214,12 +214,12 @@ bool CPU_ProbePower8()
#if defined(__xlc__) || defined(__xlC__)
const uint64x2_p v1 = (uint64x2_p)vec_xl(0, (byte*)w1);
const uint64x2_p v2 = (uint64x2_p)vec_xl(0, (byte*)w2);
const uint64x2_p v3 = vec_add(v1, v2); // 64-bit add
const uint64x2_p v3 = VecAdd(v1, v2); // 64-bit add
vec_xst((uint8x16_p)v3, 0, (byte*)w3);
#else
const uint64x2_p v1 = (uint64x2_p)vec_vsx_ld(0, (byte*)w1);
const uint64x2_p v2 = (uint64x2_p)vec_vsx_ld(0, (byte*)w2);
const uint64x2_p v3 = vec_add(v1, v2); // 64-bit add
const uint64x2_p v3 = VecAdd(v1, v2); // 64-bit add
vec_vsx_st((uint8x16_p)v3, 0, (byte*)w3);
#endif
@ -265,13 +265,13 @@ bool CPU_ProbeAES()
0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08};
byte r[16] = {255}, z[16] = {};
uint8x16_p k = (uint8x16_p)VectorLoad(0, key);
uint8x16_p s = (uint8x16_p)VectorLoad(0, state);
s = VectorEncrypt(s, k);
s = VectorEncryptLast(s, k);
s = VectorDecrypt(s, k);
s = VectorDecryptLast(s, k);
VectorStore(s, r);
uint8x16_p k = (uint8x16_p)VecLoad(0, key);
uint8x16_p s = (uint8x16_p)VecLoad(0, state);
s = VecEncrypt(s, k);
s = VecEncryptLast(s, k);
s = VecDecrypt(s, k);
s = VecDecryptLast(s, k);
VecStore(s, r);
result = (0 != std::memcmp(r, z, 16));
}
@ -697,17 +697,17 @@ static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, un
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
uint32x4_p k = VectorLoad(keys);
block = VectorXor(block, k);
uint32x4_p k = VecLoad(keys);
block = VecXor(block, k);
for (size_t i=1; i<rounds-1; i+=2)
{
block = VectorEncrypt(block, VectorLoad( i*16, keys));
block = VectorEncrypt(block, VectorLoad((i+1)*16, keys));
block = VecEncrypt(block, VecLoad( i*16, keys));
block = VecEncrypt(block, VecLoad((i+1)*16, keys));
}
block = VectorEncrypt(block, VectorLoad((rounds-1)*16, keys));
block = VectorEncryptLast(block, VectorLoad(rounds*16, keys));
block = VecEncrypt(block, VecLoad((rounds-1)*16, keys));
block = VecEncryptLast(block, VecLoad(rounds*16, keys));
}
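// For reference: the key schedule is consumed 16 bytes per round. Offset 0 holds the
// whitening key XORed in first, offsets i*16 feed the middle VecEncrypt() rounds
// (two per loop iteration above), and the last two subkeys at (rounds-1)*16 and
// rounds*16 feed the final VecEncrypt()/VecEncryptLast() pair.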
static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@ -717,32 +717,32 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
uint32x4_p k = VectorLoad(keys);
block0 = VectorXor(block0, k);
block1 = VectorXor(block1, k);
block2 = VectorXor(block2, k);
block3 = VectorXor(block3, k);
block4 = VectorXor(block4, k);
block5 = VectorXor(block5, k);
uint32x4_p k = VecLoad(keys);
block0 = VecXor(block0, k);
block1 = VecXor(block1, k);
block2 = VecXor(block2, k);
block3 = VecXor(block3, k);
block4 = VecXor(block4, k);
block5 = VecXor(block5, k);
for (size_t i=1; i<rounds; ++i)
{
k = VectorLoad(i*16, keys);
block0 = VectorEncrypt(block0, k);
block1 = VectorEncrypt(block1, k);
block2 = VectorEncrypt(block2, k);
block3 = VectorEncrypt(block3, k);
block4 = VectorEncrypt(block4, k);
block5 = VectorEncrypt(block5, k);
k = VecLoad(i*16, keys);
block0 = VecEncrypt(block0, k);
block1 = VecEncrypt(block1, k);
block2 = VecEncrypt(block2, k);
block3 = VecEncrypt(block3, k);
block4 = VecEncrypt(block4, k);
block5 = VecEncrypt(block5, k);
}
k = VectorLoad(rounds*16, keys);
block0 = VectorEncryptLast(block0, k);
block1 = VectorEncryptLast(block1, k);
block2 = VectorEncryptLast(block2, k);
block3 = VectorEncryptLast(block3, k);
block4 = VectorEncryptLast(block4, k);
block5 = VectorEncryptLast(block5, k);
k = VecLoad(rounds*16, keys);
block0 = VecEncryptLast(block0, k);
block1 = VecEncryptLast(block1, k);
block2 = VecEncryptLast(block2, k);
block3 = VecEncryptLast(block3, k);
block4 = VecEncryptLast(block4, k);
block5 = VecEncryptLast(block5, k);
}
static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
@ -750,17 +750,17 @@ static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, un
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
uint32x4_p k = VectorLoad(rounds*16, keys);
block = VectorXor(block, k);
uint32x4_p k = VecLoad(rounds*16, keys);
block = VecXor(block, k);
for (size_t i=rounds-1; i>1; i-=2)
{
block = VectorDecrypt(block, VectorLoad( i*16, keys));
block = VectorDecrypt(block, VectorLoad((i-1)*16, keys));
block = VecDecrypt(block, VecLoad( i*16, keys));
block = VecDecrypt(block, VecLoad((i-1)*16, keys));
}
block = VectorDecrypt(block, VectorLoad(16, keys));
block = VectorDecryptLast(block, VectorLoad(0, keys));
block = VecDecrypt(block, VecLoad(16, keys));
block = VecDecryptLast(block, VecLoad(0, keys));
}
static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@ -770,32 +770,32 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
uint32x4_p k = VectorLoad(rounds*16, keys);
block0 = VectorXor(block0, k);
block1 = VectorXor(block1, k);
block2 = VectorXor(block2, k);
block3 = VectorXor(block3, k);
block4 = VectorXor(block4, k);
block5 = VectorXor(block5, k);
uint32x4_p k = VecLoad(rounds*16, keys);
block0 = VecXor(block0, k);
block1 = VecXor(block1, k);
block2 = VecXor(block2, k);
block3 = VecXor(block3, k);
block4 = VecXor(block4, k);
block5 = VecXor(block5, k);
for (size_t i=rounds-1; i>0; --i)
{
k = VectorLoad(i*16, keys);
block0 = VectorDecrypt(block0, k);
block1 = VectorDecrypt(block1, k);
block2 = VectorDecrypt(block2, k);
block3 = VectorDecrypt(block3, k);
block4 = VectorDecrypt(block4, k);
block5 = VectorDecrypt(block5, k);
k = VecLoad(i*16, keys);
block0 = VecDecrypt(block0, k);
block1 = VecDecrypt(block1, k);
block2 = VecDecrypt(block2, k);
block3 = VecDecrypt(block3, k);
block4 = VecDecrypt(block4, k);
block5 = VecDecrypt(block5, k);
}
k = VectorLoad(0, keys);
block0 = VectorDecryptLast(block0, k);
block1 = VectorDecryptLast(block1, k);
block2 = VectorDecryptLast(block2, k);
block3 = VectorDecryptLast(block3, k);
block4 = VectorDecryptLast(block4, k);
block5 = VectorDecryptLast(block5, k);
k = VecLoad(0, keys);
block0 = VecDecryptLast(block0, k);
block1 = VecDecryptLast(block1, k);
block2 = VecDecryptLast(block2, k);
block3 = VecDecryptLast(block3, k);
block4 = VecDecryptLast(block4, k);
block5 = VecDecryptLast(block5, k);
}
ANONYMOUS_NAMESPACE_END
@ -851,14 +851,14 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32*
{
const uint8x16_p d1 = vec_vsx_ld( 0, (uint8_t*)rkey);
const uint8x16_p d2 = vec_vsx_ld(16, (uint8_t*)rkey);
vec_vsx_st(vec_perm(d1, zero, mask), 0, (uint8_t*)rkey);
vec_vsx_st(vec_perm(d2, zero, mask), 16, (uint8_t*)rkey);
vec_vsx_st(VecPermute(d1, zero, mask), 0, (uint8_t*)rkey);
vec_vsx_st(VecPermute(d2, zero, mask), 16, (uint8_t*)rkey);
}
for ( ; i<rounds+1; i++, rkey+=4)
{
const uint8x16_p d = vec_vsx_ld( 0, (uint8_t*)rkey);
vec_vsx_st(vec_perm(d, zero, mask), 0, (uint8_t*)rkey);
vec_vsx_st(VecPermute(d, zero, mask), 0, (uint8_t*)rkey);
}
#endif
}
View File
@ -224,11 +224,11 @@ bool CPU_ProbeSHA256()
byte r[16], z[16] = {0};
uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
x = VectorSHA256<0,0>(x);
x = VectorSHA256<0,1>(x);
x = VectorSHA256<1,0>(x);
x = VectorSHA256<1,1>(x);
VectorStore(x, r);
x = VecSHA256<0,0>(x);
x = VecSHA256<0,1>(x);
x = VecSHA256<1,0>(x);
x = VecSHA256<1,1>(x);
VecStore(x, r);
result = (0 == std::memcmp(r, z, 16));
}
@ -268,11 +268,11 @@ bool CPU_ProbeSHA512()
byte r[16], z[16] = {0};
uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
x = VectorSHA512<0,0>(x);
x = VectorSHA512<0,1>(x);
x = VectorSHA512<1,0>(x);
x = VectorSHA512<1,1>(x);
VectorStore(x, r);
x = VecSHA512<0,0>(x);
x = VecSHA512<0,1>(x);
x = VecSHA512<1,0>(x);
x = VecSHA512<1,1>(x);
VecStore(x, r);
result = (0 == std::memcmp(r, z, 16));
}
@ -1091,7 +1091,7 @@ typedef __vector unsigned long long uint64x2_p8;
// Unaligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
uint32x4_p8 VecLoad32x4u(const T* data, int offset)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint32x4_p8)vec_xl(offset, (uint8_t*)data);
@ -1102,7 +1102,7 @@ uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
// Unaligned store
template <class T> static inline
void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
void VecStore32x4u(const uint32x4_p8 val, T* data, int offset)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
@ -1114,14 +1114,14 @@ void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
// Unaligned load of a user message. The load is big-endian,
// and then the message is permuted for 32-bit words.
template <class T> static inline
uint32x4_p8 VectorLoadMsg32x4(const T* data, int offset)
uint32x4_p8 VecLoadMsg32x4(const T* data, int offset)
{
#if (CRYPTOPP_LITTLE_ENDIAN)
const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
const uint32x4_p8 r = VectorLoad32x4u(data, offset);
return (uint32x4_p8)vec_perm(r, r, mask);
const uint32x4_p8 r = VecLoad32x4u(data, offset);
return (uint32x4_p8)VecPermute(r, r, mask);
#else
return VectorLoad32x4u(data, offset);
return VecLoad32x4u(data, offset);
#endif
}
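// On little-endian targets the {3,2,1,0, ...} mask byte-swaps each 32-bit word after
// the unaligned load, so the message words arrive in the big-endian order SHA-256
// expects; on big-endian targets the raw load already has that order.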
@ -1136,7 +1136,7 @@ static inline
uint32x4_p8 VectorMaj(const uint32x4_p8 x, const uint32x4_p8 y, const uint32x4_p8 z)
{
// The trick below is due to Andy Polyakov and Jack Lloyd
return vec_sel(y, z, vec_xor(x, y));
return vec_sel(y, z, VecXor(x, y));
}
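// Why the selection works, bit by bit: where x and y agree, the majority of (x, y, z)
// is that shared bit and vec_sel picks it from y because the x ^ y selector is 0
// there; where x and y differ, the majority is decided by z and vec_sel picks z
// because the selector is 1. One XOR and one select thus replace the textbook
// (x&y) ^ (x&z) ^ (y&z) form.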
static inline
@ -1185,7 +1185,7 @@ uint32x4_p8 VectorPack(const uint32x4_p8 a, const uint32x4_p8 b,
{
const uint8x16_p8 m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
const uint8x16_p8 m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
return vec_perm(vec_perm(a,b,m1), vec_perm(c,d,m1), m2);
return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
}
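// In effect VectorPack(a, b, c, d) gathers the leading 32-bit word of each argument
// into {a0, b0, c0, d0}: m1 pulls the first word from each pair and m2 splices the
// two partial results together. The code below uses it to fold the working
// variables S[A..H] back into the abcd/efgh state vectors.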
template <unsigned int R> static inline
@ -1231,8 +1231,8 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
uint32x4_p8 abcd = VectorLoad32x4u(state+0, 0);
uint32x4_p8 efgh = VectorLoad32x4u(state+4, 0);
uint32x4_p8 abcd = VecLoad32x4u(state+0, 0);
uint32x4_p8 efgh = VecLoad32x4u(state+4, 0);
uint32x4_p8 W[16], S[8], vm, vk;
size_t blocks = length / SHA256::BLOCKSIZE;
@ -1241,80 +1241,80 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
unsigned int offset=0;
S[A] = abcd; S[E] = efgh;
S[B] = VectorShiftLeftOctet<4>(S[A]);
S[F] = VectorShiftLeftOctet<4>(S[E]);
S[C] = VectorShiftLeftOctet<4>(S[B]);
S[G] = VectorShiftLeftOctet<4>(S[F]);
S[D] = VectorShiftLeftOctet<4>(S[C]);
S[H] = VectorShiftLeftOctet<4>(S[G]);
S[B] = VecShiftLeftOctet<4>(S[A]);
S[F] = VecShiftLeftOctet<4>(S[E]);
S[C] = VecShiftLeftOctet<4>(S[B]);
S[G] = VecShiftLeftOctet<4>(S[F]);
S[D] = VecShiftLeftOctet<4>(S[C]);
S[H] = VecShiftLeftOctet<4>(S[G]);
// Rounds 0-16
vk = VectorLoad32x4u(k, offset);
vm = VectorLoadMsg32x4(m, offset);
vk = VecLoad32x4u(k, offset);
vm = VecLoadMsg32x4(m, offset);
SHA256_ROUND1<0>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<1>(W,S, vk,vm);
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<2>(W,S, vk,vm);
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<3>(W,S, vk,vm);
vk = VectorLoad32x4u(k, offset);
vm = VectorLoadMsg32x4(m, offset);
vk = VecLoad32x4u(k, offset);
vm = VecLoadMsg32x4(m, offset);
SHA256_ROUND1<4>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<5>(W,S, vk,vm);
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<6>(W,S, vk,vm);
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<7>(W,S, vk,vm);
vk = VectorLoad32x4u(k, offset);
vm = VectorLoadMsg32x4(m, offset);
vk = VecLoad32x4u(k, offset);
vm = VecLoadMsg32x4(m, offset);
SHA256_ROUND1<8>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<9>(W,S, vk,vm);
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<10>(W,S, vk,vm);
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<11>(W,S, vk,vm);
vk = VectorLoad32x4u(k, offset);
vm = VectorLoadMsg32x4(m, offset);
vk = VecLoad32x4u(k, offset);
vm = VecLoadMsg32x4(m, offset);
SHA256_ROUND1<12>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<13>(W,S, vk,vm);
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<14>(W,S, vk,vm);
vk = VectorShiftLeftOctet<4>(vk);
vm = VectorShiftLeftOctet<4>(vm);
vk = VecShiftLeftOctet<4>(vk);
vm = VecShiftLeftOctet<4>(vm);
SHA256_ROUND1<15>(W,S, vk,vm);
m += 16; // 32-bit words, not bytes
@ -1322,32 +1322,32 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
// Rounds 16-64
for (unsigned int i=16; i<64; i+=16)
{
vk = VectorLoad32x4u(k, offset);
vk = VecLoad32x4u(k, offset);
SHA256_ROUND2<0>(W,S, vk);
SHA256_ROUND2<1>(W,S, VectorShiftLeftOctet<4>(vk));
SHA256_ROUND2<2>(W,S, VectorShiftLeftOctet<8>(vk));
SHA256_ROUND2<3>(W,S, VectorShiftLeftOctet<12>(vk));
SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
offset+=16;
vk = VectorLoad32x4u(k, offset);
vk = VecLoad32x4u(k, offset);
SHA256_ROUND2<4>(W,S, vk);
SHA256_ROUND2<5>(W,S, VectorShiftLeftOctet<4>(vk));
SHA256_ROUND2<6>(W,S, VectorShiftLeftOctet<8>(vk));
SHA256_ROUND2<7>(W,S, VectorShiftLeftOctet<12>(vk));
SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
offset+=16;
vk = VectorLoad32x4u(k, offset);
vk = VecLoad32x4u(k, offset);
SHA256_ROUND2<8>(W,S, vk);
SHA256_ROUND2<9>(W,S, VectorShiftLeftOctet<4>(vk));
SHA256_ROUND2<10>(W,S, VectorShiftLeftOctet<8>(vk));
SHA256_ROUND2<11>(W,S, VectorShiftLeftOctet<12>(vk));
SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
offset+=16;
vk = VectorLoad32x4u(k, offset);
vk = VecLoad32x4u(k, offset);
SHA256_ROUND2<12>(W,S, vk);
SHA256_ROUND2<13>(W,S, VectorShiftLeftOctet<4>(vk));
SHA256_ROUND2<14>(W,S, VectorShiftLeftOctet<8>(vk));
SHA256_ROUND2<15>(W,S, VectorShiftLeftOctet<12>(vk));
SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
offset+=16;
}
@ -1355,19 +1355,19 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
efgh += VectorPack(S[E],S[F],S[G],S[H]);
}
VectorStore32x4u(abcd, state+0, 0);
VectorStore32x4u(efgh, state+4, 0);
VecStore32x4u(abcd, state+0, 0);
VecStore32x4u(efgh, state+4, 0);
}
static inline
uint64x2_p8 VectorPermute64x2(const uint64x2_p8 val, const uint8x16_p8 mask)
uint64x2_p8 VecPermute64x2(const uint64x2_p8 val, const uint8x16_p8 mask)
{
return (uint64x2_p8)vec_perm(val, val, mask);
return (uint64x2_p8)VecPermute(val, val, mask);
}
// Unaligned load
template <class T> static inline
uint64x2_p8 VectorLoad64x2u(const T* data, int offset)
uint64x2_p8 VecLoad64x2u(const T* data, int offset)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint64x2_p8)vec_xl(offset, (uint8_t*)data);
@ -1378,7 +1378,7 @@ uint64x2_p8 VectorLoad64x2u(const T* data, int offset)
// Unaligned store
template <class T> static inline
void VectorStore64x2u(const uint64x2_p8 val, T* data, int offset)
void VecStore64x2u(const uint64x2_p8 val, T* data, int offset)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
@ -1390,13 +1390,13 @@ void VectorStore64x2u(const uint64x2_p8 val, T* data, int offset)
// Unaligned load of a user message. The load is big-endian,
// and then the message is permuted for 64-bit words.
template <class T> static inline
uint64x2_p8 VectorLoadMsg64x2(const T* data, int offset)
uint64x2_p8 VecLoadMsg64x2(const T* data, int offset)
{
#if (CRYPTOPP_LITTLE_ENDIAN)
const uint8x16_p8 mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
return VectorPermute64x2(VectorLoad64x2u(data, offset), mask);
return VecPermute64x2(VecLoad64x2u(data, offset), mask);
#else
return VectorLoad64x2u(data, offset);
return VecLoad64x2u(data, offset);
#endif
}
@ -1411,7 +1411,7 @@ static inline
uint64x2_p8 VectorMaj(const uint64x2_p8 x, const uint64x2_p8 y, const uint64x2_p8 z)
{
// The trick below is due to Andy Polyakov and Jack Lloyd
return vec_sel(y, z, vec_xor(x, y));
return vec_sel(y, z, VecXor(x, y));
}
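// Illustrative sketch (hypothetical helper, not part of the library): the
// selection identity used above can be checked with plain integers. Where the
// x and y bits agree, the majority is that shared bit (taken from y); where
// they differ, z decides.
static inline bool CheckMajIdentity(word64 x, word64 y, word64 z)
{
    const word64 maj = (x & y) ^ (x & z) ^ (y & z);
    const word64 sel = (y & ~(x ^ y)) | (z & (x ^ y));  // vec_sel(y, z, x^y)
    return maj == sel;
}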
static inline
@ -1458,7 +1458,7 @@ static inline
uint64x2_p8 VectorPack(const uint64x2_p8 x, const uint64x2_p8 y)
{
const uint8x16_p8 m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
return vec_perm(x,y,m);
return VecPermute(x,y,m);
}
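// Note: permute control indices 0-15 select bytes from the first operand and
// 16-31 from the second, so the mask above concatenates the leading 64-bit
// word of x with the leading 64-bit word of y.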
template <unsigned int R> static inline
@ -1504,10 +1504,10 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
uint64x2_p8 ab = VectorLoad64x2u(state+0, 0);
uint64x2_p8 cd = VectorLoad64x2u(state+2, 0);
uint64x2_p8 ef = VectorLoad64x2u(state+4, 0);
uint64x2_p8 gh = VectorLoad64x2u(state+6, 0);
uint64x2_p8 ab = VecLoad64x2u(state+0, 0);
uint64x2_p8 cd = VecLoad64x2u(state+2, 0);
uint64x2_p8 ef = VecLoad64x2u(state+4, 0);
uint64x2_p8 gh = VecLoad64x2u(state+6, 0);
uint64x2_p8 W[16], S[8], vm, vk;
size_t blocks = length / SHA512::BLOCKSIZE;
@ -1517,82 +1517,82 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
S[A] = ab; S[C] = cd;
S[E] = ef; S[G] = gh;
S[B] = VectorShiftLeftOctet<8>(S[A]);
S[D] = VectorShiftLeftOctet<8>(S[C]);
S[F] = VectorShiftLeftOctet<8>(S[E]);
S[H] = VectorShiftLeftOctet<8>(S[G]);
S[B] = VecShiftLeftOctet<8>(S[A]);
S[D] = VecShiftLeftOctet<8>(S[C]);
S[F] = VecShiftLeftOctet<8>(S[E]);
S[H] = VecShiftLeftOctet<8>(S[G]);
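// Note: same packing trick as the SHA-256 path, but with 64-bit words: ab, cd,
// ef and gh each hold two state words, and the 8-byte shift exposes the second
// word of each pair as S[B], S[D], S[F] and S[H].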
// Rounds 0-16
vk = VectorLoad64x2u(k, offset);
vm = VectorLoadMsg64x2(m, offset);
vk = VecLoad64x2u(k, offset);
vm = VecLoadMsg64x2(m, offset);
SHA512_ROUND1<0>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<8>(vk);
vm = VectorShiftLeftOctet<8>(vm);
vk = VecShiftLeftOctet<8>(vk);
vm = VecShiftLeftOctet<8>(vm);
SHA512_ROUND1<1>(W,S, vk,vm);
vk = VectorLoad64x2u(k, offset);
vm = VectorLoadMsg64x2(m, offset);
vk = VecLoad64x2u(k, offset);
vm = VecLoadMsg64x2(m, offset);
SHA512_ROUND1<2>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<8>(vk);
vm = VectorShiftLeftOctet<8>(vm);
vk = VecShiftLeftOctet<8>(vk);
vm = VecShiftLeftOctet<8>(vm);
SHA512_ROUND1<3>(W,S, vk,vm);
vk = VectorLoad64x2u(k, offset);
vm = VectorLoadMsg64x2(m, offset);
vk = VecLoad64x2u(k, offset);
vm = VecLoadMsg64x2(m, offset);
SHA512_ROUND1<4>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<8>(vk);
vm = VectorShiftLeftOctet<8>(vm);
vk = VecShiftLeftOctet<8>(vk);
vm = VecShiftLeftOctet<8>(vm);
SHA512_ROUND1<5>(W,S, vk,vm);
vk = VectorLoad64x2u(k, offset);
vm = VectorLoadMsg64x2(m, offset);
vk = VecLoad64x2u(k, offset);
vm = VecLoadMsg64x2(m, offset);
SHA512_ROUND1<6>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<8>(vk);
vm = VectorShiftLeftOctet<8>(vm);
vk = VecShiftLeftOctet<8>(vk);
vm = VecShiftLeftOctet<8>(vm);
SHA512_ROUND1<7>(W,S, vk,vm);
vk = VectorLoad64x2u(k, offset);
vm = VectorLoadMsg64x2(m, offset);
vk = VecLoad64x2u(k, offset);
vm = VecLoadMsg64x2(m, offset);
SHA512_ROUND1<8>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<8>(vk);
vm = VectorShiftLeftOctet<8>(vm);
vk = VecShiftLeftOctet<8>(vk);
vm = VecShiftLeftOctet<8>(vm);
SHA512_ROUND1<9>(W,S, vk,vm);
vk = VectorLoad64x2u(k, offset);
vm = VectorLoadMsg64x2(m, offset);
vk = VecLoad64x2u(k, offset);
vm = VecLoadMsg64x2(m, offset);
SHA512_ROUND1<10>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<8>(vk);
vm = VectorShiftLeftOctet<8>(vm);
vk = VecShiftLeftOctet<8>(vk);
vm = VecShiftLeftOctet<8>(vm);
SHA512_ROUND1<11>(W,S, vk,vm);
vk = VectorLoad64x2u(k, offset);
vm = VectorLoadMsg64x2(m, offset);
vk = VecLoad64x2u(k, offset);
vm = VecLoadMsg64x2(m, offset);
SHA512_ROUND1<12>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<8>(vk);
vm = VectorShiftLeftOctet<8>(vm);
vk = VecShiftLeftOctet<8>(vk);
vm = VecShiftLeftOctet<8>(vm);
SHA512_ROUND1<13>(W,S, vk,vm);
vk = VectorLoad64x2u(k, offset);
vm = VectorLoadMsg64x2(m, offset);
vk = VecLoad64x2u(k, offset);
vm = VecLoadMsg64x2(m, offset);
SHA512_ROUND1<14>(W,S, vk,vm);
offset+=16;
vk = VectorShiftLeftOctet<8>(vk);
vm = VectorShiftLeftOctet<8>(vm);
vk = VecShiftLeftOctet<8>(vk);
vm = VecShiftLeftOctet<8>(vm);
SHA512_ROUND1<15>(W,S, vk,vm);
m += 16; // 64-bit words, not bytes
@ -1600,44 +1600,44 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
// Rounds 16-80
for (unsigned int i=16; i<80; i+=16)
{
vk = VectorLoad64x2u(k, offset);
vk = VecLoad64x2u(k, offset);
SHA512_ROUND2<0>(W,S, vk);
SHA512_ROUND2<1>(W,S, VectorShiftLeftOctet<8>(vk));
SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
offset+=16;
vk = VectorLoad64x2u(k, offset);
vk = VecLoad64x2u(k, offset);
SHA512_ROUND2<2>(W,S, vk);
SHA512_ROUND2<3>(W,S, VectorShiftLeftOctet<8>(vk));
SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
offset+=16;
vk = VectorLoad64x2u(k, offset);
vk = VecLoad64x2u(k, offset);
SHA512_ROUND2<4>(W,S, vk);
SHA512_ROUND2<5>(W,S, VectorShiftLeftOctet<8>(vk));
SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
offset+=16;
vk = VectorLoad64x2u(k, offset);
vk = VecLoad64x2u(k, offset);
SHA512_ROUND2<6>(W,S, vk);
SHA512_ROUND2<7>(W,S, VectorShiftLeftOctet<8>(vk));
SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
offset+=16;
vk = VectorLoad64x2u(k, offset);
vk = VecLoad64x2u(k, offset);
SHA512_ROUND2<8>(W,S, vk);
SHA512_ROUND2<9>(W,S, VectorShiftLeftOctet<8>(vk));
SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
offset+=16;
vk = VectorLoad64x2u(k, offset);
vk = VecLoad64x2u(k, offset);
SHA512_ROUND2<10>(W,S, vk);
SHA512_ROUND2<11>(W,S, VectorShiftLeftOctet<8>(vk));
SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
offset+=16;
vk = VectorLoad64x2u(k, offset);
vk = VecLoad64x2u(k, offset);
SHA512_ROUND2<12>(W,S, vk);
SHA512_ROUND2<13>(W,S, VectorShiftLeftOctet<8>(vk));
SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
offset+=16;
vk = VectorLoad64x2u(k, offset);
vk = VecLoad64x2u(k, offset);
SHA512_ROUND2<14>(W,S, vk);
SHA512_ROUND2<15>(W,S, VectorShiftLeftOctet<8>(vk));
SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
offset+=16;
}
@ -1647,10 +1647,10 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
gh += VectorPack(S[G],S[H]);
}
VectorStore64x2u(ab, state+0, 0);
VectorStore64x2u(cd, state+2, 0);
VectorStore64x2u(ef, state+4, 0);
VectorStore64x2u(gh, state+6, 0);
VecStore64x2u(ab, state+0, 0);
VecStore64x2u(cd, state+2, 0);
VecStore64x2u(ef, state+4, 0);
VecStore64x2u(gh, state+6, 0);
}
#endif // CRYPTOPP_POWER8_SHA_AVAILABLE



@ -548,8 +548,9 @@ using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;
using CryptoPP::VectorAnd;
using CryptoPP::VectorXor;
using CryptoPP::VecAnd;
using CryptoPP::VecXor;
using CryptoPP::VecPermute;
// Rotate left by bit count
template<unsigned int C>
@ -569,8 +570,8 @@ CRYPTOPP_INLINE uint64x2_p RotateRight64(const uint64x2_p val)
CRYPTOPP_INLINE uint64x2_p SIMON128_f(const uint64x2_p val)
{
return VectorXor(RotateLeft64<2>(val),
VectorAnd(RotateLeft64<1>(val), RotateLeft64<8>(val)));
return VecXor(RotateLeft64<2>(val),
VecAnd(RotateLeft64<1>(val), RotateLeft64<8>(val)));
}
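// Illustrative sketch (hypothetical helpers, not part of the library): the
// vector code above evaluates the SIMON round function
// f(x) = (x<<<2) ^ ((x<<<1) & (x<<<8)) in each 64-bit lane. A scalar rendering,
// matching one pair of half-rounds in the encryption loop below
// (y ^= f(x) ^ k1; x ^= f(y) ^ k2):
static inline word64 Rotl64(word64 v, unsigned int c)
{
    return (v << c) | (v >> (64 - c));  // c must be 1..63 here
}
static inline word64 SIMON128_f_scalar(word64 x)
{
    return Rotl64(x, 2) ^ (Rotl64(x, 1) & Rotl64(x, 8));
}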
CRYPTOPP_INLINE void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
@ -584,22 +585,22 @@ CRYPTOPP_INLINE void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1);
uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2);
uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk1);
y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk2);
x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
}
if (rounds & 1)
{
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk);
y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
std::swap(x1, y1);
}
@ -612,7 +613,7 @@ CRYPTOPP_INLINE void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block = (uint32x4_p)vec_perm(x1, y1, m3);
block = (uint32x4_p)VecPermute(x1, y1, m3);
}
CRYPTOPP_INLINE void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
@ -626,24 +627,24 @@ CRYPTOPP_INLINE void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1);
uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2);
uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);
if (rounds & 1)
{
std::swap(x1, y1);
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
y1 = VectorXor(VectorXor(y1, rk), SIMON128_f(x1));
y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1));
rounds--;
}
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
{
const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk1);
x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1);
const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk2);
y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2);
}
#if (CRYPTOPP_BIG_ENDIAN)
@ -655,7 +656,7 @@ CRYPTOPP_INLINE void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block = (uint32x4_p)vec_perm(x1, y1, m3);
block = (uint32x4_p)VecPermute(x1, y1, m3);
}
CRYPTOPP_INLINE void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@ -671,32 +672,32 @@ CRYPTOPP_INLINE void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1);
uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2);
uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1);
uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2);
uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1);
uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2);
uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk1);
y2 = VectorXor(VectorXor(y2, SIMON128_f(x2)), rk1);
y3 = VectorXor(VectorXor(y3, SIMON128_f(x3)), rk1);
y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk1);
y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk1);
const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk2);
x2 = VectorXor(VectorXor(x2, SIMON128_f(y2)), rk2);
x3 = VectorXor(VectorXor(x3, SIMON128_f(y3)), rk2);
x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk2);
x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk2);
}
if (rounds & 1)
{
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk);
y2 = VectorXor(VectorXor(y2, SIMON128_f(x2)), rk);
y3 = VectorXor(VectorXor(y3, SIMON128_f(x3)), rk);
y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk);
y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk);
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
}
@ -709,12 +710,12 @@ CRYPTOPP_INLINE void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
CRYPTOPP_INLINE void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@ -730,34 +731,34 @@ CRYPTOPP_INLINE void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1);
uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2);
uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1);
uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2);
uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1);
uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2);
uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);
if (rounds & 1)
{
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
y1 = VectorXor(VectorXor(y1, rk), SIMON128_f(x1));
y2 = VectorXor(VectorXor(y2, rk), SIMON128_f(x2));
y3 = VectorXor(VectorXor(y3, rk), SIMON128_f(x3));
y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1));
y2 = VecXor(VecXor(y2, rk), SIMON128_f(x2));
y3 = VecXor(VecXor(y3, rk), SIMON128_f(x3));
rounds--;
}
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
{
const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk1);
x2 = VectorXor(VectorXor(x2, SIMON128_f(y2)), rk1);
x3 = VectorXor(VectorXor(x3, SIMON128_f(y3)), rk1);
x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1);
x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk1);
x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk1);
const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk2);
y2 = VectorXor(VectorXor(y2, SIMON128_f(x2)), rk2);
y3 = VectorXor(VectorXor(y3, SIMON128_f(x3)), rk2);
y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2);
y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk2);
y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk2);
}
#if (CRYPTOPP_BIG_ENDIAN)
@ -769,12 +770,12 @@ CRYPTOPP_INLINE void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
#endif // CRYPTOPP_POWER8_AVAILABLE


@ -538,10 +538,11 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::VectorAnd;
using CryptoPP::VectorXor;
using CryptoPP::VectorLoad;
using CryptoPP::VectorLoadBE;
using CryptoPP::VecAnd;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecLoadBE;
using CryptoPP::VecPermute;
// Rotate left by bit count
template<unsigned int C>
@ -561,8 +562,8 @@ CRYPTOPP_INLINE uint32x4_p RotateRight32(const uint32x4_p val)
CRYPTOPP_INLINE uint32x4_p SIMON64_f(const uint32x4_p val)
{
return VectorXor(RotateLeft32<2>(val),
VectorAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
return VecXor(RotateLeft32<2>(val),
VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
@ -577,8 +578,8 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = vec_perm(block0, block1, m1);
uint32x4_p y1 = vec_perm(block0, block1, m2);
uint32x4_p x1 = VecPermute(block0, block1, m1);
uint32x4_p y1 = VecPermute(block0, block1, m2);
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
@ -587,13 +588,13 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk1 = VectorLoad(subkeys+i);
uint32x4_p rk2 = VectorLoad(subkeys+i+1);
rk1 = vec_perm(rk1, rk1, m);
rk2 = vec_perm(rk2, rk2, m);
uint32x4_p rk1 = VecLoad(subkeys+i);
uint32x4_p rk2 = VecLoad(subkeys+i+1);
rk1 = VecPermute(rk1, rk1, m);
rk2 = VecPermute(rk2, rk2, m);
#endif
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1);
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
}
if (rounds & 1)
@ -602,10 +603,10 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(subkeys+rounds-1);
rk = vec_perm(rk, rk, m);
uint32x4_p rk = VecLoad(subkeys+rounds-1);
rk = VecPermute(rk, rk, m);
#endif
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
std::swap(x1, y1);
}
@ -618,8 +619,8 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}
CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
@ -634,8 +635,8 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = vec_perm(block0, block1, m1);
uint32x4_p y1 = vec_perm(block0, block1, m2);
uint32x4_p x1 = VecPermute(block0, block1, m1);
uint32x4_p y1 = VecPermute(block0, block1, m2);
if (rounds & 1)
{
@ -644,10 +645,10 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(subkeys+rounds-1);
rk = vec_perm(rk, rk, m);
uint32x4_p rk = VecLoad(subkeys+rounds-1);
rk = VecPermute(rk, rk, m);
#endif
y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1));
y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
rounds--;
}
@ -658,13 +659,13 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk1 = VectorLoad(subkeys+i+1);
uint32x4_p rk2 = VectorLoad(subkeys+i);
rk1 = vec_perm(rk1, rk1, m);
rk2 = vec_perm(rk2, rk2, m);
uint32x4_p rk1 = VecLoad(subkeys+i+1);
uint32x4_p rk2 = VecLoad(subkeys+i);
rk1 = VecPermute(rk1, rk1, m);
rk2 = VecPermute(rk2, rk2, m);
#endif
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1);
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2);
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
}
#if (CRYPTOPP_BIG_ENDIAN)
@ -676,8 +677,8 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}
CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@ -693,12 +694,12 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2);
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
@ -707,18 +708,18 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk1 = VectorLoad(subkeys+i);
uint32x4_p rk2 = VectorLoad(subkeys+i+1);
rk1 = vec_perm(rk1, rk1, m);
rk2 = vec_perm(rk2, rk2, m);
uint32x4_p rk1 = VecLoad(subkeys+i);
uint32x4_p rk2 = VecLoad(subkeys+i+1);
rk1 = VecPermute(rk1, rk1, m);
rk2 = VecPermute(rk2, rk2, m);
#endif
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1);
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk1);
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk1);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2);
x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk2);
x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk2);
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
}
if (rounds & 1)
@ -727,12 +728,12 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(subkeys+rounds-1);
rk = vec_perm(rk, rk, m);
uint32x4_p rk = VecLoad(subkeys+rounds-1);
rk = VecPermute(rk, rk, m);
#endif
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk);
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk);
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
}
@ -745,12 +746,12 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@ -766,12 +767,12 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2);
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
if (rounds & 1)
{
@ -781,12 +782,12 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(subkeys+rounds-1);
rk = vec_perm(rk, rk, m);
uint32x4_p rk = VecLoad(subkeys+rounds-1);
rk = VecPermute(rk, rk, m);
#endif
y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1));
y2 = VectorXor(VectorXor(y2, rk), SIMON64_f(x2));
y3 = VectorXor(VectorXor(y3, rk), SIMON64_f(x3));
y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
rounds--;
}
@ -797,18 +798,18 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk1 = VectorLoad(subkeys+i+1);
uint32x4_p rk2 = VectorLoad(subkeys+i);
rk1 = vec_perm(rk1, rk1, m);
rk2 = vec_perm(rk2, rk2, m);
uint32x4_p rk1 = VecLoad(subkeys+i+1);
uint32x4_p rk2 = VecLoad(subkeys+i);
rk1 = VecPermute(rk1, rk1, m);
rk2 = VecPermute(rk2, rk2, m);
#endif
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1);
x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk1);
x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk1);
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2);
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk2);
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk2);
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
}
#if (CRYPTOPP_BIG_ENDIAN)
@ -820,12 +821,12 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE


@ -479,9 +479,10 @@ using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;
using CryptoPP::VectorAdd;
using CryptoPP::VectorSub;
using CryptoPP::VectorXor;
using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecPermute;
// Rotate left by bit count
template<unsigned int C>
@ -510,19 +511,19 @@ void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1);
uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2);
uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);
for (int i=0; i < static_cast<int>(rounds); ++i)
{
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);
x1 = RotateRight64<8>(x1);
x1 = VectorAdd(x1, y1);
x1 = VectorXor(x1, rk);
x1 = VecAdd(x1, y1);
x1 = VecXor(x1, rk);
y1 = RotateLeft64<3>(y1);
y1 = VectorXor(y1, x1);
y1 = VecXor(y1, x1);
}
#if (CRYPTOPP_BIG_ENDIAN)
@ -534,7 +535,7 @@ void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block = (uint32x4_p)vec_perm(x1, y1, m3);
block = (uint32x4_p)VecPermute(x1, y1, m3);
}
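// Illustrative sketch (hypothetical helpers, not part of the library): one
// SPECK ARX round in scalar form, matching the vector loop above, plus its
// inverse as used by SPECK128_Dec_Block. Running the decrypt round after the
// encrypt round with the same key restores the original (x, y).
static inline word64 SpeckRotl64(word64 v, unsigned int c) { return (v << c) | (v >> (64 - c)); }
static inline word64 SpeckRotr64(word64 v, unsigned int c) { return (v >> c) | (v << (64 - c)); }
static inline void SPECK128_EncRound_scalar(word64 &x, word64 &y, word64 k)
{
    x = (SpeckRotr64(x, 8) + y) ^ k;  // rotate right 8, add, XOR key
    y = SpeckRotl64(y, 3) ^ x;        // rotate left 3, XOR in the new x
}
static inline void SPECK128_DecRound_scalar(word64 &x, word64 &y, word64 k)
{
    y = SpeckRotr64(y ^ x, 3);             // undo the XOR, rotate back
    x = SpeckRotl64((x ^ k) - y, 8);       // undo the key XOR and the addition
}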
void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
@ -548,17 +549,17 @@ void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1);
uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2);
uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
{
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);
y1 = VectorXor(y1, x1);
y1 = VecXor(y1, x1);
y1 = RotateRight64<3>(y1);
x1 = VectorXor(x1, rk);
x1 = VectorSub(x1, y1);
x1 = VecXor(x1, rk);
x1 = VecSub(x1, y1);
x1 = RotateLeft64<8>(x1);
}
@ -571,7 +572,7 @@ void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block = (uint32x4_p)vec_perm(x1, y1, m3);
block = (uint32x4_p)VecPermute(x1, y1, m3);
}
void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@ -587,12 +588,12 @@ void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1);
uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2);
uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1);
uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2);
uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1);
uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2);
uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);
for (int i=0; i < static_cast<int>(rounds); ++i)
{
@ -601,19 +602,19 @@ void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
x1 = RotateRight64<8>(x1);
x2 = RotateRight64<8>(x2);
x3 = RotateRight64<8>(x3);
x1 = VectorAdd(x1, y1);
x2 = VectorAdd(x2, y2);
x3 = VectorAdd(x3, y3);
x1 = VectorXor(x1, rk);
x2 = VectorXor(x2, rk);
x3 = VectorXor(x3, rk);
x1 = VecAdd(x1, y1);
x2 = VecAdd(x2, y2);
x3 = VecAdd(x3, y3);
x1 = VecXor(x1, rk);
x2 = VecXor(x2, rk);
x3 = VecXor(x3, rk);
y1 = RotateLeft64<3>(y1);
y2 = RotateLeft64<3>(y2);
y3 = RotateLeft64<3>(y3);
y1 = VectorXor(y1, x1);
y2 = VectorXor(y2, x2);
y3 = VectorXor(y3, x3);
y1 = VecXor(y1, x1);
y2 = VecXor(y2, x2);
y3 = VecXor(y3, x3);
}
#if (CRYPTOPP_BIG_ENDIAN)
@ -625,12 +626,12 @@ void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@ -646,30 +647,30 @@ void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1);
uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2);
uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1);
uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2);
uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1);
uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2);
uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
{
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);
y1 = VectorXor(y1, x1);
y2 = VectorXor(y2, x2);
y3 = VectorXor(y3, x3);
y1 = VecXor(y1, x1);
y2 = VecXor(y2, x2);
y3 = VecXor(y3, x3);
y1 = RotateRight64<3>(y1);
y2 = RotateRight64<3>(y2);
y3 = RotateRight64<3>(y3);
x1 = VectorXor(x1, rk);
x2 = VectorXor(x2, rk);
x3 = VectorXor(x3, rk);
x1 = VectorSub(x1, y1);
x2 = VectorSub(x2, y2);
x3 = VectorSub(x3, y3);
x1 = VecXor(x1, rk);
x2 = VecXor(x2, rk);
x3 = VecXor(x3, rk);
x1 = VecSub(x1, y1);
x2 = VecSub(x2, y2);
x3 = VecSub(x3, y3);
x1 = RotateLeft64<8>(x1);
x2 = RotateLeft64<8>(x2);
x3 = RotateLeft64<8>(x3);
@ -684,12 +685,12 @@ void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
#endif // CRYPTOPP_POWER8_AVAILABLE


@ -483,10 +483,11 @@ CRYPTOPP_INLINE void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::VectorAdd;
using CryptoPP::VectorSub;
using CryptoPP::VectorXor;
using CryptoPP::VectorLoad;
using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecPermute;
// Rotate left by bit count
template<unsigned int C>
@ -516,8 +517,8 @@ void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = vec_perm(block0, block1, m1);
uint32x4_p y1 = vec_perm(block0, block1, m2);
uint32x4_p x1 = VecPermute(block0, block1, m1);
uint32x4_p y1 = VecPermute(block0, block1, m2);
for (int i=0; i < static_cast<int>(rounds); ++i)
{
@ -526,16 +527,16 @@ void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
#else
// subkeys has extra elements so memory backs the last subkey
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(subkeys+i);
rk = vec_perm(rk, rk, m);
uint32x4_p rk = VecLoad(subkeys+i);
rk = VecPermute(rk, rk, m);
#endif
x1 = RotateRight32<8>(x1);
x1 = VectorAdd(x1, y1);
x1 = VectorXor(x1, rk);
x1 = VecAdd(x1, y1);
x1 = VecXor(x1, rk);
y1 = RotateLeft32<3>(y1);
y1 = VectorXor(y1, x1);
y1 = VecXor(y1, x1);
}
#if (CRYPTOPP_BIG_ENDIAN)
@ -547,8 +548,8 @@ void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}
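// Note: SPECK64 applies the same ARX round as the 128-bit version above
// (rotate right by 8, add, XOR the key, then rotate the other half left by 3
// and XOR it in), only on 32-bit lanes; the rotation amounts are unchanged.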
void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
@ -563,8 +564,8 @@ void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = vec_perm(block0, block1, m1);
uint32x4_p y1 = vec_perm(block0, block1, m2);
uint32x4_p x1 = VecPermute(block0, block1, m1);
uint32x4_p y1 = VecPermute(block0, block1, m2);
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
{
@ -573,15 +574,15 @@ void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
#else
// subkeys has extra elements so memory backs the last subkey
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(subkeys+i);
rk = vec_perm(rk, rk, m);
uint32x4_p rk = VecLoad(subkeys+i);
rk = VecPermute(rk, rk, m);
#endif
y1 = VectorXor(y1, x1);
y1 = VecXor(y1, x1);
y1 = RotateRight32<3>(y1);
x1 = VectorXor(x1, rk);
x1 = VectorSub(x1, y1);
x1 = VecXor(x1, rk);
x1 = VecSub(x1, y1);
x1 = RotateLeft32<8>(x1);
}
@ -594,8 +595,8 @@ void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}
void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@ -611,12 +612,12 @@ void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2);
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
for (int i=0; i < static_cast<int>(rounds); ++i)
{
@ -625,29 +626,29 @@ void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
#else
// subkeys has extra elements so memory backs the last subkey
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(subkeys+i);
rk = vec_perm(rk, rk, m);
uint32x4_p rk = VecLoad(subkeys+i);
rk = VecPermute(rk, rk, m);
#endif
x1 = RotateRight32<8>(x1);
x2 = RotateRight32<8>(x2);
x3 = RotateRight32<8>(x3);
x1 = VectorAdd(x1, y1);
x2 = VectorAdd(x2, y2);
x3 = VectorAdd(x3, y3);
x1 = VecAdd(x1, y1);
x2 = VecAdd(x2, y2);
x3 = VecAdd(x3, y3);
x1 = VectorXor(x1, rk);
x2 = VectorXor(x2, rk);
x3 = VectorXor(x3, rk);
x1 = VecXor(x1, rk);
x2 = VecXor(x2, rk);
x3 = VecXor(x3, rk);
y1 = RotateLeft32<3>(y1);
y2 = RotateLeft32<3>(y2);
y3 = RotateLeft32<3>(y3);
y1 = VectorXor(y1, x1);
y2 = VectorXor(y2, x2);
y3 = VectorXor(y3, x3);
y1 = VecXor(y1, x1);
y2 = VecXor(y2, x2);
y3 = VecXor(y3, x3);
}
#if (CRYPTOPP_BIG_ENDIAN)
@ -659,12 +660,12 @@ void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@ -680,12 +681,12 @@ void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2);
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
{
@ -694,25 +695,25 @@ void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
#else
// subkeys has extra elements so memory backs the last subkey
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(subkeys+i);
rk = vec_perm(rk, rk, m);
uint32x4_p rk = VecLoad(subkeys+i);
rk = VecPermute(rk, rk, m);
#endif
y1 = VectorXor(y1, x1);
y2 = VectorXor(y2, x2);
y3 = VectorXor(y3, x3);
y1 = VecXor(y1, x1);
y2 = VecXor(y2, x2);
y3 = VecXor(y3, x3);
y1 = RotateRight32<3>(y1);
y2 = RotateRight32<3>(y2);
y3 = RotateRight32<3>(y3);
x1 = VectorXor(x1, rk);
x2 = VectorXor(x2, rk);
x3 = VectorXor(x3, rk);
x1 = VecXor(x1, rk);
x2 = VecXor(x2, rk);
x3 = VecXor(x3, rk);
x1 = VectorSub(x1, y1);
x2 = VectorSub(x2, y2);
x3 = VectorSub(x3, y3);
x1 = VecSub(x1, y1);
x2 = VecSub(x2, y2);
x3 = VecSub(x3, y3);
x1 = RotateLeft32<8>(x1);
x2 = RotateLeft32<8>(x2);
@ -728,12 +729,12 @@ void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
#endif
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE


@ -1089,44 +1089,44 @@ bool TestAltivecOps()
const byte st2[16] ={21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6};
const byte st3[16] ={20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5};
VectorStore(VectorLoad(src), dest);
VecStore(VecLoad(src), dest);
pass1 = (0 == std::memcmp(src, dest, 16)) && pass1;
CRYPTOPP_ASSERT(pass1);
VectorStore(VectorLoad(src+1), dest+1);
VecStore(VecLoad(src+1), dest+1);
pass1 = (0 == std::memcmp(st1, dest+1, 16)) && pass1;
CRYPTOPP_ASSERT(pass1);
VectorStore(VectorLoad(src+2), dest+2);
VecStore(VecLoad(src+2), dest+2);
pass1 = (0 == std::memcmp(st2, dest+2, 16)) && pass1;
CRYPTOPP_ASSERT(pass1);
VectorStore(VectorLoad(src+3), dest+3);
VecStore(VecLoad(src+3), dest+3);
pass1 = (0 == std::memcmp(st3, dest+3, 16)) && pass1;
CRYPTOPP_ASSERT(pass1);
VectorStoreBE(VectorLoadBE(src), dest);
VecStoreBE(VecLoadBE(src), dest);
pass1 = (0 == std::memcmp(src, dest, 16)) && pass1;
CRYPTOPP_ASSERT(pass1);
VectorStoreBE(VectorLoadBE(src+1), dest+1);
VecStoreBE(VecLoadBE(src+1), dest+1);
pass1 = (0 == std::memcmp(st1, dest+1, 16)) && pass1;
CRYPTOPP_ASSERT(pass1);
VectorStoreBE(VectorLoadBE(src+2), dest+2);
VecStoreBE(VecLoadBE(src+2), dest+2);
pass1 = (0 == std::memcmp(st2, dest+2, 16)) && pass1;
CRYPTOPP_ASSERT(pass1);
VectorStoreBE(VectorLoadBE(src+3), dest+3);
VecStoreBE(VecLoadBE(src+3), dest+3);
pass1 = (0 == std::memcmp(st3, dest+3, 16)) && pass1;
CRYPTOPP_ASSERT(pass1);
#if (CRYPTOPP_LITTLE_ENDIAN)
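// Note: on a little-endian target a big-endian load paired with a native
// store (and vice versa) leaves the bytes reversed, so these two checks
// expect memcmp to report a mismatch.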
VectorStore(VectorLoadBE(src), dest);
VecStore(VecLoadBE(src), dest);
pass1 = (0 != std::memcmp(src, dest, 16)) && pass1;
CRYPTOPP_ASSERT(pass1);
VectorStoreBE(VectorLoad(src), dest);
VecStoreBE(VecLoad(src), dest);
pass1 = (0 != std::memcmp(src, dest, 16)) && pass1;
CRYPTOPP_ASSERT(pass1);
#endif
@ -1143,9 +1143,9 @@ bool TestAltivecOps()
uint8x16_p val = {0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff};
pass2 = (VectorEqual(val, VectorShiftLeftOctet<0>(val))) && pass2;
pass2 = (VecEqual(val, VecShiftLeftOctet<0>(val))) && pass2;
CRYPTOPP_ASSERT(pass2);
pass2 = (VectorEqual(val, VectorShiftRightOctet<0>(val))) && pass2;
pass2 = (VecEqual(val, VecShiftRightOctet<0>(val))) && pass2;
CRYPTOPP_ASSERT(pass2);
uint8x16_p lsh1 = {0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff,
@ -1153,9 +1153,9 @@ bool TestAltivecOps()
uint8x16_p rsh1 = {0x00,0xff,0xff,0xff, 0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff};
pass2 = (VectorEqual(lsh1, VectorShiftLeftOctet<1>(val))) && pass2;
pass2 = (VecEqual(lsh1, VecShiftLeftOctet<1>(val))) && pass2;
CRYPTOPP_ASSERT(pass2);
pass2 = (VectorEqual(rsh1, VectorShiftRightOctet<1>(val))) && pass2;
pass2 = (VecEqual(rsh1, VecShiftRightOctet<1>(val))) && pass2;
CRYPTOPP_ASSERT(pass2);
uint8x16_p lsh15 = {0xff,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
@ -1163,9 +1163,9 @@ bool TestAltivecOps()
uint8x16_p rsh15 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xff};
pass2 = (VectorEqual(lsh15, VectorShiftLeftOctet<15>(val))) && pass2;
pass2 = (VecEqual(lsh15, VecShiftLeftOctet<15>(val))) && pass2;
CRYPTOPP_ASSERT(pass2);
pass2 = (VectorEqual(rsh15, VectorShiftRightOctet<15>(val))) && pass2;
pass2 = (VecEqual(rsh15, VecShiftRightOctet<15>(val))) && pass2;
CRYPTOPP_ASSERT(pass2);
uint8x16_p lsh16 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
@ -1173,9 +1173,9 @@ bool TestAltivecOps()
uint8x16_p rsh16 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00};
pass2 = (VectorEqual(lsh16, VectorShiftLeftOctet<16>(val))) && pass2;
pass2 = (VecEqual(lsh16, VecShiftLeftOctet<16>(val))) && pass2;
CRYPTOPP_ASSERT(pass2);
pass2 = (VectorEqual(rsh16, VectorShiftRightOctet<16>(val))) && pass2;
pass2 = (VecEqual(rsh16, VecShiftRightOctet<16>(val))) && pass2;
CRYPTOPP_ASSERT(pass2);
if (!pass2)
@ -1194,16 +1194,16 @@ bool TestAltivecOps()
uint8x16_p ex3 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
0x1f,0x1e,0x1d,0x1c, 0x1b,0x1a,0x19,0x18};
pass3 = VectorEqual(ex2, VectorGetLow(ex1)) && pass3;
pass3 = VecEqual(ex2, VecGetLow(ex1)) && pass3;
CRYPTOPP_ASSERT(pass3);
pass3 = VectorEqual(ex3, VectorGetHigh(ex1)) && pass3;
pass3 = VecEqual(ex3, VecGetHigh(ex1)) && pass3;
CRYPTOPP_ASSERT(pass3);
uint8x16_p ex4 = VectorShiftRightOctet<8>(VectorShiftLeftOctet<8>(ex1));
pass3 = VectorEqual(ex4, VectorGetLow(ex1)) && pass3;
uint8x16_p ex4 = VecShiftRightOctet<8>(VecShiftLeftOctet<8>(ex1));
pass3 = VecEqual(ex4, VecGetLow(ex1)) && pass3;
CRYPTOPP_ASSERT(pass3);
uint8x16_p ex5 = VectorShiftRightOctet<8>(ex1);
pass3 = VectorEqual(ex5, VectorGetHigh(ex1)) && pass3;
uint8x16_p ex5 = VecShiftRightOctet<8>(ex1);
pass3 = VecEqual(ex5, VecGetHigh(ex1)) && pass3;
CRYPTOPP_ASSERT(pass3);
if (!pass3)