Mirror of https://github.com/shadps4-emu/ext-cryptopp.git

Commit f6e04e5f33 (parent 8e5cd3637e): Rename PPC vector functions from VectorFunc to VecFunc

Changed files: adv_simd.h (216 lines), blake2b_simd.cpp (124 lines), blake2s_simd.cpp (118 lines), chacha_simd.cpp (335 lines).
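The change is a mechanical rename of the Crypto++ PPC/Altivec wrappers: VectorLoad/VectorStore/VectorAdd/VectorXor and the vec_perm-style helpers become VecLoad/VecStore/VecAdd/VecXor/VecPermute, and so on for the BE/LE and sized variants. A minimal usage sketch of the new names follows; the header name and the word32 buffers are assumptions for illustration, not part of this commit.

#include "ppc_simd.h"              // Crypto++ PPC vector primitives (assumed header name)
using namespace CryptoPP;

void demo(const word32* in, word32* out)
{
    uint32x4_p a = VecLoad(in);        // was VectorLoad(in)
    uint32x4_p b = VecAdd(a, a);       // was VectorAdd(a, a)
    VecStore(VecXor(a, b), out);       // was VectorStore(VectorXor(a, b), out)
}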
adv_simd.h (216 lines changed)

@@ -1857,54 +1857,54 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
 // even harder without POWER8 due to lack of 64-bit elements.
 std::memcpy(temp+LowOffset, inBlocks, 8);
 std::memcpy(temp+HighOffset, inBlocks, 8);
-uint32x4_p ctr = (uint32x4_p)VectorLoadBE(temp);
+uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);

 // For 64-bit block ciphers we need to load the CTR block,
 // which is 8 bytes. After the dup load we have two counters
 // in the Altivec word. Then we need to increment the low ctr
 // by 0 and the high ctr by 1.
-block0 = VectorAdd(s_one, ctr);
+block0 = VecAdd(s_one, ctr);

 // After initial increment of {0,1} remaining counters
 // increment by {2,2}.
-block1 = VectorAdd(s_two, block0);
+block1 = VecAdd(s_two, block0);
-block2 = VectorAdd(s_two, block1);
+block2 = VecAdd(s_two, block1);
-block3 = VectorAdd(s_two, block2);
+block3 = VecAdd(s_two, block2);
-block4 = VectorAdd(s_two, block3);
+block4 = VecAdd(s_two, block3);
-block5 = VectorAdd(s_two, block4);
+block5 = VecAdd(s_two, block4);

 // Update the counter in the caller.
 const_cast<byte*>(inBlocks)[7] += 12;
 }
 else
 {
-block0 = VectorLoadBE(inBlocks);
+block0 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block1 = VectorLoadBE(inBlocks);
+block1 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block2 = VectorLoadBE(inBlocks);
+block2 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block3 = VectorLoadBE(inBlocks);
+block3 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block4 = VectorLoadBE(inBlocks);
+block4 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block5 = VectorLoadBE(inBlocks);
+block5 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
 }

 if (xorInput)
 {
-block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
+block0 = VecXor(block0, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
+block1 = VecXor(block1, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
+block2 = VecXor(block2, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
+block3 = VecXor(block3, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
+block4 = VecXor(block4, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
+block5 = VecXor(block5, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 }
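The comments in this hunk describe the counter schedule for the six-vector path: the 8-byte counter is duplicated into both 64-bit halves of one vector, the two copies are first offset by {0,1}, and each subsequent vector advances the pair by {2,2}. Since every 128-bit vector carries two 64-bit cipher blocks, six vectors cover twelve blocks, which is why the caller's counter byte is bumped by 12. A standalone scalar model of that schedule (illustrative only; load_be64 is a hypothetical helper, not a library function):

uint64_t ctr = load_be64(ivBytes);   // hypothetical big-endian 8-byte load
uint64_t lo  = ctr + 0;              // low counter in the first vector
uint64_t hi  = ctr + 1;              // high counter in the first vector
for (int i = 1; i < 6; ++i) {
    lo += 2; hi += 2;                // vectors 1..5 step the pair by {2,2}
}
// Afterwards the caller's in-memory counter byte is advanced by 12 (6 vectors x 2 blocks).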
@@ -1912,31 +1912,31 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,

 if (xorOutput)
 {
-block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
+block0 = VecXor(block0, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
+block1 = VecXor(block1, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
+block2 = VecXor(block2, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
+block3 = VecXor(block3, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
+block4 = VecXor(block4, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
+block5 = VecXor(block5, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 }

-VectorStoreBE(block0, outBlocks);
+VecStoreBE(block0, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block1, outBlocks);
+VecStoreBE(block1, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block2, outBlocks);
+VecStoreBE(block2, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block3, outBlocks);
+VecStoreBE(block3, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block4, outBlocks);
+VecStoreBE(block4, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block5, outBlocks);
+VecStoreBE(block5, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);

 length -= 6*vsxBlockSize;
@@ -1951,34 +1951,34 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
 // even harder without POWER8 due to lack of 64-bit elements.
 std::memcpy(temp+LowOffset, inBlocks, 8);
 std::memcpy(temp+HighOffset, inBlocks, 8);
-uint32x4_p ctr = (uint32x4_p)VectorLoadBE(temp);
+uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);

 // For 64-bit block ciphers we need to load the CTR block,
 // which is 8 bytes. After the dup load we have two counters
 // in the Altivec word. Then we need to increment the low ctr
 // by 0 and the high ctr by 1.
-block0 = VectorAdd(s_one, ctr);
+block0 = VecAdd(s_one, ctr);

 // After initial increment of {0,1} remaining counters
 // increment by {2,2}.
-block1 = VectorAdd(s_two, block0);
+block1 = VecAdd(s_two, block0);

 // Update the counter in the caller.
 const_cast<byte*>(inBlocks)[7] += 4;
 }
 else
 {
-block0 = VectorLoadBE(inBlocks);
+block0 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block1 = VectorLoadBE(inBlocks);
+block1 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
 }

 if (xorInput)
 {
-block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
+block0 = VecXor(block0, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
+block1 = VecXor(block1, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 }
@@ -1986,15 +1986,15 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,

 if (xorOutput)
 {
-block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
+block0 = VecXor(block0, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
+block1 = VecXor(block1, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 }

-VectorStoreBE(block0, outBlocks);
+VecStoreBE(block0, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block1, outBlocks);
+VecStoreBE(block1, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);

 length -= 2*vsxBlockSize;
@@ -2030,14 +2030,14 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
 // initialize the block then it generates warnings.
 std::memcpy(temp+LowOffset, inBlocks, 8);
 std::memcpy(temp+HighOffset, inBlocks, 8); // don't care
-block = (uint32x4_p)VectorLoadBE(temp);
+block = (uint32x4_p)VecLoadBE(temp);

 if (xorInput)
 {
 std::memcpy(temp+LowOffset, xorBlocks, 8);
 std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
-uint32x4_p x = (uint32x4_p)VectorLoadBE(temp);
+uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
-block = VectorXor(block, x);
+block = VecXor(block, x);
 }

 // Update the counter in the caller.
@@ -2050,11 +2050,11 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
 {
 std::memcpy(temp+LowOffset, xorBlocks, 8);
 std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
-uint32x4_p x = (uint32x4_p)VectorLoadBE(temp);
+uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
-block = VectorXor(block, x);
+block = VecXor(block, x);
 }

-VectorStoreBE(block, temp);
+VecStoreBE(block, temp);
 std::memcpy(outBlocks, temp+LowOffset, 8);

 inBlocks = PtrAdd(inBlocks, inIncrement);
@@ -2120,10 +2120,10 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,

 if (flags & BT_InBlockIsCounter)
 {
-block0 = VectorLoadBE(inBlocks);
+block0 = VecLoadBE(inBlocks);
-block1 = VectorAdd(block0, s_one);
+block1 = VecAdd(block0, s_one);
-block2 = VectorAdd(block1, s_one);
+block2 = VecAdd(block1, s_one);
-block3 = VectorAdd(block2, s_one);
+block3 = VecAdd(block2, s_one);

 // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
 // CTR_ModePolicy::OperateKeystream is wired such that after
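Each successive 128-bit counter block above is produced with a single vector add of an s_one-style constant. A minimal sketch of the same idea, assuming a constant with 1 in the last 32-bit lane (the assumed layout is for illustration; the library's actual constant is defined elsewhere in adv_simd.h):

#include <altivec.h>
typedef __vector unsigned int uint32x4_p;

uint32x4_p next_ctr(const uint32x4_p block)
{
    const uint32x4_p one = {0, 0, 0, 1};   // assumed: bump the low-order counter word
    return vec_add(block, one);            // VecAdd wraps vec_add for these types
}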
@@ -2137,25 +2137,25 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
 }
 else
 {
-block0 = VectorLoadBE(inBlocks);
+block0 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block1 = VectorLoadBE(inBlocks);
+block1 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block2 = VectorLoadBE(inBlocks);
+block2 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block3 = VectorLoadBE(inBlocks);
+block3 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
 }

 if (xorInput)
 {
-block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
+block0 = VecXor(block0, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
+block1 = VecXor(block1, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
+block2 = VecXor(block2, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
+block3 = VecXor(block3, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 }
@@ -2163,23 +2163,23 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,

 if (xorOutput)
 {
-block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
+block0 = VecXor(block0, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
+block1 = VecXor(block1, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
+block2 = VecXor(block2, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
+block3 = VecXor(block3, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 }

-VectorStoreBE(block0, outBlocks);
+VecStoreBE(block0, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block1, outBlocks);
+VecStoreBE(block1, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block2, outBlocks);
+VecStoreBE(block2, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block3, outBlocks);
+VecStoreBE(block3, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);

 length -= 4*blockSize;
@@ -2188,10 +2188,10 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,

 while (length >= blockSize)
 {
-uint32x4_p block = VectorLoadBE(inBlocks);
+uint32x4_p block = VecLoadBE(inBlocks);

 if (xorInput)
-block = VectorXor(block, VectorLoadBE(xorBlocks));
+block = VecXor(block, VecLoadBE(xorBlocks));

 if (flags & BT_InBlockIsCounter)
 const_cast<byte *>(inBlocks)[15]++;
@@ -2199,9 +2199,9 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
 func1(block, subKeys, rounds);

 if (xorOutput)
-block = VectorXor(block, VectorLoadBE(xorBlocks));
+block = VecXor(block, VecLoadBE(xorBlocks));

-VectorStoreBE(block, outBlocks);
+VecStoreBE(block, outBlocks);

 inBlocks = PtrAdd(inBlocks, inIncrement);
 outBlocks = PtrAdd(outBlocks, outIncrement);
@@ -2265,12 +2265,12 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,

 if (flags & BT_InBlockIsCounter)
 {
-block0 = VectorLoadBE(inBlocks);
+block0 = VecLoadBE(inBlocks);
-block1 = VectorAdd(block0, s_one);
+block1 = VecAdd(block0, s_one);
-block2 = VectorAdd(block1, s_one);
+block2 = VecAdd(block1, s_one);
-block3 = VectorAdd(block2, s_one);
+block3 = VecAdd(block2, s_one);
-block4 = VectorAdd(block3, s_one);
+block4 = VecAdd(block3, s_one);
-block5 = VectorAdd(block4, s_one);
+block5 = VecAdd(block4, s_one);

 // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
 // CTR_ModePolicy::OperateKeystream is wired such that after
@@ -2286,38 +2286,38 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
 // the issue. If the last octet was 0xFC then 4 would trigger it.
 // We dumb-lucked into the test with SPECK-128. The test case of
 // interest is the one with IV 348ECA9766C09F04 826520DE47A212FA.
-uint8x16_p temp = VectorAdd((uint8x16_p)block5, (uint8x16_p)s_one);
+uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
-VectorStoreBE(temp, const_cast<byte*>(inBlocks));
+VecStoreBE(temp, const_cast<byte*>(inBlocks));
 }
 else
 {
-block0 = VectorLoadBE(inBlocks);
+block0 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block1 = VectorLoadBE(inBlocks);
+block1 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block2 = VectorLoadBE(inBlocks);
+block2 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block3 = VectorLoadBE(inBlocks);
+block3 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block4 = VectorLoadBE(inBlocks);
+block4 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
-block5 = VectorLoadBE(inBlocks);
+block5 = VecLoadBE(inBlocks);
 inBlocks = PtrAdd(inBlocks, inIncrement);
 }

 if (xorInput)
 {
-block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
+block0 = VecXor(block0, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
+block1 = VecXor(block1, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
+block2 = VecXor(block2, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
+block3 = VecXor(block3, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
+block4 = VecXor(block4, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
+block5 = VecXor(block5, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 }
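The counter write-back above is a byte-element add, and element-wise vector adds never carry across element boundaries; that lane behaviour is what the comment's arithmetic is probing (0xFC plus 4, and the SPECK-128 IV whose last octet is 0xFA). A standalone sketch of the lane behaviour, using plain Altivec types rather than library code:

#include <altivec.h>
typedef __vector unsigned char uint8x16_p;

uint8x16_p bump_last_byte(const uint8x16_p ctr)
{
    const uint8x16_p one = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,1};
    // Byte-wise add: a last lane holding 0xFF wraps to 0x00 and no carry
    // propagates into the neighbouring lane.
    return vec_add(ctr, one);
}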
@@ -2325,31 +2325,31 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,

 if (xorOutput)
 {
-block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
+block0 = VecXor(block0, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
+block1 = VecXor(block1, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
+block2 = VecXor(block2, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
+block3 = VecXor(block3, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block4 = VectorXor(block4, VectorLoadBE(xorBlocks));
+block4 = VecXor(block4, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
-block5 = VectorXor(block5, VectorLoadBE(xorBlocks));
+block5 = VecXor(block5, VecLoadBE(xorBlocks));
 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
 }

-VectorStoreBE(block0, outBlocks);
+VecStoreBE(block0, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block1, outBlocks);
+VecStoreBE(block1, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block2, outBlocks);
+VecStoreBE(block2, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block3, outBlocks);
+VecStoreBE(block3, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block4, outBlocks);
+VecStoreBE(block4, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);
-VectorStoreBE(block5, outBlocks);
+VecStoreBE(block5, outBlocks);
 outBlocks = PtrAdd(outBlocks, outIncrement);

 length -= 6*blockSize;
@@ -2358,10 +2358,10 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,

 while (length >= blockSize)
 {
-uint32x4_p block = VectorLoadBE(inBlocks);
+uint32x4_p block = VecLoadBE(inBlocks);

 if (xorInput)
-block = VectorXor(block, VectorLoadBE(xorBlocks));
+block = VecXor(block, VecLoadBE(xorBlocks));

 if (flags & BT_InBlockIsCounter)
 const_cast<byte *>(inBlocks)[15]++;
@@ -2369,9 +2369,9 @@ CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
 func1(block, subKeys, rounds);

 if (xorOutput)
-block = VectorXor(block, VectorLoadBE(xorBlocks));
+block = VecXor(block, VecLoadBE(xorBlocks));

-VectorStoreBE(block, outBlocks);
+VecStoreBE(block, outBlocks);

 inBlocks = PtrAdd(inBlocks, inIncrement);
 outBlocks = PtrAdd(outBlocks, outIncrement);
blake2b_simd.cpp (124 lines changed)
@@ -742,7 +742,7 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)

 #if (CRYPTOPP_POWER8_AVAILABLE)

-inline uint64x2_p VectorLoad64(const void* p)
+inline uint64x2_p VecLoad64(const void* p)
 {
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
 return (uint64x2_p)vec_xl(0, (uint8_t*)p);
@@ -751,18 +751,18 @@ inline uint64x2_p VectorLoad64(const void* p)
 #endif
 }

-inline uint64x2_p VectorLoad64LE(const void* p)
+inline uint64x2_p VecLoad64LE(const void* p)
 {
 #if __BIG_ENDIAN__
 const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
-const uint64x2_p v = VectorLoad64(p);
+const uint64x2_p v = VecLoad64(p);
-return vec_perm(v, v, m);
+return VecPermute(v, v, m);
 #else
-return VectorLoad64(p);
+return VecLoad64(p);
 #endif
 }

-inline void VectorStore64(void* p, const uint64x2_p x)
+inline void VecStore64(void* p, const uint64x2_p x)
 {
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
 vec_xst((uint8x16_p)x,0,(uint8_t*)p);
@@ -771,18 +771,18 @@ inline void VectorStore64(void* p, const uint64x2_p x)
 #endif
 }

-inline void VectorStore64LE(void* p, const uint64x2_p x)
+inline void VecStore64LE(void* p, const uint64x2_p x)
 {
 #if __BIG_ENDIAN__
 const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
-VectorStore64(p, vec_perm(x, x, m));
+VecStore64(p, VecPermute(x, x, m));
 #else
-VectorStore64(p, x);
+VecStore64(p, x);
 #endif
 }

 template <unsigned int C>
-inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
+inline uint64x2_p VecShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
 {
 #if __BIG_ENDIAN__
 return (uint64x2_p)vec_sld((uint8x16_p)a, (uint8x16_p)b, C);
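The mask {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8} reverses the bytes inside each 64-bit lane, which is how the LE load/store helpers above convert between the big-endian register view and BLAKE2b's little-endian words. A standalone illustration with raw Altivec intrinsics (assumed equivalent to the VecPermute wrapper, and assuming a POWER8/VSX target to match the guard above):

#include <altivec.h>
typedef __vector unsigned char uint8x16_p;
typedef __vector unsigned long long uint64x2_p;

uint64x2_p swap_bytes_per_lane(const uint64x2_p v)
{
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    // Result byte i is source byte m[i]: bytes 0..7 and 8..15 are each reversed.
    return (uint64x2_p)vec_perm((uint8x16_p)v, (uint8x16_p)v, m);
}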
@@ -791,18 +791,18 @@ inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
 #endif
 }

-#define vec_shl_octet(a,b,c) VectorShiftLeftOctet<c*8>(a, b)
+#define vec_shl_octet(a,b,c) VecShiftLeftOctet<c*8>(a, b)

-// vec_mergeh(a,b) is equivalent to vec_perm(a,b,HH_MASK); and
+// vec_mergeh(a,b) is equivalent to VecPermute(a,b,HH_MASK); and
-// vec_mergel(a,b) is equivalent vec_perm(a,b,LL_MASK). Benchmarks
+// vec_mergel(a,b) is equivalent VecPermute(a,b,LL_MASK). Benchmarks
 // show vec_mergeh and vec_mergel is faster on little-endian
-// machines by 0.4 cpb. Benchmarks show vec_perm is faster on
+// machines by 0.4 cpb. Benchmarks show VecPermute is faster on
 // big-endian machines by 1.5 cpb. The code that uses
 // vec_mergeh and vec_mergel is about 880 bytes shorter.

 #if defined(__GNUC__) && (__BIG_ENDIAN__)
-# define vec_merge_hi(a,b) vec_perm(a,b, HH_MASK)
+# define vec_merge_hi(a,b) VecPermute(a,b, HH_MASK)
-# define vec_merge_lo(a,b) vec_perm(a,b, LL_MASK)
+# define vec_merge_lo(a,b) VecPermute(a,b, LL_MASK)
 #else
 # define vec_merge_hi(a,b) vec_mergeh(a,b)
 # define vec_merge_lo(a,b) vec_mergel(a,b)
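For two-element 64-bit vectors, vec_mergeh(a,b) yields {a[0], b[0]} and vec_mergel(a,b) yields {a[1], b[1]}, so the permute masks the comment refers to would select the corresponding byte ranges. Plausible mask values for illustration only (HH_MASK and LL_MASK are defined elsewhere in this file; these literals are an assumption):

// Bytes 0..7 of a followed by bytes 0..7 of b -> behaves like vec_mergeh on 64-bit lanes.
const uint8x16_p HH_MASK = {0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23};
// Bytes 8..15 of a followed by bytes 8..15 of b -> behaves like vec_mergel on 64-bit lanes.
const uint8x16_p LL_MASK = {8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31};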
@@ -878,12 +878,12 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
 do { \
 b0 = vec_merge_hi(m4, m0); \
-b1 = vec_perm(m1, m6, HL_MASK); \
+b1 = VecPermute(m1, m6, HL_MASK); \
 } while(0)

 #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
 do { \
-b0 = vec_perm(m5, m1, HL_MASK); \
+b0 = VecPermute(m5, m1, HL_MASK); \
 b1 = vec_merge_lo(m3, m4); \
 } while(0)

@@ -907,8 +907,8 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)

 #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
 do { \
-b0 = vec_perm(m1, m2, HL_MASK); \
+b0 = VecPermute(m1, m2, HL_MASK); \
-b1 = vec_perm(m2, m7, HL_MASK); \
+b1 = VecPermute(m2, m7, HL_MASK); \
 } while(0)

 #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
@@ -925,20 +925,20 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)

 #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
 do { \
-b0 = vec_perm(m0, m3, HL_MASK); \
+b0 = VecPermute(m0, m3, HL_MASK); \
-b1 = vec_perm(m2, m7, HL_MASK); \
+b1 = VecPermute(m2, m7, HL_MASK); \
 } while(0)

 #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
 do { \
-b0 = vec_perm(m7, m5, HL_MASK); \
+b0 = VecPermute(m7, m5, HL_MASK); \
-b1 = vec_perm(m3, m1, HL_MASK); \
+b1 = VecPermute(m3, m1, HL_MASK); \
 } while(0)

 #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
 do { \
 b0 = vec_shl_octet(m0, m6, 1); \
-b1 = vec_perm(m4, m6, HL_MASK); \
+b1 = VecPermute(m4, m6, HL_MASK); \
 } while(0)

 #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
@@ -955,19 +955,19 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)

 #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
 do { \
-b0 = vec_perm(m2, m3, HL_MASK); \
+b0 = VecPermute(m2, m3, HL_MASK); \
 b1 = vec_merge_lo(m7, m0); \
 } while(0)

 #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
 do { \
 b0 = vec_merge_lo(m6, m2); \
-b1 = vec_perm(m7, m4, HL_MASK); \
+b1 = VecPermute(m7, m4, HL_MASK); \
 } while(0)

 #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
 do { \
-b0 = vec_perm(m6, m0, HL_MASK); \
+b0 = VecPermute(m6, m0, HL_MASK); \
 b1 = vec_merge_hi(m7, m2); \
 } while(0)

@@ -986,13 +986,13 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
 do { \
 b0 = vec_merge_lo(m3, m1); \
-b1 = vec_perm(m1, m5, HL_MASK); \
+b1 = VecPermute(m1, m5, HL_MASK); \
 } while(0)

 #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
 do { \
 b0 = vec_merge_lo(m6, m3); \
-b1 = vec_perm(m6, m1, HL_MASK); \
+b1 = VecPermute(m6, m1, HL_MASK); \
 } while(0)

 #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
@@ -1033,7 +1033,7 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)

 #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
 do { \
-b0 = vec_perm(m1, m3, HL_MASK); \
+b0 = VecPermute(m1, m3, HL_MASK); \
 b1 = m2; \
 } while(0)

@@ -1046,7 +1046,7 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
 do { \
 b0 = vec_merge_hi(m1, m2); \
-b1 = vec_perm(m3, m2, HL_MASK); \
+b1 = VecPermute(m3, m2, HL_MASK); \
 } while(0)

 #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
@@ -1122,23 +1122,23 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)

 #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
 do { \
-row1l = vec_add(vec_add(row1l, b0), row2l); \
+row1l = VecAdd(VecAdd(row1l, b0), row2l); \
-row1h = vec_add(vec_add(row1h, b1), row2h); \
+row1h = VecAdd(VecAdd(row1h, b1), row2h); \
-row4l = vec_xor(row4l, row1l); row4h = vec_xor(row4h, row1h); \
+row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
 row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \
-row3l = vec_add(row3l, row4l); row3h = vec_add(row3h, row4h); \
+row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
-row2l = vec_xor(row2l, row3l); row2h = vec_xor(row2h, row3h); \
+row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
 row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \
 } while(0)

 #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
 do { \
-row1l = vec_add(vec_add(row1l, b0), row2l); \
+row1l = VecAdd(VecAdd(row1l, b0), row2l); \
-row1h = vec_add(vec_add(row1h, b1), row2h); \
+row1h = VecAdd(VecAdd(row1h, b1), row2h); \
-row4l = vec_xor(row4l, row1l); row4h = vec_xor(row4h, row1h); \
+row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
 row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \
-row3l = vec_add(row3l, row4l); row3h = vec_add(row3h, row4h); \
+row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
-row2l = vec_xor(row2l, row3l); row2h = vec_xor(row2h, row3h); \
+row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
 row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \
 } while(0)

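BLAKE2B_G1 and BLAKE2B_G2 together are the standard BLAKE2b G function, vectorized two columns at a time (the l/h register pairs) with rotations by 32, 24, 16 and 63. The scalar reference they correspond to, for comparison:

#include <cstdint>

static inline uint64_t ror64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

// One BLAKE2b G on working words a, b, c, d with message words x, y.
static void blake2b_g(uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d,
                      uint64_t x, uint64_t y)
{
    a = a + b + x;  d = ror64(d ^ a, 32);  c = c + d;  b = ror64(b ^ c, 24);   // G1 half
    a = a + b + y;  d = ror64(d ^ a, 16);  c = c + d;  b = ror64(b ^ c, 63);   // G2 half
}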
@@ -1175,27 +1175,27 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
 } while(0)

-const uint64x2_p m0 = VectorLoad64LE(input + 00);
+const uint64x2_p m0 = VecLoad64LE(input + 00);
-const uint64x2_p m1 = VectorLoad64LE(input + 16);
+const uint64x2_p m1 = VecLoad64LE(input + 16);
-const uint64x2_p m2 = VectorLoad64LE(input + 32);
+const uint64x2_p m2 = VecLoad64LE(input + 32);
-const uint64x2_p m3 = VectorLoad64LE(input + 48);
+const uint64x2_p m3 = VecLoad64LE(input + 48);
-const uint64x2_p m4 = VectorLoad64LE(input + 64);
+const uint64x2_p m4 = VecLoad64LE(input + 64);
-const uint64x2_p m5 = VectorLoad64LE(input + 80);
+const uint64x2_p m5 = VecLoad64LE(input + 80);
-const uint64x2_p m6 = VectorLoad64LE(input + 96);
+const uint64x2_p m6 = VecLoad64LE(input + 96);
-const uint64x2_p m7 = VectorLoad64LE(input + 112);
+const uint64x2_p m7 = VecLoad64LE(input + 112);

 uint64x2_p row1l, row1h, row2l, row2h;
 uint64x2_p row3l, row3h, row4l, row4h;

-const uint64x2_p h0 = row1l = VectorLoad64LE(&state.h[0]);
+const uint64x2_p h0 = row1l = VecLoad64LE(&state.h[0]);
-const uint64x2_p h1 = row1h = VectorLoad64LE(&state.h[2]);
+const uint64x2_p h1 = row1h = VecLoad64LE(&state.h[2]);
-const uint64x2_p h2 = row2l = VectorLoad64LE(&state.h[4]);
+const uint64x2_p h2 = row2l = VecLoad64LE(&state.h[4]);
-const uint64x2_p h3 = row2h = VectorLoad64LE(&state.h[6]);
+const uint64x2_p h3 = row2h = VecLoad64LE(&state.h[6]);

-row3l = VectorLoad64(&BLAKE2B_IV[0]);
+row3l = VecLoad64(&BLAKE2B_IV[0]);
-row3h = VectorLoad64(&BLAKE2B_IV[2]);
+row3h = VecLoad64(&BLAKE2B_IV[2]);
-row4l = vec_xor(VectorLoad64(&BLAKE2B_IV[4]), VectorLoad64(&state.tf[0]));
+row4l = VecXor(VecLoad64(&BLAKE2B_IV[4]), VecLoad64(&state.tf[0]));
-row4h = vec_xor(VectorLoad64(&BLAKE2B_IV[6]), VectorLoad64(&state.tf[2]));
+row4h = VecXor(VecLoad64(&BLAKE2B_IV[6]), VecLoad64(&state.tf[2]));

 BLAKE2B_ROUND(0);
 BLAKE2B_ROUND(1);
@@ -1210,10 +1210,10 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 BLAKE2B_ROUND(10);
 BLAKE2B_ROUND(11);

-VectorStore64LE(&state.h[0], vec_xor(h0, vec_xor(row1l, row3l)));
+VecStore64LE(&state.h[0], VecXor(h0, VecXor(row1l, row3l)));
-VectorStore64LE(&state.h[2], vec_xor(h1, vec_xor(row1h, row3h)));
+VecStore64LE(&state.h[2], VecXor(h1, VecXor(row1h, row3h)));
-VectorStore64LE(&state.h[4], vec_xor(h2, vec_xor(row2l, row4l)));
+VecStore64LE(&state.h[4], VecXor(h2, VecXor(row2l, row4l)));
-VectorStore64LE(&state.h[6], vec_xor(h3, vec_xor(row2h, row4h)));
+VecStore64LE(&state.h[6], VecXor(h3, VecXor(row2h, row4h)));
 }
 #endif // CRYPTOPP_POWER8_AVAILABLE

blake2s_simd.cpp (118 lines changed)
@@ -683,34 +683,34 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)

 #if (CRYPTOPP_ALTIVEC_AVAILABLE)

-inline uint32x4_p VectorLoad32(const void* p)
+inline uint32x4_p VecLoad32(const void* p)
 {
-return VectorLoad((const word32*)p);
+return VecLoad((const word32*)p);
 }

-inline uint32x4_p VectorLoad32LE(const void* p)
+inline uint32x4_p VecLoad32LE(const void* p)
 {
 #if __BIG_ENDIAN__
 const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
-const uint32x4_p v = VectorLoad((const word32*)p);
+const uint32x4_p v = VecLoad((const word32*)p);
-return vec_perm(v, v, m);
+return VecPermute(v, v, m);
 #else
-return VectorLoad((const word32*)p);
+return VecLoad((const word32*)p);
 #endif
 }

-inline void VectorStore32(void* p, const uint32x4_p x)
+inline void VecStore32(void* p, const uint32x4_p x)
 {
-VectorStore(x, (word32*)p);
+VecStore(x, (word32*)p);
 }

-inline void VectorStore32LE(void* p, const uint32x4_p x)
+inline void VecStore32LE(void* p, const uint32x4_p x)
 {
 #if __BIG_ENDIAN__
 const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
-VectorStore(vec_perm(x, x, m), (word32*)p);
+VecStore(VecPermute(x, x, m), (word32*)p);
 #else
-VectorStore(x, (word32*)p);
+VecStore(x, (word32*)p);
 #endif
 }

@@ -718,7 +718,7 @@ template <unsigned int E1, unsigned int E2>
 inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b)
 {
 // Re-index. I'd like to use something like Z=Y*4 and then
-// VectorShiftLeftOctet<Z>(b) but it crashes early Red Hat
+// VecShiftLeftOctet<Z>(b) but it crashes early Red Hat
 // GCC compilers.
 enum {X=E1&3, Y=E2&3};

@@ -729,88 +729,88 @@ inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b)
 if (X == 0 && Y == 0)
 {
 const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, b, mask);
+return VecPermute(a, b, mask);
 }
 else if (X == 0 && Y == 1)
 {
 const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
 }
 else if (X == 0 && Y == 2)
 {
 const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
 }
 else if (X == 0 && Y == 3)
 {
 const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
 }

 // Element 1 combinations
 else if (X == 1 && Y == 0)
 {
 const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, b, mask);
+return VecPermute(a, b, mask);
 }
 else if (X == 1 && Y == 1)
 {
 const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
 }
 else if (X == 1 && Y == 2)
 {
 const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
 }
 else if (X == 1 && Y == 3)
 {
 const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
 }

 // Element 2 combinations
 else if (X == 2 && Y == 0)
 {
 const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, b, mask);
+return VecPermute(a, b, mask);
 }
 else if (X == 2 && Y == 1)
 {
 const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
 }
 else if (X == 2 && Y == 2)
 {
 const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
 }
 else if (X == 2 && Y == 3)
 {
 const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
 }

 // Element 3 combinations
 else if (X == 3 && Y == 0)
 {
 const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, b, mask);
+return VecPermute(a, b, mask);
 }
 else if (X == 3 && Y == 1)
 {
 const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<4>(b), mask);
 }
 else if (X == 3 && Y == 2)
 {
 const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<8>(b), mask);
 }
 else if (X == 3 && Y == 3)
 {
 const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
-return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
+return VecPermute(a, VecShiftLeftOctet<12>(b), mask);
 }
 }

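Each branch above selects 32-bit element X of a and element Y of b by naming byte positions in the permute mask: bytes 4*X..4*X+3 come from the first operand, bytes 16..19 from the second after b has been shifted so the wanted element sits in slot 0 (big-endian lane numbering, as these files use); the upper two result words are don't-care. A scalar model of the same selection, illustrative only:

#include <cstdint>

// Model: r[0] = a[X], r[1] = b[Y]; r[2] and r[3] are unspecified in the vector code.
static void vector_set32_model(const uint32_t a[4], const uint32_t b[4],
                               unsigned X, unsigned Y, uint32_t r[4])
{
    r[0] = a[X & 3];
    r[1] = b[Y & 3];
    r[2] = r[3] = 0;   // "don't care" lanes, zeroed here for the model
}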
@@ -826,7 +826,7 @@ inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b,

 // Power7 follows SSE2's implementation, and this is _mm_set_epi32.
 const uint8x16_p mask = {20,21,22,23, 16,17,18,19, 4,5,6,7, 0,1,2,3};
-return vec_perm(t0, t1, mask);
+return VecPermute(t0, t1, mask);
 }

 template<>
@@ -835,7 +835,7 @@ uint32x4_p VectorSet32<2,0,2,0>(const uint32x4_p a, const uint32x4_p b,
 {
 // a=b, c=d, mask is {2,0, 2,0}
 const uint8x16_p mask = {16,17,18,19, 24,25,26,27, 0,1,2,3, 8,9,10,11};
-return vec_perm(a, c, mask);
+return VecPermute(a, c, mask);
 }

 template<>
@@ -844,7 +844,7 @@ uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b,
 {
 // a=b, c=d, mask is {3,1, 3,1}
 const uint8x16_p mask = {20,21,22,23, 28,29,30,31, 4,5,6,7, 12,13,14,15};
-return vec_perm(a, c, mask);
+return VecPermute(a, c, mask);
 }

 void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
@@ -919,25 +919,25 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
 #define BLAKE2S_LOAD_MSG_9_3(buf) buf = VectorSet32<13,3,9,15>(m13,m3,m9,m15)
 #define BLAKE2S_LOAD_MSG_9_4(buf) buf = VectorSet32<0,12,14,11>(m0,m12,m14,m11)

-#define vec_ror_16(x) VectorRotateRight<16>(x)
+#define vec_ror_16(x) VecRotateRight<16>(x)
-#define vec_ror_12(x) VectorRotateRight<12>(x)
+#define vec_ror_12(x) VecRotateRight<12>(x)
-#define vec_ror_8(x) VectorRotateRight<8>(x)
+#define vec_ror_8(x) VecRotateRight<8>(x)
-#define vec_ror_7(x) VectorRotateRight<7>(x)
+#define vec_ror_7(x) VecRotateRight<7>(x)

 #define BLAKE2S_G1(row1,row2,row3,row4,buf) \
-row1 = vec_add(vec_add(row1, buf), row2); \
+row1 = VecAdd(VecAdd(row1, buf), row2); \
-row4 = vec_xor(row4, row1); \
+row4 = VecXor(row4, row1); \
 row4 = vec_ror_16(row4); \
-row3 = vec_add(row3, row4); \
+row3 = VecAdd(row3, row4); \
-row2 = vec_xor(row2, row3); \
+row2 = VecXor(row2, row3); \
 row2 = vec_ror_12(row2);

 #define BLAKE2S_G2(row1,row2,row3,row4,buf) \
-row1 = vec_add(vec_add(row1, buf), row2); \
+row1 = VecAdd(VecAdd(row1, buf), row2); \
-row4 = vec_xor(row4, row1); \
+row4 = VecXor(row4, row1); \
 row4 = vec_ror_8(row4); \
-row3 = vec_add(row3, row4); \
+row3 = VecAdd(row3, row4); \
-row2 = vec_xor(row2, row3); \
+row2 = VecXor(row2, row3); \
 row2 = vec_ror_7(row2);

 const uint8x16_p D2103_MASK = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
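VecRotateRight<N> (the renamed VectorRotateRight<N>) rotates each 32-bit lane right by N bits; on Altivec this is commonly expressed with the left-rotate intrinsic and a complementary count. A standalone sketch of that idea, using only the vec_rl/vec_splats intrinsics (the real wrapper lives in Crypto++'s PPC header and may differ):

#include <altivec.h>
typedef __vector unsigned int uint32x4_p;

template <unsigned int N>
inline uint32x4_p rotate_right32(const uint32x4_p x)
{
    // Rotate right by N == rotate left by 32 - N in every 32-bit lane.
    return vec_rl(x, vec_splats(32U - N));
}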
@@ -945,14 +945,14 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
 const uint8x16_p D0321_MASK = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};

 #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
-row4 = vec_perm(row4, row4, D2103_MASK); \
+row4 = VecPermute(row4, row4, D2103_MASK); \
-row3 = vec_perm(row3, row3, D1032_MASK); \
+row3 = VecPermute(row3, row3, D1032_MASK); \
-row2 = vec_perm(row2, row2, D0321_MASK);
+row2 = VecPermute(row2, row2, D0321_MASK);

 #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
-row4 = vec_perm(row4, row4, D0321_MASK); \
+row4 = VecPermute(row4, row4, D0321_MASK); \
-row3 = vec_perm(row3, row3, D1032_MASK); \
+row3 = VecPermute(row3, row3, D1032_MASK); \
-row2 = vec_perm(row2, row2, D2103_MASK);
+row2 = VecPermute(row2, row2, D2103_MASK);

 #define BLAKE2S_ROUND(r) \
 BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
@@ -970,15 +970,15 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
 uint32x4_p buf1, buf2, buf3, buf4;
 uint32x4_p ff0, ff1;

-const uint32x4_p m0 = VectorLoad32LE(input + 0);
+const uint32x4_p m0 = VecLoad32LE(input + 0);
-const uint32x4_p m4 = VectorLoad32LE(input + 16);
+const uint32x4_p m4 = VecLoad32LE(input + 16);
-const uint32x4_p m8 = VectorLoad32LE(input + 32);
+const uint32x4_p m8 = VecLoad32LE(input + 32);
-const uint32x4_p m12 = VectorLoad32LE(input + 48);
+const uint32x4_p m12 = VecLoad32LE(input + 48);

-row1 = ff0 = VectorLoad32LE(&state.h[0]);
+row1 = ff0 = VecLoad32LE(&state.h[0]);
-row2 = ff1 = VectorLoad32LE(&state.h[4]);
+row2 = ff1 = VecLoad32LE(&state.h[4]);
-row3 = VectorLoad32(&BLAKE2S_IV[0]);
+row3 = VecLoad32(&BLAKE2S_IV[0]);
-row4 = vec_xor(VectorLoad32(&BLAKE2S_IV[4]), VectorLoad32(&state.tf[0]));
+row4 = VecXor(VecLoad32(&BLAKE2S_IV[4]), VecLoad32(&state.tf[0]));

 BLAKE2S_ROUND(0);
 BLAKE2S_ROUND(1);
@ -991,8 +991,8 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
|
|||||||
BLAKE2S_ROUND(8);
|
BLAKE2S_ROUND(8);
|
||||||
BLAKE2S_ROUND(9);
|
BLAKE2S_ROUND(9);
|
||||||
|
|
||||||
VectorStore32LE(&state.h[0], vec_xor(ff0, vec_xor(row1, row3)));
|
VecStore32LE(&state.h[0], VecXor(ff0, VecXor(row1, row3)));
|
||||||
VectorStore32LE(&state.h[4], vec_xor(ff1, vec_xor(row2, row4)));
|
VecStore32LE(&state.h[4], VecXor(ff1, VecXor(row2, row4)));
|
||||||
}
|
}
|
||||||
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
||||||
|
|
||||||
chacha_simd.cpp (335 changed lines)
@@ -206,7 +206,7 @@ inline __m128i RotateLeft<16>(const __m128i val)
 #if (CRYPTOPP_ALTIVEC_AVAILABLE)

 // ChaCha_OperateKeystream_POWER7 is optimized for POWER7. However, Altivec
-// is supported by using vec_ld and vec_st, and using a composite vec_add
+// is supported by using vec_ld and vec_st, and using a composite VecAdd
 // that supports 64-bit element adds. vec_ld and vec_st add significant
 // overhead when memory is not aligned. Despite the drawbacks Altivec
 // is profitable. The numbers for ChaCha8 are:
@@ -216,33 +216,34 @@ inline __m128i RotateLeft<16>(const __m128i val)

 using CryptoPP::uint8x16_p;
 using CryptoPP::uint32x4_p;
-using CryptoPP::VectorLoad;
+using CryptoPP::VecLoad;
-using CryptoPP::VectorStore;
+using CryptoPP::VecStore;
+using CryptoPP::VecPermute;

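The comment above mentions a composite VecAdd for 64-bit element adds. As a rough sketch only, and not the library's actual implementation: on POWER8 the add is a single vector instruction once the four 32-bit lanes are reinterpreted as two 64-bit lanes, while earlier targets have to build it from 32-bit adds and propagate the carry by hand. The helper name below is hypothetical.

#if defined(_ARCH_PWR8)
// Hypothetical helper, illustration only: view the lanes as two 64-bit
// elements and let vec_add perform a true 64-bit addition (vaddudm).
inline uint32x4_p VecAdd64_Sketch(const uint32x4_p& a, const uint32x4_p& b)
{
    return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b);
}
#endif

Pre-POWER8, the same helper would add the low 32-bit lanes, detect the unsigned wrap-around, and fold the carry into the high lanes; that composite form is what the comment refers to.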
 // Permutes bytes in packed 32-bit words to little endian.
 // State is already in proper endian order. Input and
 // output must be permuted during load and save.
-inline uint32x4_p VectorLoad32LE(const uint8_t src[16])
+inline uint32x4_p VecLoad32LE(const uint8_t src[16])
 {
 #if (CRYPTOPP_BIG_ENDIAN)
 const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
-const uint32x4_p val = VectorLoad(src);
+const uint32x4_p val = VecLoad(src);
-return vec_perm(val, val, mask);
+return VecPermute(val, val, mask);
 #else
-return VectorLoad(src);
+return VecLoad(src);
 #endif
 }

 // Permutes bytes in packed 32-bit words to little endian.
 // State is already in proper endian order. Input and
 // output must be permuted during load and save.
-inline void VectorStore32LE(uint8_t dest[16], const uint32x4_p& val)
+inline void VecStore32LE(uint8_t dest[16], const uint32x4_p& val)
 {
 #if (CRYPTOPP_BIG_ENDIAN)
 const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
-VectorStore(vec_perm(val, val, mask), dest);
+VecStore(VecPermute(val, val, mask), dest);
 #else
-return VectorStore(val, dest);
+return VecStore(val, dest);
 #endif
 }

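A quick usage sketch, hypothetical and not part of the file: the pair of helpers above lets a caller XOR a 16-byte keystream block into message bytes while keeping the word order consistent on both endiannesses. It assumes VecXor from ppc_simd.h is also visible here.

// Illustration only: encrypt one 16-byte block in place with a keystream word vector.
inline void Xor16LE_Sketch(uint8_t inout[16], const uint32x4_p& keystream)
{
    const uint32x4_p m = VecLoad32LE(inout);    // message words, little endian
    VecStore32LE(inout, VecXor(m, keystream));  // write the encrypted words back
}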
@@ -262,21 +263,21 @@
 inline uint32x4_p Shuffle<1>(const uint32x4_p& val)
 {
 const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
-return vec_perm(val, val, mask);
+return VecPermute(val, val, mask);
 }

 template <>
 inline uint32x4_p Shuffle<2>(const uint32x4_p& val)
 {
 const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
-return vec_perm(val, val, mask);
+return VecPermute(val, val, mask);
 }

 template <>
 inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
 {
 const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
-return vec_perm(val, val, mask);
+return VecPermute(val, val, mask);
 }

 #endif // CRYPTOPP_ALTIVEC_AVAILABLE
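A small sanity sketch of the Shuffle helpers above, illustrative only: Shuffle<1> and Shuffle<3> rotate the four 32-bit lanes by one and by three positions, so they are inverses of each other. This lane rotation is what ChaCha uses to diagonalize and then un-diagonalize its state between the column and diagonal rounds.

// Illustration only: chaining the two inverse rotations returns the original vector.
const uint32x4_p s = {1, 2, 3, 4};
const uint32x4_p t = Shuffle<3>(Shuffle<1>(s));   // t holds the same lanes as s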
@@ -825,10 +826,10 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *

 void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
 {
-const uint32x4_p state0 = VectorLoad(state + 0*4);
+const uint32x4_p state0 = VecLoad(state + 0*4);
-const uint32x4_p state1 = VectorLoad(state + 1*4);
+const uint32x4_p state1 = VecLoad(state + 1*4);
-const uint32x4_p state2 = VectorLoad(state + 2*4);
+const uint32x4_p state2 = VecLoad(state + 2*4);
-const uint32x4_p state3 = VectorLoad(state + 3*4);
+const uint32x4_p state3 = VecLoad(state + 3*4);

 const uint32x4_p CTRS[3] = {
 {1,0,0,0}, {2,0,0,0}, {3,0,0,0}
@@ -842,79 +843,79 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
uint32x4_p r1_0 = state0;
|
uint32x4_p r1_0 = state0;
|
||||||
uint32x4_p r1_1 = state1;
|
uint32x4_p r1_1 = state1;
|
||||||
uint32x4_p r1_2 = state2;
|
uint32x4_p r1_2 = state2;
|
||||||
uint32x4_p r1_3 = VectorAdd64(r0_3, CTRS[0]);
|
uint32x4_p r1_3 = VecAdd64(r0_3, CTRS[0]);
|
||||||
|
|
||||||
uint32x4_p r2_0 = state0;
|
uint32x4_p r2_0 = state0;
|
||||||
uint32x4_p r2_1 = state1;
|
uint32x4_p r2_1 = state1;
|
||||||
uint32x4_p r2_2 = state2;
|
uint32x4_p r2_2 = state2;
|
||||||
uint32x4_p r2_3 = VectorAdd64(r0_3, CTRS[1]);
|
uint32x4_p r2_3 = VecAdd64(r0_3, CTRS[1]);
|
||||||
|
|
||||||
uint32x4_p r3_0 = state0;
|
uint32x4_p r3_0 = state0;
|
||||||
uint32x4_p r3_1 = state1;
|
uint32x4_p r3_1 = state1;
|
||||||
uint32x4_p r3_2 = state2;
|
uint32x4_p r3_2 = state2;
|
||||||
uint32x4_p r3_3 = VectorAdd64(r0_3, CTRS[2]);
|
uint32x4_p r3_3 = VecAdd64(r0_3, CTRS[2]);
|
||||||
|
|
||||||
for (int i = static_cast<int>(rounds); i > 0; i -= 2)
|
for (int i = static_cast<int>(rounds); i > 0; i -= 2)
|
||||||
{
|
{
|
||||||
r0_0 = VectorAdd(r0_0, r0_1);
|
r0_0 = VecAdd(r0_0, r0_1);
|
||||||
r1_0 = VectorAdd(r1_0, r1_1);
|
r1_0 = VecAdd(r1_0, r1_1);
|
||||||
r2_0 = VectorAdd(r2_0, r2_1);
|
r2_0 = VecAdd(r2_0, r2_1);
|
||||||
r3_0 = VectorAdd(r3_0, r3_1);
|
r3_0 = VecAdd(r3_0, r3_1);
|
||||||
|
|
||||||
r0_3 = VectorXor(r0_3, r0_0);
|
r0_3 = VecXor(r0_3, r0_0);
|
||||||
r1_3 = VectorXor(r1_3, r1_0);
|
r1_3 = VecXor(r1_3, r1_0);
|
||||||
r2_3 = VectorXor(r2_3, r2_0);
|
r2_3 = VecXor(r2_3, r2_0);
|
||||||
r3_3 = VectorXor(r3_3, r3_0);
|
r3_3 = VecXor(r3_3, r3_0);
|
||||||
|
|
||||||
r0_3 = VectorRotateLeft<16>(r0_3);
|
r0_3 = VecRotateLeft<16>(r0_3);
|
||||||
r1_3 = VectorRotateLeft<16>(r1_3);
|
r1_3 = VecRotateLeft<16>(r1_3);
|
||||||
r2_3 = VectorRotateLeft<16>(r2_3);
|
r2_3 = VecRotateLeft<16>(r2_3);
|
||||||
r3_3 = VectorRotateLeft<16>(r3_3);
|
r3_3 = VecRotateLeft<16>(r3_3);
|
||||||
|
|
||||||
r0_2 = VectorAdd(r0_2, r0_3);
|
r0_2 = VecAdd(r0_2, r0_3);
|
||||||
r1_2 = VectorAdd(r1_2, r1_3);
|
r1_2 = VecAdd(r1_2, r1_3);
|
||||||
r2_2 = VectorAdd(r2_2, r2_3);
|
r2_2 = VecAdd(r2_2, r2_3);
|
||||||
r3_2 = VectorAdd(r3_2, r3_3);
|
r3_2 = VecAdd(r3_2, r3_3);
|
||||||
|
|
||||||
r0_1 = VectorXor(r0_1, r0_2);
|
r0_1 = VecXor(r0_1, r0_2);
|
||||||
r1_1 = VectorXor(r1_1, r1_2);
|
r1_1 = VecXor(r1_1, r1_2);
|
||||||
r2_1 = VectorXor(r2_1, r2_2);
|
r2_1 = VecXor(r2_1, r2_2);
|
||||||
r3_1 = VectorXor(r3_1, r3_2);
|
r3_1 = VecXor(r3_1, r3_2);
|
||||||
|
|
||||||
r0_1 = VectorRotateLeft<12>(r0_1);
|
r0_1 = VecRotateLeft<12>(r0_1);
|
||||||
r1_1 = VectorRotateLeft<12>(r1_1);
|
r1_1 = VecRotateLeft<12>(r1_1);
|
||||||
r2_1 = VectorRotateLeft<12>(r2_1);
|
r2_1 = VecRotateLeft<12>(r2_1);
|
||||||
r3_1 = VectorRotateLeft<12>(r3_1);
|
r3_1 = VecRotateLeft<12>(r3_1);
|
||||||
|
|
||||||
r0_0 = VectorAdd(r0_0, r0_1);
|
r0_0 = VecAdd(r0_0, r0_1);
|
||||||
r1_0 = VectorAdd(r1_0, r1_1);
|
r1_0 = VecAdd(r1_0, r1_1);
|
||||||
r2_0 = VectorAdd(r2_0, r2_1);
|
r2_0 = VecAdd(r2_0, r2_1);
|
||||||
r3_0 = VectorAdd(r3_0, r3_1);
|
r3_0 = VecAdd(r3_0, r3_1);
|
||||||
|
|
||||||
r0_3 = VectorXor(r0_3, r0_0);
|
r0_3 = VecXor(r0_3, r0_0);
|
||||||
r1_3 = VectorXor(r1_3, r1_0);
|
r1_3 = VecXor(r1_3, r1_0);
|
||||||
r2_3 = VectorXor(r2_3, r2_0);
|
r2_3 = VecXor(r2_3, r2_0);
|
||||||
r3_3 = VectorXor(r3_3, r3_0);
|
r3_3 = VecXor(r3_3, r3_0);
|
||||||
|
|
||||||
r0_3 = VectorRotateLeft<8>(r0_3);
|
r0_3 = VecRotateLeft<8>(r0_3);
|
||||||
r1_3 = VectorRotateLeft<8>(r1_3);
|
r1_3 = VecRotateLeft<8>(r1_3);
|
||||||
r2_3 = VectorRotateLeft<8>(r2_3);
|
r2_3 = VecRotateLeft<8>(r2_3);
|
||||||
r3_3 = VectorRotateLeft<8>(r3_3);
|
r3_3 = VecRotateLeft<8>(r3_3);
|
||||||
|
|
||||||
r0_2 = VectorAdd(r0_2, r0_3);
|
r0_2 = VecAdd(r0_2, r0_3);
|
||||||
r1_2 = VectorAdd(r1_2, r1_3);
|
r1_2 = VecAdd(r1_2, r1_3);
|
||||||
r2_2 = VectorAdd(r2_2, r2_3);
|
r2_2 = VecAdd(r2_2, r2_3);
|
||||||
r3_2 = VectorAdd(r3_2, r3_3);
|
r3_2 = VecAdd(r3_2, r3_3);
|
||||||
|
|
||||||
r0_1 = VectorXor(r0_1, r0_2);
|
r0_1 = VecXor(r0_1, r0_2);
|
||||||
r1_1 = VectorXor(r1_1, r1_2);
|
r1_1 = VecXor(r1_1, r1_2);
|
||||||
r2_1 = VectorXor(r2_1, r2_2);
|
r2_1 = VecXor(r2_1, r2_2);
|
||||||
r3_1 = VectorXor(r3_1, r3_2);
|
r3_1 = VecXor(r3_1, r3_2);
|
||||||
|
|
||||||
r0_1 = VectorRotateLeft<7>(r0_1);
|
r0_1 = VecRotateLeft<7>(r0_1);
|
||||||
r1_1 = VectorRotateLeft<7>(r1_1);
|
r1_1 = VecRotateLeft<7>(r1_1);
|
||||||
r2_1 = VectorRotateLeft<7>(r2_1);
|
r2_1 = VecRotateLeft<7>(r2_1);
|
||||||
r3_1 = VectorRotateLeft<7>(r3_1);
|
r3_1 = VecRotateLeft<7>(r3_1);
|
||||||
|
|
||||||
r0_1 = Shuffle<1>(r0_1);
|
r0_1 = Shuffle<1>(r0_1);
|
||||||
r0_2 = Shuffle<2>(r0_2);
|
r0_2 = Shuffle<2>(r0_2);
|
||||||
@ -932,65 +933,65 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
|
|||||||
r3_2 = Shuffle<2>(r3_2);
|
r3_2 = Shuffle<2>(r3_2);
|
||||||
r3_3 = Shuffle<3>(r3_3);
|
r3_3 = Shuffle<3>(r3_3);
|
||||||
|
|
||||||
r0_0 = VectorAdd(r0_0, r0_1);
|
r0_0 = VecAdd(r0_0, r0_1);
|
||||||
r1_0 = VectorAdd(r1_0, r1_1);
|
r1_0 = VecAdd(r1_0, r1_1);
|
||||||
r2_0 = VectorAdd(r2_0, r2_1);
|
r2_0 = VecAdd(r2_0, r2_1);
|
||||||
r3_0 = VectorAdd(r3_0, r3_1);
|
r3_0 = VecAdd(r3_0, r3_1);
|
||||||
|
|
||||||
r0_3 = VectorXor(r0_3, r0_0);
|
r0_3 = VecXor(r0_3, r0_0);
|
||||||
r1_3 = VectorXor(r1_3, r1_0);
|
r1_3 = VecXor(r1_3, r1_0);
|
||||||
r2_3 = VectorXor(r2_3, r2_0);
|
r2_3 = VecXor(r2_3, r2_0);
|
||||||
r3_3 = VectorXor(r3_3, r3_0);
|
r3_3 = VecXor(r3_3, r3_0);
|
||||||
|
|
||||||
r0_3 = VectorRotateLeft<16>(r0_3);
|
r0_3 = VecRotateLeft<16>(r0_3);
|
||||||
r1_3 = VectorRotateLeft<16>(r1_3);
|
r1_3 = VecRotateLeft<16>(r1_3);
|
||||||
r2_3 = VectorRotateLeft<16>(r2_3);
|
r2_3 = VecRotateLeft<16>(r2_3);
|
||||||
r3_3 = VectorRotateLeft<16>(r3_3);
|
r3_3 = VecRotateLeft<16>(r3_3);
|
||||||
|
|
||||||
r0_2 = VectorAdd(r0_2, r0_3);
|
r0_2 = VecAdd(r0_2, r0_3);
|
||||||
r1_2 = VectorAdd(r1_2, r1_3);
|
r1_2 = VecAdd(r1_2, r1_3);
|
||||||
r2_2 = VectorAdd(r2_2, r2_3);
|
r2_2 = VecAdd(r2_2, r2_3);
|
||||||
r3_2 = VectorAdd(r3_2, r3_3);
|
r3_2 = VecAdd(r3_2, r3_3);
|
||||||
|
|
||||||
r0_1 = VectorXor(r0_1, r0_2);
|
r0_1 = VecXor(r0_1, r0_2);
|
||||||
r1_1 = VectorXor(r1_1, r1_2);
|
r1_1 = VecXor(r1_1, r1_2);
|
||||||
r2_1 = VectorXor(r2_1, r2_2);
|
r2_1 = VecXor(r2_1, r2_2);
|
||||||
r3_1 = VectorXor(r3_1, r3_2);
|
r3_1 = VecXor(r3_1, r3_2);
|
||||||
|
|
||||||
r0_1 = VectorRotateLeft<12>(r0_1);
|
r0_1 = VecRotateLeft<12>(r0_1);
|
||||||
r1_1 = VectorRotateLeft<12>(r1_1);
|
r1_1 = VecRotateLeft<12>(r1_1);
|
||||||
r2_1 = VectorRotateLeft<12>(r2_1);
|
r2_1 = VecRotateLeft<12>(r2_1);
|
||||||
r3_1 = VectorRotateLeft<12>(r3_1);
|
r3_1 = VecRotateLeft<12>(r3_1);
|
||||||
|
|
||||||
r0_0 = VectorAdd(r0_0, r0_1);
|
r0_0 = VecAdd(r0_0, r0_1);
|
||||||
r1_0 = VectorAdd(r1_0, r1_1);
|
r1_0 = VecAdd(r1_0, r1_1);
|
||||||
r2_0 = VectorAdd(r2_0, r2_1);
|
r2_0 = VecAdd(r2_0, r2_1);
|
||||||
r3_0 = VectorAdd(r3_0, r3_1);
|
r3_0 = VecAdd(r3_0, r3_1);
|
||||||
|
|
||||||
r0_3 = VectorXor(r0_3, r0_0);
|
r0_3 = VecXor(r0_3, r0_0);
|
||||||
r1_3 = VectorXor(r1_3, r1_0);
|
r1_3 = VecXor(r1_3, r1_0);
|
||||||
r2_3 = VectorXor(r2_3, r2_0);
|
r2_3 = VecXor(r2_3, r2_0);
|
||||||
r3_3 = VectorXor(r3_3, r3_0);
|
r3_3 = VecXor(r3_3, r3_0);
|
||||||
|
|
||||||
r0_3 = VectorRotateLeft<8>(r0_3);
|
r0_3 = VecRotateLeft<8>(r0_3);
|
||||||
r1_3 = VectorRotateLeft<8>(r1_3);
|
r1_3 = VecRotateLeft<8>(r1_3);
|
||||||
r2_3 = VectorRotateLeft<8>(r2_3);
|
r2_3 = VecRotateLeft<8>(r2_3);
|
||||||
r3_3 = VectorRotateLeft<8>(r3_3);
|
r3_3 = VecRotateLeft<8>(r3_3);
|
||||||
|
|
||||||
r0_2 = VectorAdd(r0_2, r0_3);
|
r0_2 = VecAdd(r0_2, r0_3);
|
||||||
r1_2 = VectorAdd(r1_2, r1_3);
|
r1_2 = VecAdd(r1_2, r1_3);
|
||||||
r2_2 = VectorAdd(r2_2, r2_3);
|
r2_2 = VecAdd(r2_2, r2_3);
|
||||||
r3_2 = VectorAdd(r3_2, r3_3);
|
r3_2 = VecAdd(r3_2, r3_3);
|
||||||
|
|
||||||
r0_1 = VectorXor(r0_1, r0_2);
|
r0_1 = VecXor(r0_1, r0_2);
|
||||||
r1_1 = VectorXor(r1_1, r1_2);
|
r1_1 = VecXor(r1_1, r1_2);
|
||||||
r2_1 = VectorXor(r2_1, r2_2);
|
r2_1 = VecXor(r2_1, r2_2);
|
||||||
r3_1 = VectorXor(r3_1, r3_2);
|
r3_1 = VecXor(r3_1, r3_2);
|
||||||
|
|
||||||
r0_1 = VectorRotateLeft<7>(r0_1);
|
r0_1 = VecRotateLeft<7>(r0_1);
|
||||||
r1_1 = VectorRotateLeft<7>(r1_1);
|
r1_1 = VecRotateLeft<7>(r1_1);
|
||||||
r2_1 = VectorRotateLeft<7>(r2_1);
|
r2_1 = VecRotateLeft<7>(r2_1);
|
||||||
r3_1 = VectorRotateLeft<7>(r3_1);
|
r3_1 = VecRotateLeft<7>(r3_1);
|
||||||
|
|
||||||
r0_1 = Shuffle<3>(r0_1);
|
r0_1 = Shuffle<3>(r0_1);
|
||||||
r0_2 = Shuffle<2>(r0_2);
|
r0_2 = Shuffle<2>(r0_2);
|
||||||
@ -1009,80 +1010,80 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
|
|||||||
r3_3 = Shuffle<1>(r3_3);
|
r3_3 = Shuffle<1>(r3_3);
|
||||||
}
|
}
|
||||||
|
|
||||||
r0_0 = VectorAdd(r0_0, state0);
|
r0_0 = VecAdd(r0_0, state0);
|
||||||
r0_1 = VectorAdd(r0_1, state1);
|
r0_1 = VecAdd(r0_1, state1);
|
||||||
r0_2 = VectorAdd(r0_2, state2);
|
r0_2 = VecAdd(r0_2, state2);
|
||||||
r0_3 = VectorAdd(r0_3, state3);
|
r0_3 = VecAdd(r0_3, state3);
|
||||||
|
|
||||||
r1_0 = VectorAdd(r1_0, state0);
|
r1_0 = VecAdd(r1_0, state0);
|
||||||
r1_1 = VectorAdd(r1_1, state1);
|
r1_1 = VecAdd(r1_1, state1);
|
||||||
r1_2 = VectorAdd(r1_2, state2);
|
r1_2 = VecAdd(r1_2, state2);
|
||||||
r1_3 = VectorAdd(r1_3, state3);
|
r1_3 = VecAdd(r1_3, state3);
|
||||||
r1_3 = VectorAdd64(r1_3, CTRS[0]);
|
r1_3 = VecAdd64(r1_3, CTRS[0]);
|
||||||
|
|
||||||
r2_0 = VectorAdd(r2_0, state0);
|
r2_0 = VecAdd(r2_0, state0);
|
||||||
r2_1 = VectorAdd(r2_1, state1);
|
r2_1 = VecAdd(r2_1, state1);
|
||||||
r2_2 = VectorAdd(r2_2, state2);
|
r2_2 = VecAdd(r2_2, state2);
|
||||||
r2_3 = VectorAdd(r2_3, state3);
|
r2_3 = VecAdd(r2_3, state3);
|
||||||
r2_3 = VectorAdd64(r2_3, CTRS[1]);
|
r2_3 = VecAdd64(r2_3, CTRS[1]);
|
||||||
|
|
||||||
r3_0 = VectorAdd(r3_0, state0);
|
r3_0 = VecAdd(r3_0, state0);
|
||||||
r3_1 = VectorAdd(r3_1, state1);
|
r3_1 = VecAdd(r3_1, state1);
|
||||||
r3_2 = VectorAdd(r3_2, state2);
|
r3_2 = VecAdd(r3_2, state2);
|
||||||
r3_3 = VectorAdd(r3_3, state3);
|
r3_3 = VecAdd(r3_3, state3);
|
||||||
r3_3 = VectorAdd64(r3_3, CTRS[2]);
|
r3_3 = VecAdd64(r3_3, CTRS[2]);
|
||||||
|
|
||||||
if (input)
|
if (input)
|
||||||
{
|
{
|
||||||
r0_0 = VectorXor(VectorLoad32LE(input + 0*16), r0_0);
|
r0_0 = VecXor(VecLoad32LE(input + 0*16), r0_0);
|
||||||
r0_1 = VectorXor(VectorLoad32LE(input + 1*16), r0_1);
|
r0_1 = VecXor(VecLoad32LE(input + 1*16), r0_1);
|
||||||
r0_2 = VectorXor(VectorLoad32LE(input + 2*16), r0_2);
|
r0_2 = VecXor(VecLoad32LE(input + 2*16), r0_2);
|
||||||
r0_3 = VectorXor(VectorLoad32LE(input + 3*16), r0_3);
|
r0_3 = VecXor(VecLoad32LE(input + 3*16), r0_3);
|
||||||
}
|
}
|
||||||
|
|
||||||
VectorStore32LE(output + 0*16, r0_0);
|
VecStore32LE(output + 0*16, r0_0);
|
||||||
VectorStore32LE(output + 1*16, r0_1);
|
VecStore32LE(output + 1*16, r0_1);
|
||||||
VectorStore32LE(output + 2*16, r0_2);
|
VecStore32LE(output + 2*16, r0_2);
|
||||||
VectorStore32LE(output + 3*16, r0_3);
|
VecStore32LE(output + 3*16, r0_3);
|
||||||
|
|
||||||
if (input)
|
if (input)
|
||||||
{
|
{
|
||||||
r1_0 = VectorXor(VectorLoad32LE(input + 4*16), r1_0);
|
r1_0 = VecXor(VecLoad32LE(input + 4*16), r1_0);
|
||||||
r1_1 = VectorXor(VectorLoad32LE(input + 5*16), r1_1);
|
r1_1 = VecXor(VecLoad32LE(input + 5*16), r1_1);
|
||||||
r1_2 = VectorXor(VectorLoad32LE(input + 6*16), r1_2);
|
r1_2 = VecXor(VecLoad32LE(input + 6*16), r1_2);
|
||||||
r1_3 = VectorXor(VectorLoad32LE(input + 7*16), r1_3);
|
r1_3 = VecXor(VecLoad32LE(input + 7*16), r1_3);
|
||||||
}
|
}
|
||||||
|
|
||||||
VectorStore32LE(output + 4*16, r1_0);
|
VecStore32LE(output + 4*16, r1_0);
|
||||||
VectorStore32LE(output + 5*16, r1_1);
|
VecStore32LE(output + 5*16, r1_1);
|
||||||
VectorStore32LE(output + 6*16, r1_2);
|
VecStore32LE(output + 6*16, r1_2);
|
||||||
VectorStore32LE(output + 7*16, r1_3);
|
VecStore32LE(output + 7*16, r1_3);
|
||||||
|
|
||||||
if (input)
|
if (input)
|
||||||
{
|
{
|
||||||
r2_0 = VectorXor(VectorLoad32LE(input + 8*16), r2_0);
|
r2_0 = VecXor(VecLoad32LE(input + 8*16), r2_0);
|
||||||
r2_1 = VectorXor(VectorLoad32LE(input + 9*16), r2_1);
|
r2_1 = VecXor(VecLoad32LE(input + 9*16), r2_1);
|
||||||
r2_2 = VectorXor(VectorLoad32LE(input + 10*16), r2_2);
|
r2_2 = VecXor(VecLoad32LE(input + 10*16), r2_2);
|
||||||
r2_3 = VectorXor(VectorLoad32LE(input + 11*16), r2_3);
|
r2_3 = VecXor(VecLoad32LE(input + 11*16), r2_3);
|
||||||
}
|
}
|
||||||
|
|
||||||
VectorStore32LE(output + 8*16, r2_0);
|
VecStore32LE(output + 8*16, r2_0);
|
||||||
VectorStore32LE(output + 9*16, r2_1);
|
VecStore32LE(output + 9*16, r2_1);
|
||||||
VectorStore32LE(output + 10*16, r2_2);
|
VecStore32LE(output + 10*16, r2_2);
|
||||||
VectorStore32LE(output + 11*16, r2_3);
|
VecStore32LE(output + 11*16, r2_3);
|
||||||
|
|
||||||
if (input)
|
if (input)
|
||||||
{
|
{
|
||||||
r3_0 = VectorXor(VectorLoad32LE(input + 12*16), r3_0);
|
r3_0 = VecXor(VecLoad32LE(input + 12*16), r3_0);
|
||||||
r3_1 = VectorXor(VectorLoad32LE(input + 13*16), r3_1);
|
r3_1 = VecXor(VecLoad32LE(input + 13*16), r3_1);
|
||||||
r3_2 = VectorXor(VectorLoad32LE(input + 14*16), r3_2);
|
r3_2 = VecXor(VecLoad32LE(input + 14*16), r3_2);
|
||||||
r3_3 = VectorXor(VectorLoad32LE(input + 15*16), r3_3);
|
r3_3 = VecXor(VecLoad32LE(input + 15*16), r3_3);
|
||||||
}
|
}
|
||||||
|
|
||||||
VectorStore32LE(output + 12*16, r3_0);
|
VecStore32LE(output + 12*16, r3_0);
|
||||||
VectorStore32LE(output + 13*16, r3_1);
|
VecStore32LE(output + 13*16, r3_1);
|
||||||
VectorStore32LE(output + 14*16, r3_2);
|
VecStore32LE(output + 14*16, r3_2);
|
||||||
VectorStore32LE(output + 15*16, r3_3);
|
VecStore32LE(output + 15*16, r3_3);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
||||||
gcm_simd.cpp (136 changed lines)
@@ -171,16 +171,16 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
 #if CRYPTOPP_POWER8_VMULL_AVAILABLE
 using CryptoPP::uint32x4_p;
 using CryptoPP::uint64x2_p;
-using CryptoPP::VectorGetLow;
+using CryptoPP::VecGetLow;
-using CryptoPP::VectorGetHigh;
+using CryptoPP::VecGetHigh;
-using CryptoPP::VectorRotateLeftOctet;
+using CryptoPP::VecRotateLeftOctet;

 // POWER8 GCM mode is confusing. The algorithm is reflected so
 // nearly everything we do is reversed for a little-endian system,
 // including on big-endian machines. VMULL2LE swaps dwords for a
 // little endian machine; VMULL_00LE, VMULL_01LE, VMULL_10LE and
 // VMULL_11LE are backwards and (1) read low words with
-// VectorGetHigh, (2) read high words with VectorGetLow, and
+// VecGetHigh, (2) read high words with VecGetLow, and
 // (3) yield a product that is endian swapped. The steps ensure
 // GCM parameters are presented in the correct order for the
 // algorithm on both big and little-endian systems, but it is
@@ -192,7 +192,7 @@ using CryptoPP::VectorRotateLeftOctet;
 inline uint64x2_p VMULL2LE(const uint64x2_p& val)
 {
 #if (CRYPTOPP_BIG_ENDIAN)
-return VectorRotateLeftOctet<8>(val);
+return VecRotateLeftOctet<8>(val);
 #else
 return val;
 #endif
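A small illustration of what VMULL2LE does, a sketch rather than library code: rotating a 16-byte vector left by eight octets swaps its two 64-bit halves, so on big-endian targets the product comes back in the doubleword order the little-endian-oriented code below expects.

// Illustration only: on big-endian builds VMULL2LE({x, y}) returns {y, x};
// on little-endian builds it returns the value unchanged.
const uint64x2_p v = {0x1111111111111111ull, 0x2222222222222222ull};
const uint64x2_p w = VMULL2LE(v);   // halves swapped on big-endian only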
@@ -202,48 +202,48 @@ inline uint64x2_p VMULL2LE(const uint64x2_p& val)
 inline uint64x2_p VMULL_00LE(const uint64x2_p& a, const uint64x2_p& b)
 {
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
-return VMULL2LE(__vpmsumd (VectorGetHigh(a), VectorGetHigh(b)));
+return VMULL2LE(__vpmsumd (VecGetHigh(a), VecGetHigh(b)));
 #else
-return VMULL2LE(__builtin_crypto_vpmsumd (VectorGetHigh(a), VectorGetHigh(b)));
+return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
 #endif
 }

 // _mm_clmulepi64_si128(a, b, 0x01)
 inline uint64x2_p VMULL_01LE(const uint64x2_p& a, const uint64x2_p& b)
 {
-// Small speedup. VectorGetHigh(b) ensures the high dword of 'b' is 0.
+// Small speedup. VecGetHigh(b) ensures the high dword of 'b' is 0.
 // The 0 used in the vmull yields 0 for the high product, so the high
 // dword of 'a' is "don't care".
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
-return VMULL2LE(__vpmsumd (a, VectorGetHigh(b)));
+return VMULL2LE(__vpmsumd (a, VecGetHigh(b)));
 #else
-return VMULL2LE(__builtin_crypto_vpmsumd (a, VectorGetHigh(b)));
+return VMULL2LE(__builtin_crypto_vpmsumd (a, VecGetHigh(b)));
 #endif
 }

 // _mm_clmulepi64_si128(a, b, 0x10)
 inline uint64x2_p VMULL_10LE(const uint64x2_p& a, const uint64x2_p& b)
 {
-// Small speedup. VectorGetHigh(a) ensures the high dword of 'a' is 0.
+// Small speedup. VecGetHigh(a) ensures the high dword of 'a' is 0.
 // The 0 used in the vmull yields 0 for the high product, so the high
 // dword of 'b' is "don't care".
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
-return VMULL2LE(__vpmsumd (VectorGetHigh(a), b));
+return VMULL2LE(__vpmsumd (VecGetHigh(a), b));
 #else
-return VMULL2LE(__builtin_crypto_vpmsumd (VectorGetHigh(a), b));
+return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), b));
 #endif
 }

 // _mm_clmulepi64_si128(a, b, 0x11)
 inline uint64x2_p VMULL_11LE(const uint64x2_p& a, const uint64x2_p& b)
 {
-// Small speedup. VectorGetLow(a) ensures the high dword of 'a' is 0.
+// Small speedup. VecGetLow(a) ensures the high dword of 'a' is 0.
 // The 0 used in the vmull yields 0 for the high product, so the high
 // dword of 'b' is "don't care".
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
-return VMULL2LE(__vpmsumd (VectorGetLow(a), b));
+return VMULL2LE(__vpmsumd (VecGetLow(a), b));
 #else
-return VMULL2LE(__builtin_crypto_vpmsumd (VectorGetLow(a), b));
+return VMULL2LE(__builtin_crypto_vpmsumd (VecGetLow(a), b));
 #endif
 }
 #endif // CRYPTOPP_POWER8_VMULL_AVAILABLE

@@ -373,7 +373,7 @@ bool CPU_ProbePMULL()
 const uint64x2_p r3 = VMULL_10LE((uint64x2_p)(a), (uint64x2_p)(b));
 const uint64x2_p r4 = VMULL_11LE((uint64x2_p)(a), (uint64x2_p)(b));

-result = VectorNotEqual(r1, r2) && VectorNotEqual(r3, r4);
+result = VecNotEqual(r1, r2) && VecNotEqual(r3, r4);
 }

 sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
@@ -743,7 +743,7 @@ void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer)
 #if CRYPTOPP_ALTIVEC_AVAILABLE
 void GCM_Xor16_ALTIVEC(byte *a, const byte *b, const byte *c)
 {
-VectorStore(VectorXor(VectorLoad(b), VectorLoad(c)), a);
+VecStore(VecXor(VecLoad(b), VecLoad(c)), a);
 }
 #endif // CRYPTOPP_ALTIVEC_AVAILABLE

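A quick usage sketch of the helper above, illustrative only and assuming <cstring> is available as elsewhere in this file: GCM_Xor16_ALTIVEC writes b XOR c into a, sixteen bytes at a time.

// Illustration only: XOR two 16-byte buffers into a third one.
byte x[16], y[16], z[16];
std::memset(y, 0xAA, 16);
std::memset(z, 0x55, 16);
GCM_Xor16_ALTIVEC(x, y, z);   // every byte of x is now 0xFF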
@@ -753,22 +753,22 @@ uint64x2_p GCM_Reduce_VMULL(uint64x2_p c0, uint64x2_p c1, uint64x2_p c2, uint64x
 {
 const uint64x2_p m1 = {1,1}, m63 = {63,63};

-c1 = VectorXor(c1, VectorShiftRightOctet<8>(c0));
+c1 = VecXor(c1, VecShiftRightOctet<8>(c0));
-c1 = VectorXor(c1, VMULL_10LE(c0, r));
+c1 = VecXor(c1, VMULL_10LE(c0, r));
-c0 = VectorXor(c1, VectorShiftLeftOctet<8>(c0));
+c0 = VecXor(c1, VecShiftLeftOctet<8>(c0));
 c0 = VMULL_00LE(vec_sl(c0, m1), r);
-c2 = VectorXor(c2, c0);
+c2 = VecXor(c2, c0);
-c2 = VectorXor(c2, VectorShiftLeftOctet<8>(c1));
+c2 = VecXor(c2, VecShiftLeftOctet<8>(c1));
 c1 = vec_sr(vec_mergeh(c1, c2), m63);
 c2 = vec_sl(c2, m1);

-return VectorXor(c2, c1);
+return VecXor(c2, c1);
 }

 inline uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
 {
 const uint64x2_p c0 = VMULL_00LE(x, h);
-const uint64x2_p c1 = VectorXor(VMULL_01LE(x, h), VMULL_10LE(x, h));
+const uint64x2_p c1 = VecXor(VMULL_01LE(x, h), VMULL_10LE(x, h));
 const uint64x2_p c2 = VMULL_11LE(x, h);

 return GCM_Reduce_VMULL(c0, c1, c2, r);
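For context, a hedged sketch rather than code from the diff: GCM_Multiply_VMULL is the per-block GHASH step. A caller folds each 16-byte block into the running hash roughly as below, with x the accumulator, h the hash key and r the reduction constant used above. The helper name is hypothetical.

// Illustration only: fold one block into the GHASH accumulator.
inline uint64x2_p GHASH_Step_Sketch(uint64x2_p x, uint64x2_p block,
                                    uint64x2_p h, uint64x2_p r)
{
    // Absorb the block, then multiply by the hash key in GF(2^128).
    return GCM_Multiply_VMULL(VecXor(x, block), h, r);
}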
@@ -777,13 +777,13 @@ inline uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
 inline uint64x2_p LoadHashKey(const byte *hashKey)
 {
 #if (CRYPTOPP_BIG_ENDIAN)
-const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey);
+const uint64x2_p key = (uint64x2_p)VecLoad(hashKey);
 const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
-return vec_perm(key, key, mask);
+return VecPermute(key, key, mask);
 #else
-const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey);
+const uint64x2_p key = (uint64x2_p)VecLoad(hashKey);
 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
-return vec_perm(key, key, mask);
+return VecPermute(key, key, mask);
 #endif
 }

@ -798,21 +798,21 @@ void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned
|
|||||||
for (i=0; i<tableSize-32; i+=32)
|
for (i=0; i<tableSize-32; i+=32)
|
||||||
{
|
{
|
||||||
const uint64x2_p h1 = GCM_Multiply_VMULL(h, h0, r);
|
const uint64x2_p h1 = GCM_Multiply_VMULL(h, h0, r);
|
||||||
VectorStore(h, (byte*)temp);
|
VecStore(h, (byte*)temp);
|
||||||
std::memcpy(mulTable+i, temp+0, 8);
|
std::memcpy(mulTable+i, temp+0, 8);
|
||||||
VectorStore(h1, mulTable+i+16);
|
VecStore(h1, mulTable+i+16);
|
||||||
VectorStore(h, mulTable+i+8);
|
VecStore(h, mulTable+i+8);
|
||||||
VectorStore(h1, (byte*)temp);
|
VecStore(h1, (byte*)temp);
|
||||||
std::memcpy(mulTable+i+8, temp+0, 8);
|
std::memcpy(mulTable+i+8, temp+0, 8);
|
||||||
h = GCM_Multiply_VMULL(h1, h0, r);
|
h = GCM_Multiply_VMULL(h1, h0, r);
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint64x2_p h1 = GCM_Multiply_VMULL(h, h0, r);
|
const uint64x2_p h1 = GCM_Multiply_VMULL(h, h0, r);
|
||||||
VectorStore(h, (byte*)temp);
|
VecStore(h, (byte*)temp);
|
||||||
std::memcpy(mulTable+i, temp+0, 8);
|
std::memcpy(mulTable+i, temp+0, 8);
|
||||||
VectorStore(h1, mulTable+i+16);
|
VecStore(h1, mulTable+i+16);
|
||||||
VectorStore(h, mulTable+i+8);
|
VecStore(h, mulTable+i+8);
|
||||||
VectorStore(h1, (byte*)temp);
|
VecStore(h1, (byte*)temp);
|
||||||
std::memcpy(mulTable+i+8, temp+0, 8);
|
std::memcpy(mulTable+i+8, temp+0, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -820,33 +820,33 @@ void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned
 template <class T>
 inline T SwapWords(const T& data)
 {
-return (T)VectorRotateLeftOctet<8>(data);
+return (T)VecRotateLeftOctet<8>(data);
 }

 inline uint64x2_p LoadBuffer1(const byte *dataBuffer)
 {
 #if (CRYPTOPP_BIG_ENDIAN)
-return (uint64x2_p)VectorLoad(dataBuffer);
+return (uint64x2_p)VecLoad(dataBuffer);
 #else
-const uint64x2_p data = (uint64x2_p)VectorLoad(dataBuffer);
+const uint64x2_p data = (uint64x2_p)VecLoad(dataBuffer);
 const uint8x16_p mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
-return vec_perm(data, data, mask);
+return VecPermute(data, data, mask);
 #endif
 }

 inline uint64x2_p LoadBuffer2(const byte *dataBuffer)
 {
 #if (CRYPTOPP_BIG_ENDIAN)
-return (uint64x2_p)SwapWords(VectorLoadBE(dataBuffer));
+return (uint64x2_p)SwapWords(VecLoadBE(dataBuffer));
 #else
-return (uint64x2_p)VectorLoadBE(dataBuffer);
+return (uint64x2_p)VecLoadBE(dataBuffer);
 #endif
 }

size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
|
size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
|
||||||
{
|
{
|
||||||
const uint64x2_p r = {0xe100000000000000ull, 0xc200000000000000ull};
|
const uint64x2_p r = {0xe100000000000000ull, 0xc200000000000000ull};
|
||||||
uint64x2_p x = (uint64x2_p)VectorLoad(hbuffer);
|
uint64x2_p x = (uint64x2_p)VecLoad(hbuffer);
|
||||||
|
|
||||||
while (len >= 16)
|
while (len >= 16)
|
||||||
{
|
{
|
||||||
@ -856,59 +856,59 @@ size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mt
|
|||||||
|
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
const uint64x2_p h0 = (uint64x2_p)VectorLoad(mtable+(i+0)*16);
|
const uint64x2_p h0 = (uint64x2_p)VecLoad(mtable+(i+0)*16);
|
||||||
const uint64x2_p h1 = (uint64x2_p)VectorLoad(mtable+(i+1)*16);
|
const uint64x2_p h1 = (uint64x2_p)VecLoad(mtable+(i+1)*16);
|
||||||
const uint64x2_p h2 = (uint64x2_p)VectorXor(h0, h1);
|
const uint64x2_p h2 = (uint64x2_p)VecXor(h0, h1);
|
||||||
|
|
||||||
if (++i == s)
|
if (++i == s)
|
||||||
{
|
{
|
||||||
d1 = LoadBuffer2(data);
|
d1 = LoadBuffer2(data);
|
||||||
d1 = VectorXor(d1, x);
|
d1 = VecXor(d1, x);
|
||||||
c0 = VectorXor(c0, VMULL_00LE(d1, h0));
|
c0 = VecXor(c0, VMULL_00LE(d1, h0));
|
||||||
c2 = VectorXor(c2, VMULL_01LE(d1, h1));
|
c2 = VecXor(c2, VMULL_01LE(d1, h1));
|
||||||
d1 = VectorXor(d1, SwapWords(d1));
|
d1 = VecXor(d1, SwapWords(d1));
|
||||||
c1 = VectorXor(c1, VMULL_00LE(d1, h2));
|
c1 = VecXor(c1, VMULL_00LE(d1, h2));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
d1 = LoadBuffer1(data+(s-i)*16-8);
|
d1 = LoadBuffer1(data+(s-i)*16-8);
|
||||||
c0 = VectorXor(c0, VMULL_01LE(d2, h0));
|
c0 = VecXor(c0, VMULL_01LE(d2, h0));
|
||||||
c2 = VectorXor(c2, VMULL_01LE(d1, h1));
|
c2 = VecXor(c2, VMULL_01LE(d1, h1));
|
||||||
d2 = VectorXor(d2, d1);
|
d2 = VecXor(d2, d1);
|
||||||
c1 = VectorXor(c1, VMULL_01LE(d2, h2));
|
c1 = VecXor(c1, VMULL_01LE(d2, h2));
|
||||||
|
|
||||||
if (++i == s)
|
if (++i == s)
|
||||||
{
|
{
|
||||||
d1 = LoadBuffer2(data);
|
d1 = LoadBuffer2(data);
|
||||||
d1 = VectorXor(d1, x);
|
d1 = VecXor(d1, x);
|
||||||
c0 = VectorXor(c0, VMULL_10LE(d1, h0));
|
c0 = VecXor(c0, VMULL_10LE(d1, h0));
|
||||||
c2 = VectorXor(c2, VMULL_11LE(d1, h1));
|
c2 = VecXor(c2, VMULL_11LE(d1, h1));
|
||||||
d1 = VectorXor(d1, SwapWords(d1));
|
d1 = VecXor(d1, SwapWords(d1));
|
||||||
c1 = VectorXor(c1, VMULL_10LE(d1, h2));
|
c1 = VecXor(c1, VMULL_10LE(d1, h2));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
d2 = LoadBuffer2(data+(s-i)*16-8);
|
d2 = LoadBuffer2(data+(s-i)*16-8);
|
||||||
c0 = VectorXor(c0, VMULL_10LE(d1, h0));
|
c0 = VecXor(c0, VMULL_10LE(d1, h0));
|
||||||
c2 = VectorXor(c2, VMULL_10LE(d2, h1));
|
c2 = VecXor(c2, VMULL_10LE(d2, h1));
|
||||||
d1 = VectorXor(d1, d2);
|
d1 = VecXor(d1, d2);
|
||||||
c1 = VectorXor(c1, VMULL_10LE(d1, h2));
|
c1 = VecXor(c1, VMULL_10LE(d1, h2));
|
||||||
}
|
}
|
||||||
data += s*16;
|
data += s*16;
|
||||||
len -= s*16;
|
len -= s*16;
|
||||||
|
|
||||||
c1 = VectorXor(VectorXor(c1, c0), c2);
|
c1 = VecXor(VecXor(c1, c0), c2);
|
||||||
x = GCM_Reduce_VMULL(c0, c1, c2, r);
|
x = GCM_Reduce_VMULL(c0, c1, c2, r);
|
||||||
}
|
}
|
||||||
|
|
||||||
VectorStore(x, hbuffer);
|
VecStore(x, hbuffer);
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer)
|
void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer)
|
||||||
{
|
{
|
||||||
const uint64x2_p mask = {0x08090a0b0c0d0e0full, 0x0001020304050607ull};
|
const uint64x2_p mask = {0x08090a0b0c0d0e0full, 0x0001020304050607ull};
|
||||||
VectorStore(VectorPermute(VectorLoad(hashBuffer), mask), hashBuffer);
|
VecStore(VecPermute(VecLoad(hashBuffer), mask), hashBuffer);
|
||||||
}
|
}
|
||||||
#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
|
#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||||
|
|
||||||
lea_simd.cpp (18 changed lines)
@@ -439,17 +439,17 @@ using CryptoPP::uint64x2_p;

 inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b)
 {
-return vec_xor(a, b);
+return VecXor(a, b);
 }

 inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b)
 {
-return vec_add(a, b);
+return VecAdd(a, b);
 }

 inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b)
 {
-return vec_sub(a, b);
+return VecSub(a, b);
 }

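A tiny usage sketch, illustrative only: the Xor/Add/Sub wrappers give the Altivec path the same spelling as the SSE and NEON paths in this file, so the shared LEA round code can call them without caring which intrinsic sits underneath.

// Illustration only: Add and Sub are exact inverses lane by lane,
// which is what an encrypt/decrypt round pair relies on.
const uint32x4_p a = {10, 20, 30, 40};
const uint32x4_p k = { 1,  2,  3,  4};
const uint32x4_p e = Add(Xor(a, k), k);   // a "round-ish" mix, not the real LEA round
const uint32x4_p d = Xor(Sub(e, k), k);   // d equals a again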
template <unsigned int R>
|
template <unsigned int R>
|
||||||
@ -479,7 +479,7 @@ inline uint32x4_p UnpackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uin
|
|||||||
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
|
CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
|
||||||
CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
|
CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
|
||||||
CRYPTOPP_ASSERT(0);
|
CRYPTOPP_ASSERT(0);
|
||||||
return vec_xor(a, a);
|
return VecXor(a, a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -519,7 +519,7 @@ inline uint32x4_p UnpackSIMD(const uint32x4_p& v)
|
|||||||
{
|
{
|
||||||
// Should not be instantiated
|
// Should not be instantiated
|
||||||
CRYPTOPP_ASSERT(0);
|
CRYPTOPP_ASSERT(0);
|
||||||
return vec_xor(v, v);
|
return VecXor(v, v);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -527,7 +527,7 @@ inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v)
|
|||||||
{
|
{
|
||||||
// Splat to all lanes
|
// Splat to all lanes
|
||||||
const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0};
|
const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0};
|
||||||
return (uint32x4_p)vec_perm(v, v, m);
|
return (uint32x4_p)VecPermute(v, v, m);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -535,7 +535,7 @@ inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v)
|
|||||||
{
|
{
|
||||||
// Splat to all lanes
|
// Splat to all lanes
|
||||||
const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4};
|
const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4};
|
||||||
return (uint32x4_p)vec_perm(v, v, m);
|
return (uint32x4_p)VecPermute(v, v, m);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -543,7 +543,7 @@ inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v)
|
|||||||
{
|
{
|
||||||
// Splat to all lanes
|
// Splat to all lanes
|
||||||
const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8};
|
const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8};
|
||||||
return (uint32x4_p)vec_perm(v, v, m);
|
return (uint32x4_p)VecPermute(v, v, m);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -551,7 +551,7 @@ inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v)
|
|||||||
{
|
{
|
||||||
// Splat to all lanes
|
// Splat to all lanes
|
||||||
const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12};
|
const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12};
|
||||||
return (uint32x4_p)vec_perm(v, v, m);
|
return (uint32x4_p)VecPermute(v, v, m);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <unsigned int IDX>
|
template <unsigned int IDX>
|
||||||
|
@@ -73,7 +73,7 @@ bool CPU_ProbeAltivec()
 // Specifically call the Altivec loads and stores
 const uint8x16_p v1 = (uint8x16_p)vec_ld(0, (byte*)b1);
 const uint8x16_p v2 = (uint8x16_p)vec_ld(0, (byte*)b2);
-const uint8x16_p v3 = (uint8x16_p)vec_xor(v1, v2);
+const uint8x16_p v3 = (uint8x16_p)VecXor(v1, v2);
 vec_st(v3, 0, b3);

 result = (0 == std::memcmp(b2, b3, 16));
ppc_simd.h (314 changed lines)
@@ -29,7 +29,7 @@
 # undef bool
 #endif

-// VectorLoad_ALTIVEC and VectorStore_ALTIVEC are
+// VecLoad_ALTIVEC and VecStore_ALTIVEC are
 // too noisy on modern compilers
 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
 # pragma GCC diagnostic push
@@ -49,14 +49,14 @@ typedef __vector unsigned int uint32x4_p;
 typedef __vector unsigned long long uint64x2_p;
 #endif // _ARCH_PWR8

-/// \brief Reverse a vector
+/// \brief Reverse bytes in a vector
 /// \tparam T vector type
 /// \param src the vector
 /// \returns vector
-/// \details Reverse() endian swaps the bytes in a vector
+/// \details VecReverse() reverses the bytes in a vector
 /// \since Crypto++ 6.0
 template <class T>
-inline T Reverse(const T src)
+inline T VecReverse(const T src)
 {
 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
 return (T)vec_perm(src, src, mask);
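A short usage sketch, an assumption-laden illustration rather than header code: on a little-endian machine VecReverse() turns a natively loaded vector into its big-endian view, which is the combination VecLoadBE() falls back to further down in this header.

// Illustration only: byte-reverse a natively loaded vector.
const byte data[16] = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
const uint32x4_p native   = VecLoad(data);       // native-endian view
const uint32x4_p reversed = VecReverse(native);  // same bytes in the opposite order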
@ -67,16 +67,16 @@ inline T Reverse(const T src)
|
|||||||
/// \brief Loads a vector from a byte array
|
/// \brief Loads a vector from a byte array
|
||||||
/// \param src the byte array
|
/// \param src the byte array
|
||||||
/// \details Loads a vector in native endian format from a byte array.
|
/// \details Loads a vector in native endian format from a byte array.
|
||||||
/// \details VectorLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
|
/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
|
||||||
/// of <tt>dest</tt> is aligned, and uses <tt>vec_lvsl</tt> and <tt>vec_perm</tt>
|
/// of <tt>dest</tt> is aligned, and uses <tt>vec_lvsl</tt> and <tt>vec_perm</tt>
|
||||||
/// otherwise.
|
/// otherwise.
|
||||||
/// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so you should
|
/// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so you should
|
||||||
/// provide aligned memory adresses.
|
/// provide aligned memory adresses.
|
||||||
/// \details VectorLoad_ALTIVEC() is used automatically when POWER7 or above
|
/// \details VecLoad_ALTIVEC() is used automatically when POWER7 or above
|
||||||
/// and unaligned loads is not available.
|
/// and unaligned loads is not available.
|
||||||
/// \note VectorLoad does not require an aligned array.
|
/// \note VecLoad does not require an aligned array.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
inline uint32x4_p VectorLoad_ALTIVEC(const byte src[16])
|
inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
|
||||||
{
|
{
|
||||||
if (IsAlignedOn(src, 16))
|
if (IsAlignedOn(src, 16))
|
||||||
{
|
{
|
||||||
@ -96,14 +96,14 @@ inline uint32x4_p VectorLoad_ALTIVEC(const byte src[16])
|
|||||||
/// \param src the byte array
|
/// \param src the byte array
|
||||||
/// \param off offset into the src byte array
|
/// \param off offset into the src byte array
|
||||||
/// \details Loads a vector in native endian format from a byte array.
|
/// \details Loads a vector in native endian format from a byte array.
|
||||||
/// \details VectorLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
|
/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
|
||||||
/// of <tt>dest</tt> is aligned, and uses <tt>vec_lvsl</tt> and <tt>vec_perm</tt>
|
/// of <tt>dest</tt> is aligned, and uses <tt>vec_lvsl</tt> and <tt>vec_perm</tt>
|
||||||
/// otherwise.
|
/// otherwise.
|
||||||
/// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so you should
|
/// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so you should
|
||||||
/// provide aligned memory adresses.
|
/// provide aligned memory adresses.
|
||||||
/// \note VectorLoad does not require an aligned array.
|
/// \note VecLoad does not require an aligned array.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
inline uint32x4_p VectorLoad_ALTIVEC(int off, const byte src[16])
|
inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
|
||||||
{
|
{
|
||||||
if (IsAlignedOn(src, 16))
|
if (IsAlignedOn(src, 16))
|
||||||
{
|
{
|
||||||
@ -122,14 +122,14 @@ inline uint32x4_p VectorLoad_ALTIVEC(int off, const byte src[16])
|
|||||||
/// \brief Loads a vector from a byte array
|
/// \brief Loads a vector from a byte array
|
||||||
/// \param src the byte array
|
/// \param src the byte array
|
||||||
/// \details Loads a vector in native endian format from a byte array.
|
/// \details Loads a vector in native endian format from a byte array.
|
||||||
/// \details VectorLoad uses POWER7's <tt>vec_xl</tt> or
|
/// \details VecLoad uses POWER7's <tt>vec_xl</tt> or
|
||||||
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorLoad_ALTIVEC() is used if POWER7 or above
|
/// \details VecLoad_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorLoad_ALTIVEC() is relatively expensive.
|
/// is not available. VecLoad_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorLoad does not require an aligned array.
|
/// \note VecLoad does not require an aligned array.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
inline uint32x4_p VectorLoad(const byte src[16])
|
inline uint32x4_p VecLoad(const byte src[16])
|
||||||
{
|
{
|
||||||
#if defined(_ARCH_PWR7)
|
#if defined(_ARCH_PWR7)
|
||||||
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
@ -138,7 +138,7 @@ inline uint32x4_p VectorLoad(const byte src[16])
|
|||||||
return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
|
return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
|
||||||
# endif
|
# endif
|
||||||
#else
|
#else
|
||||||
return VectorLoad_ALTIVEC(src);
|
return VecLoad_ALTIVEC(src);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -146,14 +146,14 @@ inline uint32x4_p VectorLoad(const byte src[16])
|
|||||||
/// \param src the byte array
|
/// \param src the byte array
|
||||||
/// \param off offset into the byte array
|
/// \param off offset into the byte array
|
||||||
/// \details Loads a vector in native endian format from a byte array.
|
/// \details Loads a vector in native endian format from a byte array.
|
||||||
/// \details VectorLoad uses POWER7's <tt>vec_xl</tt> or
|
/// \details VecLoad uses POWER7's <tt>vec_xl</tt> or
|
||||||
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorLoad_ALTIVEC() is used if POWER7 or above
|
/// \details VecLoad_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorLoad_ALTIVEC() is relatively expensive.
|
/// is not available. VecLoad_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorLoad does not require an aligned array.
|
/// \note VecLoad does not require an aligned array.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
inline uint32x4_p VectorLoad(int off, const byte src[16])
|
inline uint32x4_p VecLoad(int off, const byte src[16])
|
||||||
{
|
{
|
||||||
#if defined(_ARCH_PWR7)
|
#if defined(_ARCH_PWR7)
|
||||||
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
@ -162,48 +162,48 @@ inline uint32x4_p VectorLoad(int off, const byte src[16])
|
|||||||
return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
|
return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
|
||||||
# endif
|
# endif
|
||||||
#else
|
#else
|
||||||
return VectorLoad_ALTIVEC(off, src);
|
return VecLoad_ALTIVEC(off, src);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/// \brief Loads a vector from a byte array
|
/// \brief Loads a vector from a byte array
|
||||||
/// \param src the byte array
|
/// \param src the byte array
|
||||||
/// \details Loads a vector in native endian format from a byte array.
|
/// \details Loads a vector in native endian format from a byte array.
|
||||||
/// \details VectorLoad uses POWER7's <tt>vec_xl</tt> or
|
/// \details VecLoad uses POWER7's <tt>vec_xl</tt> or
|
||||||
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorLoad_ALTIVEC() is used if POWER7 or above
|
/// \details VecLoad_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorLoad_ALTIVEC() is relatively expensive.
|
/// is not available. VecLoad_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorLoad does not require an aligned array.
|
/// \note VecLoad does not require an aligned array.
|
||||||
/// \since Crypto++ 8.0
|
/// \since Crypto++ 8.0
|
||||||
inline uint32x4_p VectorLoad(const word32 src[4])
|
inline uint32x4_p VecLoad(const word32 src[4])
|
||||||
{
|
{
|
||||||
return VectorLoad((const byte*)src);
|
return VecLoad((const byte*)src);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// \brief Loads a vector from a byte array
|
/// \brief Loads a vector from a byte array
|
||||||
/// \param src the byte array
|
/// \param src the byte array
|
||||||
/// \param off offset into the byte array
|
/// \param off offset into the byte array
|
||||||
/// \details Loads a vector in native endian format from a byte array.
|
/// \details Loads a vector in native endian format from a byte array.
|
||||||
/// \note VectorLoad does not require an aligned array.
|
/// \note VecLoad does not require an aligned array.
|
||||||
/// \since Crypto++ 8.0
|
/// \since Crypto++ 8.0
|
||||||
inline uint32x4_p VectorLoad(int off, const word32 src[4])
|
inline uint32x4_p VecLoad(int off, const word32 src[4])
|
||||||
{
|
{
|
||||||
return VectorLoad(off, (const byte*)src);
|
return VecLoad(off, (const byte*)src);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// \brief Loads a vector from a byte array
|
/// \brief Loads a vector from a byte array
|
||||||
/// \param src the byte array
|
/// \param src the byte array
|
||||||
/// \details Loads a vector in big endian format from a byte array.
|
/// \details Loads a vector in big endian format from a byte array.
|
||||||
/// VectorLoadBE will swap all bytes on little endian systems.
|
/// VecLoadBE will swap all bytes on little endian systems.
|
||||||
/// \details VectorLoadBE uses POWER7's <tt>vec_xl</tt> or
|
/// \details VecLoadBE uses POWER7's <tt>vec_xl</tt> or
|
||||||
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorLoad_ALTIVEC() is used if POWER7 or above
|
/// \details VecLoad_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorLoad_ALTIVEC() is relatively expensive.
|
/// is not available. VecLoad_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorLoadBE() does not require an aligned array.
|
/// \note VecLoadBE() does not require an aligned array.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
inline uint32x4_p VectorLoadBE(const byte src[16])
|
inline uint32x4_p VecLoadBE(const byte src[16])
|
||||||
{
|
{
|
||||||
#if defined(_ARCH_PWR7)
|
#if defined(_ARCH_PWR7)
|
||||||
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
@ -212,14 +212,14 @@ inline uint32x4_p VectorLoadBE(const byte src[16])
|
|||||||
# if (CRYPTOPP_BIG_ENDIAN)
|
# if (CRYPTOPP_BIG_ENDIAN)
|
||||||
return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
|
return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
|
||||||
# else
|
# else
|
||||||
return (uint32x4_p)Reverse(vec_vsx_ld(0, (byte*)src));
|
return (uint32x4_p)VecReverse(vec_vsx_ld(0, (byte*)src));
|
||||||
# endif
|
# endif
|
||||||
# endif
|
# endif
|
||||||
#else // _ARCH_PWR7
|
#else // _ARCH_PWR7
|
||||||
# if (CRYPTOPP_BIG_ENDIAN)
|
# if (CRYPTOPP_BIG_ENDIAN)
|
||||||
return (uint32x4_p)VectorLoad((const byte*)src);
|
return (uint32x4_p)VecLoad((const byte*)src);
|
||||||
# else
|
# else
|
||||||
return (uint32x4_p)Reverse(VectorLoad((const byte*)src));
|
return (uint32x4_p)VecReverse(VecLoad((const byte*)src));
|
||||||
# endif
|
# endif
|
||||||
#endif // _ARCH_PWR7
|
#endif // _ARCH_PWR7
|
||||||
}
|
}
|
||||||
@ -228,15 +228,15 @@ inline uint32x4_p VectorLoadBE(const byte src[16])
|
|||||||
/// \param src the byte array
|
/// \param src the byte array
|
||||||
/// \param off offset into the src byte array
|
/// \param off offset into the src byte array
|
||||||
/// \details Loads a vector in big endian format from a byte array.
|
/// \details Loads a vector in big endian format from a byte array.
|
||||||
/// VectorLoadBE will swap all bytes on little endian systems.
|
/// VecLoadBE will swap all bytes on little endian systems.
|
||||||
/// \details VectorLoadBE uses POWER7's <tt>vec_xl</tt> or
|
/// \details VecLoadBE uses POWER7's <tt>vec_xl</tt> or
|
||||||
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorLoad_ALTIVEC() is used if POWER7 or above
|
/// \details VecLoad_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorLoad_ALTIVEC() is relatively expensive.
|
/// is not available. VecLoad_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorLoadBE does not require an aligned array.
|
/// \note VecLoadBE does not require an aligned array.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
inline uint32x4_p VectorLoadBE(int off, const byte src[16])
|
inline uint32x4_p VecLoadBE(int off, const byte src[16])
|
||||||
{
|
{
|
||||||
#if defined(_ARCH_PWR7)
|
#if defined(_ARCH_PWR7)
|
||||||
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
@ -245,14 +245,14 @@ inline uint32x4_p VectorLoadBE(int off, const byte src[16])
|
|||||||
# if (CRYPTOPP_BIG_ENDIAN)
|
# if (CRYPTOPP_BIG_ENDIAN)
|
||||||
return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
|
return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
|
||||||
# else
|
# else
|
||||||
return (uint32x4_p)Reverse(vec_vsx_ld(off, (byte*)src));
|
return (uint32x4_p)VecReverse(vec_vsx_ld(off, (byte*)src));
|
||||||
# endif
|
# endif
|
||||||
# endif
|
# endif
|
||||||
#else // _ARCH_PWR7
|
#else // _ARCH_PWR7
|
||||||
# if (CRYPTOPP_BIG_ENDIAN)
|
# if (CRYPTOPP_BIG_ENDIAN)
|
||||||
return (uint32x4_p)VectorLoad(off, (const byte*)src);
|
return (uint32x4_p)VecLoad(off, (const byte*)src);
|
||||||
# else
|
# else
|
||||||
return (uint32x4_p)Reverse(VectorLoad(off, (const byte*)src));
|
return (uint32x4_p)VecReverse(VecLoad(off, (const byte*)src));
|
||||||
# endif
|
# endif
|
||||||
#endif // _ARCH_PWR7
|
#endif // _ARCH_PWR7
|
||||||
}
|
}
|
||||||
@ -264,16 +264,16 @@ inline uint32x4_p VectorLoadBE(int off, const byte src[16])
|
|||||||
/// \param data the vector
|
/// \param data the vector
|
||||||
/// \param dest the byte array
|
/// \param dest the byte array
|
||||||
/// \details Stores a vector in native endian format to a byte array.
|
/// \details Stores a vector in native endian format to a byte array.
|
||||||
/// \details VectorStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
|
/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
|
||||||
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
|
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
|
||||||
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
|
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
|
||||||
/// memory addresses.
|
/// memory addresses.
|
||||||
/// \details VectorStore_ALTIVEC() is used automatically when POWER7 or above
|
/// \details VecStore_ALTIVEC() is used automatically when POWER7 or above
|
||||||
/// and its unaligned loads are not available.
|
/// and its unaligned loads are not available.
|
||||||
/// \note VectorStore does not require an aligned array.
|
/// \note VecStore does not require an aligned array.
|
||||||
/// \since Crypto++ 8.0
|
/// \since Crypto++ 8.0
|
||||||
template<class T>
|
template<class T>
|
||||||
inline void VectorStore_ALTIVEC(const T data, byte dest[16])
|
inline void VecStore_ALTIVEC(const T data, byte dest[16])
|
||||||
{
|
{
|
||||||
if (IsAlignedOn(dest, 16))
|
if (IsAlignedOn(dest, 16))
|
||||||
{
|
{
|
||||||
@ -300,16 +300,16 @@ inline void VectorStore_ALTIVEC(const T data, byte dest[16])
|
|||||||
/// \param off the byte offset into the array
|
/// \param off the byte offset into the array
|
||||||
/// \param dest the byte array
|
/// \param dest the byte array
|
||||||
/// \details Stores a vector in native endian format to a byte array.
|
/// \details Stores a vector in native endian format to a byte array.
|
||||||
/// \details VectorStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
|
/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
|
||||||
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
|
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
|
||||||
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
|
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
|
||||||
/// memory addresses.
|
/// memory addresses.
|
||||||
/// \details VectorStore_ALTIVEC() is used automatically when POWER7 or above
|
/// \details VecStore_ALTIVEC() is used automatically when POWER7 or above
|
||||||
/// and its unaligned loads are not available.
|
/// and its unaligned loads are not available.
|
||||||
/// \note VectorStore does not require an aligned array.
|
/// \note VecStore does not require an aligned array.
|
||||||
/// \since Crypto++ 8.0
|
/// \since Crypto++ 8.0
|
||||||
template<class T>
|
template<class T>
|
||||||
inline void VectorStore_ALTIVEC(const T data, int off, byte dest[16])
|
inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
|
||||||
{
|
{
|
||||||
if (IsAlignedOn(dest, 16))
|
if (IsAlignedOn(dest, 16))
|
||||||
{
|
{
|
||||||
@ -335,15 +335,15 @@ inline void VectorStore_ALTIVEC(const T data, int off, byte dest[16])
|
|||||||
/// \param data the vector
|
/// \param data the vector
|
||||||
/// \param dest the byte array
|
/// \param dest the byte array
|
||||||
/// \details Stores a vector in native endian format to a byte array.
|
/// \details Stores a vector in native endian format to a byte array.
|
||||||
/// \details VectorStore uses POWER7's <tt>vec_xst</tt> or
|
/// \details VecStore uses POWER7's <tt>vec_xst</tt> or
|
||||||
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
|
/// \details VecStore_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
|
/// is not available. VecStore_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorStore does not require an aligned array.
|
/// \note VecStore does not require an aligned array.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template<class T>
|
template<class T>
|
||||||
inline void VectorStore(const T data, byte dest[16])
|
inline void VecStore(const T data, byte dest[16])
|
||||||
{
|
{
|
||||||
#if defined(_ARCH_PWR7)
|
#if defined(_ARCH_PWR7)
|
||||||
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
@ -352,7 +352,7 @@ inline void VectorStore(const T data, byte dest[16])
|
|||||||
vec_vsx_st((uint8x16_p)data, 0, (byte*)dest);
|
vec_vsx_st((uint8x16_p)data, 0, (byte*)dest);
|
||||||
# endif
|
# endif
|
||||||
#else
|
#else
|
||||||
return VectorStore_ALTIVEC(data, 0, dest);
|
return VecStore_ALTIVEC(data, 0, dest);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
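Since VecLoad and VecStore are both native-endian, a load followed by a store must reproduce the original bytes. The short sketch below exercises that round trip; as with the earlier sketch, ppc_simd.h as the declaring header and an Altivec-enabled POWER build are assumptions.

    // Native-endian load/store round trip through VecStore.
    #include "ppc_simd.h"   // assumed header
    #include <cstring>
    #include <cstdio>

    int main()
    {
        using namespace CryptoPP;

        CryptoPP::byte src[16], dst[16] = {0};
        for (unsigned int i = 0; i < 16; ++i)
            src[i] = static_cast<CryptoPP::byte>(0xA0 + i);

        const uint32x4_p v = VecLoad(src);   // native-endian load
        VecStore(v, dst);                    // native-endian store

        std::printf("round trip ok: %s\n",
                    std::memcmp(src, dst, 16) == 0 ? "yes" : "no");
        return 0;
    }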
||||||
|
|
||||||
@ -362,15 +362,15 @@ inline void VectorStore(const T data, byte dest[16])
|
|||||||
/// \param off the byte offset into the array
|
/// \param off the byte offset into the array
|
||||||
/// \param dest the byte array
|
/// \param dest the byte array
|
||||||
/// \details Stores a vector in native endian format to a byte array.
|
/// \details Stores a vector in native endian format to a byte array.
|
||||||
/// \details VectorStore uses POWER7's <tt>vec_xst</tt> or
|
/// \details VecStore uses POWER7's <tt>vec_xst</tt> or
|
||||||
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
|
/// \details VecStore_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
|
/// is not available. VecStore_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorStore does not require an aligned array.
|
/// \note VecStore does not require an aligned array.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template<class T>
|
template<class T>
|
||||||
inline void VectorStore(const T data, int off, byte dest[16])
|
inline void VecStore(const T data, int off, byte dest[16])
|
||||||
{
|
{
|
||||||
#if defined(_ARCH_PWR7)
|
#if defined(_ARCH_PWR7)
|
||||||
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
@ -379,7 +379,7 @@ inline void VectorStore(const T data, int off, byte dest[16])
|
|||||||
vec_vsx_st((uint8x16_p)data, off, (byte*)dest);
|
vec_vsx_st((uint8x16_p)data, off, (byte*)dest);
|
||||||
# endif
|
# endif
|
||||||
#else
|
#else
|
||||||
return VectorStore_ALTIVEC(data, off, dest);
|
return VecStore_ALTIVEC(data, off, dest);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -388,17 +388,17 @@ inline void VectorStore(const T data, int off, byte dest[16])
|
|||||||
/// \param data the vector
|
/// \param data the vector
|
||||||
/// \param dest the byte array
|
/// \param dest the byte array
|
||||||
/// \details Stores a vector in native endian format to a byte array.
|
/// \details Stores a vector in native endian format to a byte array.
|
||||||
/// \details VectorStore uses POWER7's <tt>vec_xst</tt> or
|
/// \details VecStore uses POWER7's <tt>vec_xst</tt> or
|
||||||
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
|
/// \details VecStore_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
|
/// is not available. VecStore_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorStore does not require an aligned array.
|
/// \note VecStore does not require an aligned array.
|
||||||
/// \since Crypto++ 8.0
|
/// \since Crypto++ 8.0
|
||||||
template<class T>
|
template<class T>
|
||||||
inline void VectorStore(const T data, word32 dest[4])
|
inline void VecStore(const T data, word32 dest[4])
|
||||||
{
|
{
|
||||||
VectorStore((uint8x16_p)data, 0, (byte*)dest);
|
VecStore((uint8x16_p)data, 0, (byte*)dest);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// \brief Stores a vector to a word array
|
/// \brief Stores a vector to a word array
|
||||||
@ -407,17 +407,17 @@ inline void VectorStore(const T data, word32 dest[4])
|
|||||||
/// \param off the byte offset into the array
|
/// \param off the byte offset into the array
|
||||||
/// \param dest the byte array
|
/// \param dest the byte array
|
||||||
/// \details Stores a vector in native endian format to a byte array.
|
/// \details Stores a vector in native endian format to a byte array.
|
||||||
/// \details VectorStore uses POWER7's <tt>vec_xst</tt> or
|
/// \details VecStore uses POWER7's <tt>vec_xst</tt> or
|
||||||
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
|
/// \details VecStore_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
|
/// is not available. VecStore_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorStore does not require an aligned array.
|
/// \note VecStore does not require an aligned array.
|
||||||
/// \since Crypto++ 8.0
|
/// \since Crypto++ 8.0
|
||||||
template<class T>
|
template<class T>
|
||||||
inline void VectorStore(const T data, int off, word32 dest[4])
|
inline void VecStore(const T data, int off, word32 dest[4])
|
||||||
{
|
{
|
||||||
VectorStore((uint8x16_p)data, off, (byte*)dest);
|
VecStore((uint8x16_p)data, off, (byte*)dest);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// \brief Stores a vector to a byte array
|
/// \brief Stores a vector to a byte array
|
||||||
@ -425,16 +425,16 @@ inline void VectorStore(const T data, int off, word32 dest[4])
|
|||||||
/// \param src the vector
|
/// \param src the vector
|
||||||
/// \param dest the byte array
|
/// \param dest the byte array
|
||||||
/// \details Stores a vector in big endian format to a byte array.
|
/// \details Stores a vector in big endian format to a byte array.
|
||||||
/// VectorStoreBE will swap all bytes on little endian systems.
|
/// VecStoreBE will swap all bytes on little endian systems.
|
||||||
/// \details VectorStoreBE uses POWER7's <tt>vec_xst</tt> or
|
/// \details VecStoreBE uses POWER7's <tt>vec_xst</tt> or
|
||||||
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
|
/// \details VecStore_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
|
/// is not available. VecStore_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorStoreBE does not require an aligned array.
|
/// \note VecStoreBE does not require an aligned array.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T>
|
template <class T>
|
||||||
inline void VectorStoreBE(const T src, byte dest[16])
|
inline void VecStoreBE(const T src, byte dest[16])
|
||||||
{
|
{
|
||||||
#if defined(_ARCH_PWR7)
|
#if defined(_ARCH_PWR7)
|
||||||
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
@ -443,14 +443,14 @@ inline void VectorStoreBE(const T src, byte dest[16])
|
|||||||
# if (CRYPTOPP_BIG_ENDIAN)
|
# if (CRYPTOPP_BIG_ENDIAN)
|
||||||
vec_vsx_st((uint8x16_p)src, 0, (byte*)dest);
|
vec_vsx_st((uint8x16_p)src, 0, (byte*)dest);
|
||||||
# else
|
# else
|
||||||
vec_vsx_st((uint8x16_p)Reverse(src), 0, (byte*)dest);
|
vec_vsx_st((uint8x16_p)VecReverse(src), 0, (byte*)dest);
|
||||||
# endif
|
# endif
|
||||||
# endif
|
# endif
|
||||||
#else // _ARCH_PWR7
|
#else // _ARCH_PWR7
|
||||||
# if (CRYPTOPP_BIG_ENDIAN)
|
# if (CRYPTOPP_BIG_ENDIAN)
|
||||||
VectorStore((uint8x16_p)src, (byte*)dest);
|
VecStore((uint8x16_p)src, (byte*)dest);
|
||||||
# else
|
# else
|
||||||
VectorStore((uint8x16_p)Reverse(src), (byte*)dest);
|
VecStore((uint8x16_p)VecReverse(src), (byte*)dest);
|
||||||
# endif
|
# endif
|
||||||
#endif // _ARCH_PWR7
|
#endif // _ARCH_PWR7
|
||||||
}
|
}
|
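Because VecLoadBE and VecStoreBE both swap on little-endian hosts, loading and re-storing a block is byte-exact on either endianness, which is what makes the pair safe for wire formats such as counter blocks. A sketch, under the same assumptions as the earlier ones:

    // Big-endian load/store round trip.
    #include "ppc_simd.h"   // assumed header
    #include <cstring>
    #include <cstdio>

    int main()
    {
        using namespace CryptoPP;

        CryptoPP::byte in[16], out[16] = {0};
        for (unsigned int i = 0; i < 16; ++i)
            in[i] = static_cast<CryptoPP::byte>(i * 7 + 1);

        const uint32x4_p v = VecLoadBE(in);   // swaps on little-endian
        VecStoreBE(v, out);                   // swaps back on the way out

        std::printf("big-endian round trip ok: %s\n",
                    std::memcmp(in, out, 16) == 0 ? "yes" : "no");
        return 0;
    }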
||||||
@ -461,16 +461,16 @@ inline void VectorStoreBE(const T src, byte dest[16])
|
|||||||
/// \param off offset into the dest byte array
|
/// \param off offset into the dest byte array
|
||||||
/// \param dest the byte array
|
/// \param dest the byte array
|
||||||
/// \details Stores a vector in big endian format to a byte array.
|
/// \details Stores a vector in big endian format to a byte array.
|
||||||
/// VectorStoreBE will swap all bytes on little endian systems.
|
/// VecStoreBE will swap all bytes on little endian systems.
|
||||||
/// \details VectorStoreBE uses POWER7's <tt>vec_xst</tt> or
|
/// \details VecStoreBE uses POWER7's <tt>vec_xst</tt> or
|
||||||
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
/// <tt>vec_vsx_st</tt> if available. The instructions do not require
|
||||||
/// an aligned memory address.
|
/// an aligned memory address.
|
||||||
/// \details VectorStore_ALTIVEC() is used if POWER7 or above
|
/// \details VecStore_ALTIVEC() is used if POWER7 or above
|
||||||
/// is not available. VectorStore_ALTIVEC() is relatively expensive.
|
/// is not available. VecStore_ALTIVEC() is relatively expensive.
|
||||||
/// \note VectorStoreBE does not require an aligned array.
|
/// \note VecStoreBE does not require an aligned array.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T>
|
template <class T>
|
||||||
inline void VectorStoreBE(const T src, int off, byte dest[16])
|
inline void VecStoreBE(const T src, int off, byte dest[16])
|
||||||
{
|
{
|
||||||
#if defined(_ARCH_PWR7)
|
#if defined(_ARCH_PWR7)
|
||||||
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
@ -479,14 +479,14 @@ inline void VectorStoreBE(const T src, int off, byte dest[16])
|
|||||||
# if (CRYPTOPP_BIG_ENDIAN)
|
# if (CRYPTOPP_BIG_ENDIAN)
|
||||||
vec_vsx_st((uint8x16_p)src, off, (byte*)dest);
|
vec_vsx_st((uint8x16_p)src, off, (byte*)dest);
|
||||||
# else
|
# else
|
||||||
vec_vsx_st((uint8x16_p)Reverse(src), off, (byte*)dest);
|
vec_vsx_st((uint8x16_p)VecReverse(src), off, (byte*)dest);
|
||||||
# endif
|
# endif
|
||||||
# endif
|
# endif
|
||||||
#else // _ARCH_PWR7
|
#else // _ARCH_PWR7
|
||||||
# if (CRYPTOPP_BIG_ENDIAN)
|
# if (CRYPTOPP_BIG_ENDIAN)
|
||||||
VectorStore((uint8x16_p)src, off, (byte*)dest);
|
VecStore((uint8x16_p)src, off, (byte*)dest);
|
||||||
# else
|
# else
|
||||||
VectorStore((uint8x16_p)Reverse(src), off, (byte*)dest);
|
VecStore((uint8x16_p)VecReverse(src), off, (byte*)dest);
|
||||||
# endif
|
# endif
|
||||||
#endif // _ARCH_PWR7
|
#endif // _ARCH_PWR7
|
||||||
}
|
}
|
||||||
@ -498,12 +498,12 @@ inline void VectorStoreBE(const T src, int off, byte dest[16])
|
|||||||
/// \param vec the vector
|
/// \param vec the vector
|
||||||
/// \param mask vector mask
|
/// \param mask vector mask
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorPermute returns a new vector from vec based on
|
/// \details VecPermute returns a new vector from vec based on
|
||||||
/// mask. mask is a uint8x16_p type vector. The return
|
/// mask. mask is a uint8x16_p type vector. The return
|
||||||
/// vector is the same type as vec.
|
/// vector is the same type as vec.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorPermute(const T1 vec, const T2 mask)
|
inline T1 VecPermute(const T1 vec, const T2 mask)
|
||||||
{
|
{
|
||||||
return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
|
return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
|
||||||
}
|
}
|
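A common use of the single-vector permute is a whole-block byte reversal. The sketch below assumes the natural element-order semantics that GCC and XL C give vec_perm (which is how the library's own code uses it), plus the same header and build assumptions as the earlier sketches.

    // Reverse the 16 bytes of a block with a permute mask.
    #include "ppc_simd.h"   // assumed header
    #include <cstring>
    #include <cstdio>

    int main()
    {
        using namespace CryptoPP;

        CryptoPP::byte buf[16], rev[16], expect[16];
        for (unsigned int i = 0; i < 16; ++i)
        {
            buf[i]    = static_cast<CryptoPP::byte>(i);
            expect[i] = static_cast<CryptoPP::byte>(15 - i);
        }

        // Each mask byte is an index into the source vector's bytes.
        const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
        const uint8x16_p v = (uint8x16_p)VecLoad(buf);
        VecStore(VecPermute(v, mask), rev);

        std::printf("byte reversal ok: %s\n",
                    std::memcmp(rev, expect, 16) == 0 ? "yes" : "no");
        return 0;
    }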
||||||
@ -515,12 +515,12 @@ inline T1 VectorPermute(const T1 vec, const T2 mask)
|
|||||||
/// \param vec2 the second vector
|
/// \param vec2 the second vector
|
||||||
/// \param mask vector mask
|
/// \param mask vector mask
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorPermute returns a new vector from vec1 and vec2
|
/// \details VecPermute returns a new vector from vec1 and vec2
|
||||||
/// based on mask. mask is a uint8x16_p type vector. The return
|
/// based on mask. mask is a uint8x16_p type vector. The return
|
||||||
/// vector is the same type as vec1.
|
/// vector is the same type as vec1.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorPermute(const T1 vec1, const T1 vec2, const T2 mask)
|
inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
|
||||||
{
|
{
|
||||||
return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask);
|
return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask);
|
||||||
}
|
}
|
||||||
@ -531,11 +531,11 @@ inline T1 VectorPermute(const T1 vec1, const T1 vec2, const T2 mask)
|
|||||||
/// \param vec1 the first vector
|
/// \param vec1 the first vector
|
||||||
/// \param vec2 the second vector
|
/// \param vec2 the second vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorAnd returns a new vector from vec1 and vec2. The return
|
/// \details VecAnd returns a new vector from vec1 and vec2. The return
|
||||||
/// vector is the same type as vec1.
|
/// vector is the same type as vec1.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorAnd(const T1 vec1, const T2 vec2)
|
inline T1 VecAnd(const T1 vec1, const T2 vec2)
|
||||||
{
|
{
|
||||||
return (T1)vec_and(vec1, (T1)vec2);
|
return (T1)vec_and(vec1, (T1)vec2);
|
||||||
}
|
}
|
||||||
@ -546,11 +546,11 @@ inline T1 VectorAnd(const T1 vec1, const T2 vec2)
|
|||||||
/// \param vec1 the first vector
|
/// \param vec1 the first vector
|
||||||
/// \param vec2 the second vector
|
/// \param vec2 the second vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorOr returns a new vector from vec1 and vec2. The return
|
/// \details VecOr returns a new vector from vec1 and vec2. The return
|
||||||
/// vector is the same type as vec1.
|
/// vector is the same type as vec1.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorOr(const T1 vec1, const T2 vec2)
|
inline T1 VecOr(const T1 vec1, const T2 vec2)
|
||||||
{
|
{
|
||||||
return (T1)vec_or(vec1, (T1)vec2);
|
return (T1)vec_or(vec1, (T1)vec2);
|
||||||
}
|
}
|
||||||
@ -561,11 +561,11 @@ inline T1 VectorOr(const T1 vec1, const T2 vec2)
|
|||||||
/// \param vec1 the first vector
|
/// \param vec1 the first vector
|
||||||
/// \param vec2 the second vector
|
/// \param vec2 the second vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorXor returns a new vector from vec1 and vec2. The return
|
/// \details VecXor returns a new vector from vec1 and vec2. The return
|
||||||
/// vector is the same type as vec1.
|
/// vector is the same type as vec1.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorXor(const T1 vec1, const T2 vec2)
|
inline T1 VecXor(const T1 vec1, const T2 vec2)
|
||||||
{
|
{
|
||||||
return (T1)vec_xor(vec1, (T1)vec2);
|
return (T1)vec_xor(vec1, (T1)vec2);
|
||||||
}
|
}
|
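The three logical helpers are thin wrappers over vec_and, vec_or and vec_xor, so the usual lane-agnostic identities hold. A small self-check, under the same assumptions as the earlier sketches:

    // XOR is its own inverse, AND is idempotent, OR with zero is identity.
    #include "ppc_simd.h"   // assumed header
    #include <cstdio>

    int main()
    {
        using namespace CryptoPP;

        const uint32x4_p x = {0xDEADBEEF, 0x01234567, 0x89ABCDEF, 0xFFFF0000};
        const uint32x4_p y = {0x0F0F0F0F, 0xF0F0F0F0, 0x00FF00FF, 0xFF00FF00};
        const uint32x4_p z = {0, 0, 0, 0};

        const bool ok = VecEqual(VecXor(x, VecXor(x, y)), y) &&
                        VecEqual(VecAnd(x, x), x) &&
                        VecEqual(VecOr(x, z), x);

        std::printf("logical identities hold: %s\n", ok ? "yes" : "no");
        return 0;
    }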
||||||
@ -576,12 +576,12 @@ inline T1 VectorXor(const T1 vec1, const T2 vec2)
|
|||||||
/// \param vec1 the first vector
|
/// \param vec1 the first vector
|
||||||
/// \param vec2 the second vector
|
/// \param vec2 the second vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorAdd returns a new vector from vec1 and vec2.
|
/// \details VecAdd returns a new vector from vec1 and vec2.
|
||||||
/// vec2 is cast to the same type as vec1. The return vector
|
/// vec2 is cast to the same type as vec1. The return vector
|
||||||
/// is the same type as vec1.
|
/// is the same type as vec1.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorAdd(const T1 vec1, const T2 vec2)
|
inline T1 VecAdd(const T1 vec1, const T2 vec2)
|
||||||
{
|
{
|
||||||
return (T1)vec_add(vec1, (T1)vec2);
|
return (T1)vec_add(vec1, (T1)vec2);
|
||||||
}
|
}
|
||||||
@ -591,12 +591,12 @@ inline T1 VectorAdd(const T1 vec1, const T2 vec2)
|
|||||||
/// \tparam T2 vector type
|
/// \tparam T2 vector type
|
||||||
/// \param vec1 the first vector
|
/// \param vec1 the first vector
|
||||||
/// \param vec2 the second vector
|
/// \param vec2 the second vector
|
||||||
/// \details VectorSub returns a new vector from vec1 and vec2.
|
/// \details VecSub returns a new vector from vec1 and vec2.
|
||||||
/// vec2 is cast to the same type as vec1. The return vector
|
/// vec2 is cast to the same type as vec1. The return vector
|
||||||
/// is the same type as vec1.
|
/// is the same type as vec1.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorSub(const T1 vec1, const T2 vec2)
|
inline T1 VecSub(const T1 vec1, const T2 vec2)
|
||||||
{
|
{
|
||||||
return (T1)vec_sub(vec1, (T1)vec2);
|
return (T1)vec_sub(vec1, (T1)vec2);
|
||||||
}
|
}
|
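VecAdd and VecSub work lane-wise modulo 2^32 on uint32x4_p operands, so adding and then subtracting the same vector is the identity in every lane, including a wrapping one. A sketch, same assumptions as above:

    // Lane-wise modular add/sub round trip.
    #include "ppc_simd.h"   // assumed header
    #include <cstdio>

    int main()
    {
        using namespace CryptoPP;

        const uint32x4_p x = {0xFFFFFFFF, 1, 2, 3};   // lane 0 wraps on add
        const uint32x4_p y = {2, 0x80000000, 5, 7};

        const uint32x4_p s = VecAdd(x, y);
        std::printf("add/sub round trip ok: %s\n",
                    VecEqual(VecSub(s, y), x) ? "yes" : "no");
        return 0;
    }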
||||||
@ -607,10 +607,10 @@ inline T1 VectorSub(const T1 vec1, const T2 vec2)
|
|||||||
/// \param vec1 the first vector
|
/// \param vec1 the first vector
|
||||||
/// \param vec2 the second vector
|
/// \param vec2 the second vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorAdd64 returns a new vector from vec1 and vec2.
|
/// \details VecAdd64 returns a new vector from vec1 and vec2.
|
||||||
/// vec1 and vec2 are added as uint64x2_p quantities.
|
/// vec1 and vec2 are added as uint64x2_p quantities.
|
||||||
/// \since Crypto++ 8.0
|
/// \since Crypto++ 8.0
|
||||||
inline uint32x4_p VectorAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
|
inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
|
||||||
{
|
{
|
||||||
#if defined(_ARCH_PWR8)
|
#if defined(_ARCH_PWR8)
|
||||||
return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
|
return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
|
||||||
@ -632,22 +632,22 @@ inline uint32x4_p VectorAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
|
|||||||
/// \tparam T vector type
|
/// \tparam T vector type
|
||||||
/// \param vec the vector
|
/// \param vec the vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorShiftLeftOctet() returns a new vector after shifting the
|
/// \details VecShiftLeftOctet() returns a new vector after shifting the
|
||||||
/// concatenation of the zero vector and the source vector by the specified
|
/// concatenation of the zero vector and the source vector by the specified
|
||||||
/// number of bytes. The return vector is the same type as vec.
|
/// number of bytes. The return vector is the same type as vec.
|
||||||
/// \details On big endian machines VectorShiftLeftOctet() is <tt>vec_sld(a, z,
|
/// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
|
||||||
/// c)</tt>. On little endian machines VectorShiftLeftOctet() is translated to
|
/// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
|
||||||
/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
|
/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
|
||||||
/// if on a big endian machine as shown below.
|
/// if on a big endian machine as shown below.
|
||||||
/// <pre>
|
/// <pre>
|
||||||
/// uint8x16_p x = VectorLoad(ptr);
|
/// uint8x16_p x = VecLoad(ptr);
|
||||||
/// uint8x16_p y = VectorShiftLeftOctet<12>(x);
|
/// uint8x16_p y = VecShiftLeftOctet<12>(x);
|
||||||
/// </pre>
|
/// </pre>
|
||||||
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
||||||
/// endian sensitive?</A> on Stack Overflow
|
/// endian sensitive?</A> on Stack Overflow
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <unsigned int C, class T>
|
template <unsigned int C, class T>
|
||||||
inline T VectorShiftLeftOctet(const T vec)
|
inline T VecShiftLeftOctet(const T vec)
|
||||||
{
|
{
|
||||||
const T zero = {0};
|
const T zero = {0};
|
||||||
if (C >= 16)
|
if (C >= 16)
|
||||||
@ -675,22 +675,22 @@ inline T VectorShiftLeftOctet(const T vec)
|
|||||||
/// \tparam T vector type
|
/// \tparam T vector type
|
||||||
/// \param vec the vector
|
/// \param vec the vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorShiftRightOctet() returns a new vector after shifting the
|
/// \details VecShiftRightOctet() returns a new vector after shifting the
|
||||||
/// concatenation of the zero vector and the source vector by the specified
|
/// concatenation of the zero vector and the source vector by the specified
|
||||||
/// number of bytes. The return vector is the same type as vec.
|
/// number of bytes. The return vector is the same type as vec.
|
||||||
/// \details On big endian machines VectorShiftRightOctet() is <tt>vec_sld(a, z,
|
/// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(a, z,
|
||||||
/// c)</tt>. On little endian machines VectorShiftRightOctet() is translated to
|
/// c)</tt>. On little endian machines VecShiftRightOctet() is translated to
|
||||||
/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
|
/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
|
||||||
/// if on a big endian machine as shown below.
|
/// if on a big endian machine as shown below.
|
||||||
/// <pre>
|
/// <pre>
|
||||||
/// uint8x16_p x = VectorLoad(ptr);
|
/// uint8x16_p x = VecLoad(ptr);
|
||||||
/// uint8x16_p y = VectorShiftRightOctet<12>(x);
|
/// uint8x16_p y = VecShiftRightOctet<12>(x);
|
||||||
/// </pre>
|
/// </pre>
|
||||||
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
||||||
/// endian sensitive?</A> on Stack Overflow
|
/// endian sensitive?</A> on Stack Overflow
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <unsigned int C, class T>
|
template <unsigned int C, class T>
|
||||||
inline T VectorShiftRightOctet(const T vec)
|
inline T VecShiftRightOctet(const T vec)
|
||||||
{
|
{
|
||||||
const T zero = {0};
|
const T zero = {0};
|
||||||
if (C >= 16)
|
if (C >= 16)
|
||||||
@ -718,14 +718,14 @@ inline T VectorShiftRightOctet(const T vec)
|
|||||||
/// \tparam T vector type
|
/// \tparam T vector type
|
||||||
/// \param vec the vector
|
/// \param vec the vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorRotateLeftOctet() returns a new vector after rotating the
|
/// \details VecRotateLeftOctet() returns a new vector after rotating the
|
||||||
/// concatenation of the source vector with itself by the specified
|
/// concatenation of the source vector with itself by the specified
|
||||||
/// number of bytes. The return vector is the same type as vec.
|
/// number of bytes. The return vector is the same type as vec.
|
||||||
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
||||||
/// endian sensitive?</A> on Stack Overflow
|
/// endian sensitive?</A> on Stack Overflow
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <unsigned int C, class T>
|
template <unsigned int C, class T>
|
||||||
inline T VectorRotateLeftOctet(const T vec)
|
inline T VecRotateLeftOctet(const T vec)
|
||||||
{
|
{
|
||||||
enum { R = C&0xf };
|
enum { R = C&0xf };
|
||||||
#if (CRYPTOPP_BIG_ENDIAN)
|
#if (CRYPTOPP_BIG_ENDIAN)
|
||||||
@ -740,14 +740,14 @@ inline T VectorRotateLeftOctet(const T vec)
|
|||||||
/// \tparam T vector type
|
/// \tparam T vector type
|
||||||
/// \param vec the vector
|
/// \param vec the vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorRotateRightOctet() returns a new vector after rotating the
|
/// \details VecRotateRightOctet() returns a new vector after rotating the
|
||||||
/// concatenation of the source vector with itself by the specified
|
/// concatenation of the source vector with itself by the specified
|
||||||
/// number of bytes. The return vector is the same type as vec.
|
/// number of bytes. The return vector is the same type as vec.
|
||||||
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
||||||
/// endian sensitive?</A> on Stack Overflow
|
/// endian sensitive?</A> on Stack Overflow
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <unsigned int C, class T>
|
template <unsigned int C, class T>
|
||||||
inline T VectorRotateRightOctet(const T vec)
|
inline T VecRotateRightOctet(const T vec)
|
||||||
{
|
{
|
||||||
enum { R = C&0xf };
|
enum { R = C&0xf };
|
||||||
#if (CRYPTOPP_BIG_ENDIAN)
|
#if (CRYPTOPP_BIG_ENDIAN)
|
||||||
@ -761,9 +761,9 @@ inline T VectorRotateRightOctet(const T vec)
|
|||||||
/// \tparam C shift bit count
|
/// \tparam C shift bit count
|
||||||
/// \param vec the vector
|
/// \param vec the vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorRotateLeft rotates each element in a packed vector by bit count.
|
/// \details VecRotateLeft rotates each element in a packed vector by bit count.
|
||||||
template<unsigned int C>
|
template<unsigned int C>
|
||||||
inline uint32x4_p VectorRotateLeft(const uint32x4_p vec)
|
inline uint32x4_p VecRotateLeft(const uint32x4_p vec)
|
||||||
{
|
{
|
||||||
const uint32x4_p m = {C, C, C, C};
|
const uint32x4_p m = {C, C, C, C};
|
||||||
return vec_rl(vec, m);
|
return vec_rl(vec, m);
|
||||||
@ -773,9 +773,9 @@ inline uint32x4_p VectorRotateLeft(const uint32x4_p vec)
|
|||||||
/// \tparam C shift bit count
|
/// \tparam C shift bit count
|
||||||
/// \param vec the vector
|
/// \param vec the vector
|
||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \details VectorRotateRight rotates each element in a packed vector by bit count.
|
/// \details VecRotateRight rotates each element in a packed vector by bit count.
|
||||||
template<unsigned int C>
|
template<unsigned int C>
|
||||||
inline uint32x4_p VectorRotateRight(const uint32x4_p vec)
|
inline uint32x4_p VecRotateRight(const uint32x4_p vec)
|
||||||
{
|
{
|
||||||
const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
|
const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
|
||||||
return vec_rl(vec, m);
|
return vec_rl(vec, m);
|
||||||
@ -787,7 +787,7 @@ inline uint32x4_p VectorRotateRight(const uint32x4_p vec)
|
|||||||
/// \returns vector
|
/// \returns vector
|
||||||
/// \since Crypto++ 7.0
|
/// \since Crypto++ 7.0
|
||||||
template <class T>
|
template <class T>
|
||||||
inline T VectorSwapWords(const T vec)
|
inline T VecSwapWords(const T vec)
|
||||||
{
|
{
|
||||||
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
|
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
|
||||||
}
|
}
|
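VecRotateLeft and VecRotateRight rotate every 32-bit lane by the same count, so expected values can be written down lane for lane, and VecSwapWords applied twice restores the original vector. A sketch, same assumptions as above:

    // Per-lane bit rotation plus the 64-bit half swap.
    #include "ppc_simd.h"   // assumed header
    #include <cstdio>

    int main()
    {
        using namespace CryptoPP;

        const uint32x4_p x = {0x12345678, 0x9ABCDEF0, 0x00000001, 0x80000000};
        const uint32x4_p r = {0x34567812, 0xBCDEF09A, 0x00000100, 0x00000080};

        const bool rot_ok  = VecEqual(VecRotateLeft<8>(x), r) &&
                             VecEqual(VecRotateRight<8>(r), x);
        const bool swap_ok = VecEqual(VecSwapWords(VecSwapWords(x)), x);

        std::printf("rotate ok: %s, swap ok: %s\n",
                    rot_ok ? "yes" : "no", swap_ok ? "yes" : "no");
        return 0;
    }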
||||||
@ -796,34 +796,34 @@ inline T VectorSwapWords(const T vec)
|
|||||||
/// \tparam T vector type
|
/// \tparam T vector type
|
||||||
/// \param val the vector
|
/// \param val the vector
|
||||||
/// \returns vector created from low dword
|
/// \returns vector created from low dword
|
||||||
/// \details VectorGetLow() extracts the low dword from a vector. The low dword
|
/// \details VecGetLow() extracts the low dword from a vector. The low dword
|
||||||
/// is composed of the least significant bits and occupies bytes 8 through 15
|
/// is composed of the least significant bits and occupies bytes 8 through 15
|
||||||
/// when viewed as a big endian array. The return vector is the same type as
|
/// when viewed as a big endian array. The return vector is the same type as
|
||||||
/// the original vector and padded with 0's in the most significant bit positions.
|
/// the original vector and padded with 0's in the most significant bit positions.
|
||||||
template <class T>
|
template <class T>
|
||||||
inline T VectorGetLow(const T val)
|
inline T VecGetLow(const T val)
|
||||||
{
|
{
|
||||||
//const T zero = {0};
|
//const T zero = {0};
|
||||||
//const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
|
//const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
|
||||||
//return (T)vec_perm(zero, val, mask);
|
//return (T)vec_perm(zero, val, mask);
|
||||||
return VectorShiftRightOctet<8>(VectorShiftLeftOctet<8>(val));
|
return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// \brief Extract a dword from a vector
|
/// \brief Extract a dword from a vector
|
||||||
/// \tparam T vector type
|
/// \tparam T vector type
|
||||||
/// \param val the vector
|
/// \param val the vector
|
||||||
/// \returns vector created from high dword
|
/// \returns vector created from high dword
|
||||||
/// \details VectorGetHigh() extracts the high dword from a vector. The high dword
|
/// \details VecGetHigh() extracts the high dword from a vector. The high dword
|
||||||
/// is composed of the most significant bits and occupies bytes 0 through 7
|
/// is composed of the most significant bits and occupies bytes 0 through 7
|
||||||
/// when viewed as a big endian array. The return vector is the same type as
|
/// when viewed as a big endian array. The return vector is the same type as
|
||||||
/// the original vector and padded with 0's in the most significant bit positions.
|
/// the original vector and padded with 0's in the most significant bit positions.
|
||||||
template <class T>
|
template <class T>
|
||||||
inline T VectorGetHigh(const T val)
|
inline T VecGetHigh(const T val)
|
||||||
{
|
{
|
||||||
//const T zero = {0};
|
//const T zero = {0};
|
||||||
//const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
|
//const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
|
||||||
//return (T)vec_perm(zero, val, mask);
|
//return (T)vec_perm(zero, val, mask);
|
||||||
return VectorShiftRightOctet<8>(val);
|
return VecShiftRightOctet<8>(val);
|
||||||
}
|
}
|
||||||
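VecGetLow and VecGetHigh partition a vector into its two dwords, each returned in the low positions with zero padding, so shifting the high part back up and OR-ing in the low part rebuilds the input. A sketch, same assumptions as above:

    // Recombine the dword extracts into the original vector.
    #include "ppc_simd.h"   // assumed header
    #include <cstdio>

    int main()
    {
        using namespace CryptoPP;

        const uint32x4_p v = {0x00112233, 0x44556677, 0x8899AABB, 0xCCDDEEFF};

        const uint32x4_p hi = VecGetHigh(v);
        const uint32x4_p lo = VecGetLow(v);
        const uint32x4_p rebuilt = VecOr(VecShiftLeftOctet<8>(hi), lo);

        std::printf("recombined ok: %s\n", VecEqual(rebuilt, v) ? "yes" : "no");
        return 0;
    }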
|
|
||||||
/// \brief Compare two vectors
|
/// \brief Compare two vectors
|
||||||
@ -833,7 +833,7 @@ inline T VectorGetHigh(const T val)
|
|||||||
/// \param vec2 the second vector
|
/// \param vec2 the second vector
|
||||||
/// \returns true if vec1 equals vec2, false otherwise
|
/// \returns true if vec1 equals vec2, false otherwise
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline bool VectorEqual(const T1 vec1, const T2 vec2)
|
inline bool VecEqual(const T1 vec1, const T2 vec2)
|
||||||
{
|
{
|
||||||
return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
|
return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
|
||||||
}
|
}
|
||||||
@ -845,7 +845,7 @@ inline bool VectorEqual(const T1 vec1, const T2 vec2)
|
|||||||
/// \param vec2 the second vector
|
/// \param vec2 the second vector
|
||||||
/// \returns true if vec1 does not equal vec2, false otherwise
|
/// \returns true if vec1 does not equal vec2, false otherwise
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline bool VectorNotEqual(const T1 vec1, const T2 vec2)
|
inline bool VecNotEqual(const T1 vec1, const T2 vec2)
|
||||||
{
|
{
|
||||||
return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
|
return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
|
||||||
}
|
}
|
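VecEqual and VecNotEqual reduce vec_all_eq over the four 32-bit lanes, so changing any single lane flips the result. A sketch, same assumptions as above:

    // Whole-vector comparison helpers.
    #include "ppc_simd.h"   // assumed header
    #include <cstdio>

    int main()
    {
        using namespace CryptoPP;

        const uint32x4_p a = {1, 2, 3, 4};
        const uint32x4_p one = {0, 0, 0, 1};
        uint32x4_p b = a;

        std::printf("a == b: %s\n", VecEqual(a, b) ? "yes" : "no");       // yes
        b = VecAdd(b, one);                                               // perturb one lane
        std::printf("a != b: %s\n", VecNotEqual(a, b) ? "yes" : "no");    // yes
        return 0;
    }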
||||||
@ -859,11 +859,11 @@ inline bool VectorNotEqual(const T1 vec1, const T2 vec2)
|
|||||||
/// \tparam T2 vector type
|
/// \tparam T2 vector type
|
||||||
/// \param state the state vector
|
/// \param state the state vector
|
||||||
/// \param key the subkey vector
|
/// \param key the subkey vector
|
||||||
/// \details VectorEncrypt performs one round of AES encryption of state
|
/// \details VecEncrypt performs one round of AES encryption of state
|
||||||
/// using subkey key. The return vector is the same type as state.
|
/// using subkey key. The return vector is the same type as state.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorEncrypt(const T1 state, const T2 key)
|
inline T1 VecEncrypt(const T1 state, const T2 key)
|
||||||
{
|
{
|
||||||
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
|
return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
|
||||||
@ -879,11 +879,11 @@ inline T1 VectorEncrypt(const T1 state, const T2 key)
|
|||||||
/// \tparam T2 vector type
|
/// \tparam T2 vector type
|
||||||
/// \param state the state vector
|
/// \param state the state vector
|
||||||
/// \param key the subkey vector
|
/// \param key the subkey vector
|
||||||
/// \details VectorEncryptLast performs the final round of AES encryption
|
/// \details VecEncryptLast performs the final round of AES encryption
|
||||||
/// of state using subkey key. The return vector is the same type as state.
|
/// of state using subkey key. The return vector is the same type as state.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorEncryptLast(const T1 state, const T2 key)
|
inline T1 VecEncryptLast(const T1 state, const T2 key)
|
||||||
{
|
{
|
||||||
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
|
return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
|
||||||
@ -899,11 +899,11 @@ inline T1 VectorEncryptLast(const T1 state, const T2 key)
|
|||||||
/// \tparam T2 vector type
|
/// \tparam T2 vector type
|
||||||
/// \param state the state vector
|
/// \param state the state vector
|
||||||
/// \param key the subkey vector
|
/// \param key the subkey vector
|
||||||
/// \details VectorDecrypt performs one round of AES decryption of state
|
/// \details VecDecrypt performs one round of AES decryption of state
|
||||||
/// using subkey key. The return vector is the same type as state.
|
/// using subkey key. The return vector is the same type as state.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorDecrypt(const T1 state, const T2 key)
|
inline T1 VecDecrypt(const T1 state, const T2 key)
|
||||||
{
|
{
|
||||||
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
|
return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
|
||||||
@ -919,11 +919,11 @@ inline T1 VectorDecrypt(const T1 state, const T2 key)
|
|||||||
/// \tparam T2 vector type
|
/// \tparam T2 vector type
|
||||||
/// \param state the state vector
|
/// \param state the state vector
|
||||||
/// \param key the subkey vector
|
/// \param key the subkey vector
|
||||||
/// \details VectorDecryptLast performs the final round of AES decryption
|
/// \details VecDecryptLast performs the final round of AES decryption
|
||||||
/// of state using subkey key. The return vector is the same type as state.
|
/// of state using subkey key. The return vector is the same type as state.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <class T1, class T2>
|
template <class T1, class T2>
|
||||||
inline T1 VectorDecryptLast(const T1 state, const T2 key)
|
inline T1 VecDecryptLast(const T1 state, const T2 key)
|
||||||
{
|
{
|
||||||
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
|
return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
|
||||||
@ -939,11 +939,11 @@ inline T1 VectorDecryptLast(const T1 state, const T2 key)
|
|||||||
/// \tparam subfunc sub-function
|
/// \tparam subfunc sub-function
|
||||||
/// \tparam T vector type
|
/// \tparam T vector type
|
||||||
/// \param vec the block to transform
|
/// \param vec the block to transform
|
||||||
/// \details VectorSHA256 selects sigma0, sigma1, Sigma0, Sigma1 based on
|
/// \details VecSHA256 selects sigma0, sigma1, Sigma0, Sigma1 based on
|
||||||
/// func and subfunc. The return vector is the same type as vec.
|
/// func and subfunc. The return vector is the same type as vec.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <int func, int subfunc, class T>
|
template <int func, int subfunc, class T>
|
||||||
inline T VectorSHA256(const T vec)
|
inline T VecSHA256(const T vec)
|
||||||
{
|
{
|
||||||
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
return (T)__vshasigmaw((uint32x4_p)vec, func, subfunc);
|
return (T)__vshasigmaw((uint32x4_p)vec, func, subfunc);
|
||||||
@ -959,11 +959,11 @@ inline T VectorSHA256(const T vec)
|
|||||||
/// \tparam subfunc sub-function
|
/// \tparam subfunc sub-function
|
||||||
/// \tparam T vector type
|
/// \tparam T vector type
|
||||||
/// \param vec the block to transform
|
/// \param vec the block to transform
|
||||||
/// \details VectorSHA512 selects sigma0, sigma1, Sigma0, Sigma1 based on
|
/// \details VecSHA512 selects sigma0, sigma1, Sigma0, Sigma1 based on
|
||||||
/// func and subfunc. The return vector is the same type as vec.
|
/// func and subfunc. The return vector is the same type as vec.
|
||||||
/// \since Crypto++ 6.0
|
/// \since Crypto++ 6.0
|
||||||
template <int func, int subfunc, class T>
|
template <int func, int subfunc, class T>
|
||||||
inline T VectorSHA512(const T vec)
|
inline T VecSHA512(const T vec)
|
||||||
{
|
{
|
||||||
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
return (T)__vshasigmad((uint64x2_p)vec, func, subfunc);
|
return (T)__vshasigmad((uint64x2_p)vec, func, subfunc);
|
||||||
|
@ -214,12 +214,12 @@ bool CPU_ProbePower8()
|
|||||||
#if defined(__xlc__) || defined(__xlC__)
|
#if defined(__xlc__) || defined(__xlC__)
|
||||||
const uint64x2_p v1 = (uint64x2_p)vec_xl(0, (byte*)w1);
|
const uint64x2_p v1 = (uint64x2_p)vec_xl(0, (byte*)w1);
|
||||||
const uint64x2_p v2 = (uint64x2_p)vec_xl(0, (byte*)w2);
|
const uint64x2_p v2 = (uint64x2_p)vec_xl(0, (byte*)w2);
|
||||||
const uint64x2_p v3 = vec_add(v1, v2); // 64-bit add
|
const uint64x2_p v3 = VecAdd(v1, v2); // 64-bit add
|
||||||
vec_xst((uint8x16_p)v3, 0, (byte*)w3);
|
vec_xst((uint8x16_p)v3, 0, (byte*)w3);
|
||||||
#else
|
#else
|
||||||
const uint64x2_p v1 = (uint64x2_p)vec_vsx_ld(0, (byte*)w1);
|
const uint64x2_p v1 = (uint64x2_p)vec_vsx_ld(0, (byte*)w1);
|
||||||
const uint64x2_p v2 = (uint64x2_p)vec_vsx_ld(0, (byte*)w2);
|
const uint64x2_p v2 = (uint64x2_p)vec_vsx_ld(0, (byte*)w2);
|
||||||
const uint64x2_p v3 = vec_add(v1, v2); // 64-bit add
|
const uint64x2_p v3 = VecAdd(v1, v2); // 64-bit add
|
||||||
vec_vsx_st((uint8x16_p)v3, 0, (byte*)w3);
|
vec_vsx_st((uint8x16_p)v3, 0, (byte*)w3);
|
||||||
#endif
|
#endif
|
||||||
|
|
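The probe above executes a 64-bit vector add once to decide whether the POWER8 paths are usable; callers normally consult the cached feature flag instead of re-running the instruction. The sketch below assumes a HasPower8()-style query is exposed through cpu.h, as the CPU_Probe naming suggests; both the header and the function name are assumptions, not taken from this commit.

    // Runtime dispatch on the feature the probe detects.
    #include "cpu.h"        // assumed header; HasPower8() is an assumed query
    #include <cstdio>

    int main()
    {
        if (CryptoPP::HasPower8())
            std::printf("using the POWER8 vector code paths\n");
        else
            std::printf("falling back to the pre-POWER8 code paths\n");
        return 0;
    }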
||||||
@ -265,13 +265,13 @@ bool CPU_ProbeAES()
|
|||||||
0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08};
|
0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08};
|
||||||
byte r[16] = {255}, z[16] = {};
|
byte r[16] = {255}, z[16] = {};
|
||||||
|
|
||||||
uint8x16_p k = (uint8x16_p)VectorLoad(0, key);
|
uint8x16_p k = (uint8x16_p)VecLoad(0, key);
|
||||||
uint8x16_p s = (uint8x16_p)VectorLoad(0, state);
|
uint8x16_p s = (uint8x16_p)VecLoad(0, state);
|
||||||
s = VectorEncrypt(s, k);
|
s = VecEncrypt(s, k);
|
||||||
s = VectorEncryptLast(s, k);
|
s = VecEncryptLast(s, k);
|
||||||
s = VectorDecrypt(s, k);
|
s = VecDecrypt(s, k);
|
||||||
s = VectorDecryptLast(s, k);
|
s = VecDecryptLast(s, k);
|
||||||
VectorStore(s, r);
|
VecStore(s, r);
|
||||||
|
|
||||||
result = (0 != std::memcmp(r, z, 16));
|
result = (0 != std::memcmp(r, z, 16));
|
||||||
}
|
}
|
||||||
@ -697,17 +697,17 @@ static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, un
|
|||||||
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
||||||
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
||||||
|
|
||||||
uint32x4_p k = VectorLoad(keys);
|
uint32x4_p k = VecLoad(keys);
|
||||||
block = VectorXor(block, k);
|
block = VecXor(block, k);
|
||||||
|
|
||||||
for (size_t i=1; i<rounds-1; i+=2)
|
for (size_t i=1; i<rounds-1; i+=2)
|
||||||
{
|
{
|
||||||
block = VectorEncrypt(block, VectorLoad( i*16, keys));
|
block = VecEncrypt(block, VecLoad( i*16, keys));
|
||||||
block = VectorEncrypt(block, VectorLoad((i+1)*16, keys));
|
block = VecEncrypt(block, VecLoad((i+1)*16, keys));
|
||||||
}
|
}
|
||||||
|
|
||||||
block = VectorEncrypt(block, VectorLoad((rounds-1)*16, keys));
|
block = VecEncrypt(block, VecLoad((rounds-1)*16, keys));
|
||||||
block = VectorEncryptLast(block, VectorLoad(rounds*16, keys));
|
block = VecEncryptLast(block, VecLoad(rounds*16, keys));
|
||||||
}
|
}
|
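The subkey table holds (rounds+1) round keys of 16 bytes each, and the loop above walks it by byte offset: the whitening key at 0, paired middle rounds, then the last full round and the final round. A plain scalar sketch of the offsets touched for AES-128 (rounds = 10); no POWER hardware is needed to run it:

    // Print the byte offsets POWER8_Enc_Block reads for rounds = 10.
    #include <cstdio>

    int main()
    {
        const unsigned int rounds = 10;   // AES-128
        std::printf("whitening key at offset 0\n");
        for (unsigned int i = 1; i < rounds - 1; i += 2)
            std::printf("VecEncrypt rounds at offsets %u and %u\n",
                        i * 16, (i + 1) * 16);
        std::printf("VecEncrypt at offset %u, VecEncryptLast at offset %u\n",
                    (rounds - 1) * 16, rounds * 16);
        return 0;
    }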
||||||
|
|
||||||
static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||||
@ -717,32 +717,32 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
|||||||
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
||||||
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
||||||
|
|
||||||
uint32x4_p k = VectorLoad(keys);
|
uint32x4_p k = VecLoad(keys);
|
||||||
block0 = VectorXor(block0, k);
|
block0 = VecXor(block0, k);
|
||||||
block1 = VectorXor(block1, k);
|
block1 = VecXor(block1, k);
|
||||||
block2 = VectorXor(block2, k);
|
block2 = VecXor(block2, k);
|
||||||
block3 = VectorXor(block3, k);
|
block3 = VecXor(block3, k);
|
||||||
block4 = VectorXor(block4, k);
|
block4 = VecXor(block4, k);
|
||||||
block5 = VectorXor(block5, k);
|
block5 = VecXor(block5, k);
|
||||||
|
|
||||||
for (size_t i=1; i<rounds; ++i)
|
for (size_t i=1; i<rounds; ++i)
|
||||||
{
|
{
|
||||||
k = VectorLoad(i*16, keys);
|
k = VecLoad(i*16, keys);
|
||||||
block0 = VectorEncrypt(block0, k);
|
block0 = VecEncrypt(block0, k);
|
||||||
block1 = VectorEncrypt(block1, k);
|
block1 = VecEncrypt(block1, k);
|
||||||
block2 = VectorEncrypt(block2, k);
|
block2 = VecEncrypt(block2, k);
|
||||||
block3 = VectorEncrypt(block3, k);
|
block3 = VecEncrypt(block3, k);
|
||||||
block4 = VectorEncrypt(block4, k);
|
block4 = VecEncrypt(block4, k);
|
||||||
block5 = VectorEncrypt(block5, k);
|
block5 = VecEncrypt(block5, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
k = VectorLoad(rounds*16, keys);
|
k = VecLoad(rounds*16, keys);
|
||||||
block0 = VectorEncryptLast(block0, k);
|
block0 = VecEncryptLast(block0, k);
|
||||||
block1 = VectorEncryptLast(block1, k);
|
block1 = VecEncryptLast(block1, k);
|
||||||
block2 = VectorEncryptLast(block2, k);
|
block2 = VecEncryptLast(block2, k);
|
||||||
block3 = VectorEncryptLast(block3, k);
|
block3 = VecEncryptLast(block3, k);
|
||||||
block4 = VectorEncryptLast(block4, k);
|
block4 = VecEncryptLast(block4, k);
|
||||||
block5 = VectorEncryptLast(block5, k);
|
block5 = VecEncryptLast(block5, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
|
static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
|
||||||
@ -750,17 +750,17 @@ static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, un
|
|||||||
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
||||||
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
||||||
|
|
||||||
uint32x4_p k = VectorLoad(rounds*16, keys);
|
uint32x4_p k = VecLoad(rounds*16, keys);
|
||||||
block = VectorXor(block, k);
|
block = VecXor(block, k);
|
||||||
|
|
||||||
for (size_t i=rounds-1; i>1; i-=2)
|
for (size_t i=rounds-1; i>1; i-=2)
|
||||||
{
|
{
|
||||||
block = VectorDecrypt(block, VectorLoad( i*16, keys));
|
block = VecDecrypt(block, VecLoad( i*16, keys));
|
||||||
block = VectorDecrypt(block, VectorLoad((i-1)*16, keys));
|
block = VecDecrypt(block, VecLoad((i-1)*16, keys));
|
||||||
}
|
}
|
||||||
|
|
||||||
block = VectorDecrypt(block, VectorLoad(16, keys));
|
block = VecDecrypt(block, VecLoad(16, keys));
|
||||||
block = VectorDecryptLast(block, VectorLoad(0, keys));
|
block = VecDecryptLast(block, VecLoad(0, keys));
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||||
@ -770,32 +770,32 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
|||||||
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
|
||||||
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
const byte *keys = reinterpret_cast<const byte*>(subkeys);
|
||||||
|
|
||||||
uint32x4_p k = VectorLoad(rounds*16, keys);
|
uint32x4_p k = VecLoad(rounds*16, keys);
|
||||||
block0 = VectorXor(block0, k);
|
block0 = VecXor(block0, k);
|
||||||
block1 = VectorXor(block1, k);
|
block1 = VecXor(block1, k);
|
||||||
block2 = VectorXor(block2, k);
|
block2 = VecXor(block2, k);
|
||||||
block3 = VectorXor(block3, k);
|
block3 = VecXor(block3, k);
|
||||||
block4 = VectorXor(block4, k);
|
block4 = VecXor(block4, k);
|
||||||
block5 = VectorXor(block5, k);
|
block5 = VecXor(block5, k);
|
||||||
|
|
||||||
for (size_t i=rounds-1; i>0; --i)
|
for (size_t i=rounds-1; i>0; --i)
|
||||||
{
|
{
|
||||||
k = VectorLoad(i*16, keys);
|
k = VecLoad(i*16, keys);
|
||||||
block0 = VectorDecrypt(block0, k);
|
block0 = VecDecrypt(block0, k);
|
||||||
block1 = VectorDecrypt(block1, k);
|
block1 = VecDecrypt(block1, k);
|
||||||
block2 = VectorDecrypt(block2, k);
|
block2 = VecDecrypt(block2, k);
|
||||||
block3 = VectorDecrypt(block3, k);
|
block3 = VecDecrypt(block3, k);
|
||||||
block4 = VectorDecrypt(block4, k);
|
block4 = VecDecrypt(block4, k);
|
||||||
block5 = VectorDecrypt(block5, k);
|
block5 = VecDecrypt(block5, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
k = VectorLoad(0, keys);
|
k = VecLoad(0, keys);
|
||||||
block0 = VectorDecryptLast(block0, k);
|
block0 = VecDecryptLast(block0, k);
|
||||||
block1 = VectorDecryptLast(block1, k);
|
block1 = VecDecryptLast(block1, k);
|
||||||
block2 = VectorDecryptLast(block2, k);
|
block2 = VecDecryptLast(block2, k);
|
||||||
block3 = VectorDecryptLast(block3, k);
|
block3 = VecDecryptLast(block3, k);
|
||||||
block4 = VectorDecryptLast(block4, k);
|
block4 = VecDecryptLast(block4, k);
|
||||||
block5 = VectorDecryptLast(block5, k);
|
block5 = VecDecryptLast(block5, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
ANONYMOUS_NAMESPACE_END
|
ANONYMOUS_NAMESPACE_END
|
||||||
@ -851,14 +851,14 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32*
|
|||||||
{
|
{
|
||||||
const uint8x16_p d1 = vec_vsx_ld( 0, (uint8_t*)rkey);
|
const uint8x16_p d1 = vec_vsx_ld( 0, (uint8_t*)rkey);
|
||||||
const uint8x16_p d2 = vec_vsx_ld(16, (uint8_t*)rkey);
|
const uint8x16_p d2 = vec_vsx_ld(16, (uint8_t*)rkey);
|
||||||
vec_vsx_st(vec_perm(d1, zero, mask), 0, (uint8_t*)rkey);
|
vec_vsx_st(VecPermute(d1, zero, mask), 0, (uint8_t*)rkey);
|
||||||
vec_vsx_st(vec_perm(d2, zero, mask), 16, (uint8_t*)rkey);
|
vec_vsx_st(VecPermute(d2, zero, mask), 16, (uint8_t*)rkey);
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( ; i<rounds+1; i++, rkey+=4)
|
for ( ; i<rounds+1; i++, rkey+=4)
|
||||||
{
|
{
|
||||||
const uint8x16_p d = vec_vsx_ld( 0, (uint8_t*)rkey);
|
const uint8x16_p d = vec_vsx_ld( 0, (uint8_t*)rkey);
|
||||||
vec_vsx_st(vec_perm(d, zero, mask), 0, (uint8_t*)rkey);
|
vec_vsx_st(VecPermute(d, zero, mask), 0, (uint8_t*)rkey);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
290
sha_simd.cpp
290
sha_simd.cpp
@ -224,11 +224,11 @@ bool CPU_ProbeSHA256()
|
|||||||
byte r[16], z[16] = {0};
|
byte r[16], z[16] = {0};
|
||||||
uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
|
uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
|
||||||
|
|
||||||
x = VectorSHA256<0,0>(x);
|
x = VecSHA256<0,0>(x);
|
||||||
x = VectorSHA256<0,1>(x);
|
x = VecSHA256<0,1>(x);
|
||||||
x = VectorSHA256<1,0>(x);
|
x = VecSHA256<1,0>(x);
|
||||||
x = VectorSHA256<1,1>(x);
|
x = VecSHA256<1,1>(x);
|
||||||
VectorStore(x, r);
|
VecStore(x, r);
|
||||||
|
|
||||||
result = (0 == std::memcmp(r, z, 16));
|
result = (0 == std::memcmp(r, z, 16));
|
||||||
}
|
}
|
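VecSHA256 applies one of the four SHA-256 sigma functions to every 32-bit lane via vshasigmaw; the probe only cares that the instruction executes, not which function each template pair selects, and that mapping is not asserted here. For reference, a scalar sketch of the FIPS 180-4 definitions the instruction accelerates (runs anywhere, no POWER hardware required):

    // Scalar reference for the SHA-256 sigma functions.
    #include <cstdint>
    #include <cstdio>

    static inline uint32_t rotr32(uint32_t x, unsigned int n)
    {
        return (x >> n) | (x << (32 - n));
    }

    int main()
    {
        const uint32_t x = 0x6A09E667;  // sample word (SHA-256 IV word 0)

        const uint32_t s0 = rotr32(x, 7)  ^ rotr32(x, 18) ^ (x >> 3);      // sigma0
        const uint32_t s1 = rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10);     // sigma1
        const uint32_t S0 = rotr32(x, 2)  ^ rotr32(x, 13) ^ rotr32(x, 22); // Sigma0
        const uint32_t S1 = rotr32(x, 6)  ^ rotr32(x, 11) ^ rotr32(x, 25); // Sigma1

        std::printf("sigma0=%08x sigma1=%08x Sigma0=%08x Sigma1=%08x\n",
                    s0, s1, S0, S1);
        return 0;
    }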
||||||
@ -268,11 +268,11 @@ bool CPU_ProbeSHA512()
|
|||||||
byte r[16], z[16] = {0};
|
byte r[16], z[16] = {0};
|
||||||
uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
|
uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});
|
||||||
|
|
||||||
x = VectorSHA512<0,0>(x);
|
x = VecSHA512<0,0>(x);
|
||||||
x = VectorSHA512<0,1>(x);
|
x = VecSHA512<0,1>(x);
|
||||||
x = VectorSHA512<1,0>(x);
|
x = VecSHA512<1,0>(x);
|
||||||
x = VectorSHA512<1,1>(x);
|
x = VecSHA512<1,1>(x);
|
||||||
VectorStore(x, r);
|
VecStore(x, r);
|
||||||
|
|
||||||
result = (0 == std::memcmp(r, z, 16));
|
result = (0 == std::memcmp(r, z, 16));
|
||||||
}
|
}
|
||||||
@ -1091,7 +1091,7 @@ typedef __vector unsigned long long uint64x2_p8;
|
|||||||
|
|
||||||
// Unaligned load
|
// Unaligned load
|
||||||
template <class T> static inline
|
template <class T> static inline
|
||||||
uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
|
uint32x4_p8 VecLoad32x4u(const T* data, int offset)
|
||||||
{
|
{
|
||||||
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
return (uint32x4_p8)vec_xl(offset, (uint8_t*)data);
|
return (uint32x4_p8)vec_xl(offset, (uint8_t*)data);
|
||||||
@ -1102,7 +1102,7 @@ uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
|
|||||||
|
|
||||||
// Unaligned store
|
// Unaligned store
|
||||||
template <class T> static inline
|
template <class T> static inline
|
||||||
void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
|
void VecStore32x4u(const uint32x4_p8 val, T* data, int offset)
|
||||||
{
|
{
|
||||||
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
|
||||||
vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
|
vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
|
||||||
@@ -1114,14 +1114,14 @@ void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
 // Unaligned load of a user message. The load is big-endian,
 // and then the message is permuted for 32-bit words.
 template <class T> static inline
-uint32x4_p8 VectorLoadMsg32x4(const T* data, int offset)
+uint32x4_p8 VecLoadMsg32x4(const T* data, int offset)
 {
 #if (CRYPTOPP_LITTLE_ENDIAN)
     const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
-    const uint32x4_p8 r = VectorLoad32x4u(data, offset);
-    return (uint32x4_p8)vec_perm(r, r, mask);
+    const uint32x4_p8 r = VecLoad32x4u(data, offset);
+    return (uint32x4_p8)VecPermute(r, r, mask);
 #else
-    return VectorLoad32x4u(data, offset);
+    return VecLoad32x4u(data, offset);
 #endif
 }

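The little-endian branch exists because SHA-256 treats the message as big-endian 32-bit words: the unaligned load brings the bytes in as they sit in memory, and the {3,2,1,0, ...} mask then byte-reverses each 32-bit lane so the words land in host order. A minimal usage sketch (the message buffer is hypothetical):

    // Hypothetical 16-byte message fragment ("abc" plus padding).  VecLoadMsg32x4
    // returns the four big-endian message words w0..w3 in a form the round code
    // can combine directly with the round constants, regardless of host endianness.
    const byte msg[16] = {0x61,0x62,0x63,0x80, 0,0,0,0, 0,0,0,0, 0,0,0,0};
    const uint32x4_p8 w0123 = VecLoadMsg32x4(msg, 0);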
@@ -1136,7 +1136,7 @@ static inline
 uint32x4_p8 VectorMaj(const uint32x4_p8 x, const uint32x4_p8 y, const uint32x4_p8 z)
 {
     // The trick below is due to Andy Polyakov and Jack Lloyd
-    return vec_sel(y, z, vec_xor(x, y));
+    return vec_sel(y, z, VecXor(x, y));
 }

 static inline
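The select trick reads less obviously than the textbook Maj(x,y,z) = (x AND y) XOR (x AND z) XOR (y AND z), but it is the same function: where x and y agree the answer is that shared bit (take y), and where they disagree z casts the deciding vote (take z). A scalar sketch that checks the identity exhaustively over a few bits:

    #include <cassert>
    #include <cstdint>

    // Scalar check of the Maj identity used above:
    //   Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z) = sel(y, z, x ^ y)
    // where sel(a, b, m) takes bits of b where m is 1 and bits of a where m is 0.
    static inline uint32_t sel(uint32_t a, uint32_t b, uint32_t m) { return (a & ~m) | (b & m); }

    int main()
    {
        for (uint32_t x = 0; x < 8; ++x)
            for (uint32_t y = 0; y < 8; ++y)
                for (uint32_t z = 0; z < 8; ++z)
                    assert(((x & y) ^ (x & z) ^ (y & z)) == sel(y, z, x ^ y));
        return 0;
    }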
@@ -1185,7 +1185,7 @@ uint32x4_p8 VectorPack(const uint32x4_p8 a, const uint32x4_p8 b,
 {
     const uint8x16_p8 m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
     const uint8x16_p8 m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-    return vec_perm(vec_perm(a,b,m1), vec_perm(c,d,m1), m2);
+    return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
 }

 template <unsigned int R> static inline
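VectorPack gathers the low word of each of the four inputs into a single vector; this is how the per-round working variables S[A]..S[D] (and S[E]..S[H]) are folded back into the packed abcd/efgh chaining vectors. A hedged re-statement with explicit temporaries (the helper name is illustrative; byte indices 0..15 select from the first operand and 16..31 from the second, in big-endian element numbering):

    // Equivalent to the one-line VectorPack above, shown step by step.
    static inline uint32x4_p8 VecPackLowWords(const uint32x4_p8 a, const uint32x4_p8 b,
                                              const uint32x4_p8 c, const uint32x4_p8 d)
    {
        const uint8x16_p8 m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
        const uint8x16_p8 m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
        const uint32x4_p8 ab = VecPermute(a, b, m1);   // { a0, b0, -, - }
        const uint32x4_p8 cd = VecPermute(c, d, m1);   // { c0, d0, -, - }
        return VecPermute(ab, cd, m2);                 // { a0, b0, c0, d0 }
    }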
@@ -1231,8 +1231,8 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
     const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
     const uint32_t* m = reinterpret_cast<const uint32_t*>(data);

-    uint32x4_p8 abcd = VectorLoad32x4u(state+0, 0);
-    uint32x4_p8 efgh = VectorLoad32x4u(state+4, 0);
+    uint32x4_p8 abcd = VecLoad32x4u(state+0, 0);
+    uint32x4_p8 efgh = VecLoad32x4u(state+4, 0);
     uint32x4_p8 W[16], S[8], vm, vk;

     size_t blocks = length / SHA256::BLOCKSIZE;
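For reference while reading the vectorized rounds that follow, the scalar SHA-256 round that the SHA256_ROUND1/SHA256_ROUND2 templates implement is the standard FIPS 180-4 step; this sketch is from the specification, not from this file:

    #include <cstdint>

    static inline uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

    // One scalar SHA-256 round, for orientation only.
    // S[0..7] = {a,b,c,d,e,f,g,h}, k = round constant, w = message word.
    static inline void SHA256_ScalarRound(uint32_t S[8], uint32_t k, uint32_t w)
    {
        const uint32_t Sigma1 = rotr32(S[4],6) ^ rotr32(S[4],11) ^ rotr32(S[4],25);
        const uint32_t Ch     = (S[4] & S[5]) ^ (~S[4] & S[6]);
        const uint32_t T1     = S[7] + Sigma1 + Ch + k + w;
        const uint32_t Sigma0 = rotr32(S[0],2) ^ rotr32(S[0],13) ^ rotr32(S[0],22);
        const uint32_t Maj    = (S[0] & S[1]) ^ (S[0] & S[2]) ^ (S[1] & S[2]);
        const uint32_t T2     = Sigma0 + Maj;
        for (int i = 7; i > 0; --i) S[i] = S[i-1];  // h<-g, g<-f, ..., b<-a
        S[4] += T1;       // e = d + T1 (S[4] holds the old d after the shift)
        S[0]  = T1 + T2;  // a = T1 + T2
    }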
@ -1241,80 +1241,80 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
|
|||||||
unsigned int offset=0;
|
unsigned int offset=0;
|
||||||
|
|
||||||
S[A] = abcd; S[E] = efgh;
|
S[A] = abcd; S[E] = efgh;
|
||||||
S[B] = VectorShiftLeftOctet<4>(S[A]);
|
S[B] = VecShiftLeftOctet<4>(S[A]);
|
||||||
S[F] = VectorShiftLeftOctet<4>(S[E]);
|
S[F] = VecShiftLeftOctet<4>(S[E]);
|
||||||
S[C] = VectorShiftLeftOctet<4>(S[B]);
|
S[C] = VecShiftLeftOctet<4>(S[B]);
|
||||||
S[G] = VectorShiftLeftOctet<4>(S[F]);
|
S[G] = VecShiftLeftOctet<4>(S[F]);
|
||||||
S[D] = VectorShiftLeftOctet<4>(S[C]);
|
S[D] = VecShiftLeftOctet<4>(S[C]);
|
||||||
S[H] = VectorShiftLeftOctet<4>(S[G]);
|
S[H] = VecShiftLeftOctet<4>(S[G]);
|
||||||
|
|
||||||
// Rounds 0-16
|
// Rounds 0-16
|
||||||
vk = VectorLoad32x4u(k, offset);
|
vk = VecLoad32x4u(k, offset);
|
||||||
vm = VectorLoadMsg32x4(m, offset);
|
vm = VecLoadMsg32x4(m, offset);
|
||||||
SHA256_ROUND1<0>(W,S, vk,vm);
|
SHA256_ROUND1<0>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<1>(W,S, vk,vm);
|
SHA256_ROUND1<1>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<2>(W,S, vk,vm);
|
SHA256_ROUND1<2>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<3>(W,S, vk,vm);
|
SHA256_ROUND1<3>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorLoad32x4u(k, offset);
|
vk = VecLoad32x4u(k, offset);
|
||||||
vm = VectorLoadMsg32x4(m, offset);
|
vm = VecLoadMsg32x4(m, offset);
|
||||||
SHA256_ROUND1<4>(W,S, vk,vm);
|
SHA256_ROUND1<4>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<5>(W,S, vk,vm);
|
SHA256_ROUND1<5>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<6>(W,S, vk,vm);
|
SHA256_ROUND1<6>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<7>(W,S, vk,vm);
|
SHA256_ROUND1<7>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorLoad32x4u(k, offset);
|
vk = VecLoad32x4u(k, offset);
|
||||||
vm = VectorLoadMsg32x4(m, offset);
|
vm = VecLoadMsg32x4(m, offset);
|
||||||
SHA256_ROUND1<8>(W,S, vk,vm);
|
SHA256_ROUND1<8>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<9>(W,S, vk,vm);
|
SHA256_ROUND1<9>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<10>(W,S, vk,vm);
|
SHA256_ROUND1<10>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<11>(W,S, vk,vm);
|
SHA256_ROUND1<11>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorLoad32x4u(k, offset);
|
vk = VecLoad32x4u(k, offset);
|
||||||
vm = VectorLoadMsg32x4(m, offset);
|
vm = VecLoadMsg32x4(m, offset);
|
||||||
SHA256_ROUND1<12>(W,S, vk,vm);
|
SHA256_ROUND1<12>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<13>(W,S, vk,vm);
|
SHA256_ROUND1<13>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<14>(W,S, vk,vm);
|
SHA256_ROUND1<14>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<4>(vk);
|
vk = VecShiftLeftOctet<4>(vk);
|
||||||
vm = VectorShiftLeftOctet<4>(vm);
|
vm = VecShiftLeftOctet<4>(vm);
|
||||||
SHA256_ROUND1<15>(W,S, vk,vm);
|
SHA256_ROUND1<15>(W,S, vk,vm);
|
||||||
|
|
||||||
m += 16; // 32-bit words, not bytes
|
m += 16; // 32-bit words, not bytes
|
||||||
@ -1322,32 +1322,32 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
|
|||||||
// Rounds 16-64
|
// Rounds 16-64
|
||||||
for (unsigned int i=16; i<64; i+=16)
|
for (unsigned int i=16; i<64; i+=16)
|
||||||
{
|
{
|
||||||
vk = VectorLoad32x4u(k, offset);
|
vk = VecLoad32x4u(k, offset);
|
||||||
SHA256_ROUND2<0>(W,S, vk);
|
SHA256_ROUND2<0>(W,S, vk);
|
||||||
SHA256_ROUND2<1>(W,S, VectorShiftLeftOctet<4>(vk));
|
SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
|
||||||
SHA256_ROUND2<2>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
SHA256_ROUND2<3>(W,S, VectorShiftLeftOctet<12>(vk));
|
SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorLoad32x4u(k, offset);
|
vk = VecLoad32x4u(k, offset);
|
||||||
SHA256_ROUND2<4>(W,S, vk);
|
SHA256_ROUND2<4>(W,S, vk);
|
||||||
SHA256_ROUND2<5>(W,S, VectorShiftLeftOctet<4>(vk));
|
SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
|
||||||
SHA256_ROUND2<6>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
SHA256_ROUND2<7>(W,S, VectorShiftLeftOctet<12>(vk));
|
SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorLoad32x4u(k, offset);
|
vk = VecLoad32x4u(k, offset);
|
||||||
SHA256_ROUND2<8>(W,S, vk);
|
SHA256_ROUND2<8>(W,S, vk);
|
||||||
SHA256_ROUND2<9>(W,S, VectorShiftLeftOctet<4>(vk));
|
SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
|
||||||
SHA256_ROUND2<10>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
SHA256_ROUND2<11>(W,S, VectorShiftLeftOctet<12>(vk));
|
SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorLoad32x4u(k, offset);
|
vk = VecLoad32x4u(k, offset);
|
||||||
SHA256_ROUND2<12>(W,S, vk);
|
SHA256_ROUND2<12>(W,S, vk);
|
||||||
SHA256_ROUND2<13>(W,S, VectorShiftLeftOctet<4>(vk));
|
SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
|
||||||
SHA256_ROUND2<14>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
SHA256_ROUND2<15>(W,S, VectorShiftLeftOctet<12>(vk));
|
SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1355,19 +1355,19 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
         efgh += VectorPack(S[E],S[F],S[G],S[H]);
     }

-    VectorStore32x4u(abcd, state+0, 0);
-    VectorStore32x4u(efgh, state+4, 0);
+    VecStore32x4u(abcd, state+0, 0);
+    VecStore32x4u(efgh, state+4, 0);
 }

 static inline
-uint64x2_p8 VectorPermute64x2(const uint64x2_p8 val, const uint8x16_p8 mask)
+uint64x2_p8 VecPermute64x2(const uint64x2_p8 val, const uint8x16_p8 mask)
 {
-    return (uint64x2_p8)vec_perm(val, val, mask);
+    return (uint64x2_p8)VecPermute(val, val, mask);
 }

 // Unaligned load
 template <class T> static inline
-uint64x2_p8 VectorLoad64x2u(const T* data, int offset)
+uint64x2_p8 VecLoad64x2u(const T* data, int offset)
 {
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
     return (uint64x2_p8)vec_xl(offset, (uint8_t*)data);
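The #else branch of these unaligned load/store wrappers falls outside the hunk; on GCC the usual alternative is the VSX built-ins, roughly as below (an assumption based on the same vec_vsx_ld/vec_vsx_st pattern used elsewhere in the library, e.g. the key schedule code earlier in this commit; the _GCC suffixes are illustrative):

    // Hedged sketch of the non-XLC/Clang path that the hunk elides.
    template <class T> static inline
    uint64x2_p8 VecLoad64x2u_GCC(const T* data, int offset)
    {
        return (uint64x2_p8)vec_vsx_ld(offset, (const uint8_t*)data);
    }

    template <class T> static inline
    void VecStore64x2u_GCC(const uint64x2_p8 val, T* data, int offset)
    {
        vec_vsx_st((uint8x16_p8)val, offset, (uint8_t*)data);
    }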
@@ -1378,7 +1378,7 @@ uint64x2_p8 VectorLoad64x2u(const T* data, int offset)

 // Unaligned store
 template <class T> static inline
-void VectorStore64x2u(const uint64x2_p8 val, T* data, int offset)
+void VecStore64x2u(const uint64x2_p8 val, T* data, int offset)
 {
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
     vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
@@ -1390,13 +1390,13 @@ void VectorStore64x2u(const uint64x2_p8 val, T* data, int offset)
 // Unaligned load of a user message. The load is big-endian,
 // and then the message is permuted for 32-bit words.
 template <class T> static inline
-uint64x2_p8 VectorLoadMsg64x2(const T* data, int offset)
+uint64x2_p8 VecLoadMsg64x2(const T* data, int offset)
 {
 #if (CRYPTOPP_LITTLE_ENDIAN)
     const uint8x16_p8 mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
-    return VectorPermute64x2(VectorLoad64x2u(data, offset), mask);
+    return VecPermute64x2(VecLoad64x2u(data, offset), mask);
 #else
-    return VectorLoad64x2u(data, offset);
+    return VecLoad64x2u(data, offset);
 #endif
 }

@@ -1411,7 +1411,7 @@ static inline
 uint64x2_p8 VectorMaj(const uint64x2_p8 x, const uint64x2_p8 y, const uint64x2_p8 z)
 {
     // The trick below is due to Andy Polyakov and Jack Lloyd
-    return vec_sel(y, z, vec_xor(x, y));
+    return vec_sel(y, z, VecXor(x, y));
 }

 static inline
@@ -1458,7 +1458,7 @@ static inline
 uint64x2_p8 VectorPack(const uint64x2_p8 x, const uint64x2_p8 y)
 {
     const uint8x16_p8 m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-    return vec_perm(x,y,m);
+    return VecPermute(x,y,m);
 }

 template <unsigned int R> static inline
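With 64-bit lanes the packing is simpler than the 32-bit case: the mask {0..7, 16..23} takes the first doubleword of each operand, so VectorPack(S[A], S[B]) reassembles the packed ab chaining vector. A tiny illustration (S[] and the A/B indices refer to the surrounding SHA-512 code):

    // Illustration only: recombine two working variables into one state vector.
    //   VectorPack(S[A], S[B]) -> { a, b }   (first doubleword of each input)
    const uint64x2_p8 ab_new = VectorPack(S[A], S[B]);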
@@ -1504,10 +1504,10 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
     const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
     const uint64_t* m = reinterpret_cast<const uint64_t*>(data);

-    uint64x2_p8 ab = VectorLoad64x2u(state+0, 0);
-    uint64x2_p8 cd = VectorLoad64x2u(state+2, 0);
-    uint64x2_p8 ef = VectorLoad64x2u(state+4, 0);
-    uint64x2_p8 gh = VectorLoad64x2u(state+6, 0);
+    uint64x2_p8 ab = VecLoad64x2u(state+0, 0);
+    uint64x2_p8 cd = VecLoad64x2u(state+2, 0);
+    uint64x2_p8 ef = VecLoad64x2u(state+4, 0);
+    uint64x2_p8 gh = VecLoad64x2u(state+6, 0);
     uint64x2_p8 W[16], S[8], vm, vk;

     size_t blocks = length / SHA512::BLOCKSIZE;
@ -1517,82 +1517,82 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
|
|||||||
|
|
||||||
S[A] = ab; S[C] = cd;
|
S[A] = ab; S[C] = cd;
|
||||||
S[E] = ef; S[G] = gh;
|
S[E] = ef; S[G] = gh;
|
||||||
S[B] = VectorShiftLeftOctet<8>(S[A]);
|
S[B] = VecShiftLeftOctet<8>(S[A]);
|
||||||
S[D] = VectorShiftLeftOctet<8>(S[C]);
|
S[D] = VecShiftLeftOctet<8>(S[C]);
|
||||||
S[F] = VectorShiftLeftOctet<8>(S[E]);
|
S[F] = VecShiftLeftOctet<8>(S[E]);
|
||||||
S[H] = VectorShiftLeftOctet<8>(S[G]);
|
S[H] = VecShiftLeftOctet<8>(S[G]);
|
||||||
|
|
||||||
// Rounds 0-16
|
// Rounds 0-16
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
vm = VectorLoadMsg64x2(m, offset);
|
vm = VecLoadMsg64x2(m, offset);
|
||||||
SHA512_ROUND1<0>(W,S, vk,vm);
|
SHA512_ROUND1<0>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<8>(vk);
|
vk = VecShiftLeftOctet<8>(vk);
|
||||||
vm = VectorShiftLeftOctet<8>(vm);
|
vm = VecShiftLeftOctet<8>(vm);
|
||||||
SHA512_ROUND1<1>(W,S, vk,vm);
|
SHA512_ROUND1<1>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
vm = VectorLoadMsg64x2(m, offset);
|
vm = VecLoadMsg64x2(m, offset);
|
||||||
SHA512_ROUND1<2>(W,S, vk,vm);
|
SHA512_ROUND1<2>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<8>(vk);
|
vk = VecShiftLeftOctet<8>(vk);
|
||||||
vm = VectorShiftLeftOctet<8>(vm);
|
vm = VecShiftLeftOctet<8>(vm);
|
||||||
SHA512_ROUND1<3>(W,S, vk,vm);
|
SHA512_ROUND1<3>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
vm = VectorLoadMsg64x2(m, offset);
|
vm = VecLoadMsg64x2(m, offset);
|
||||||
SHA512_ROUND1<4>(W,S, vk,vm);
|
SHA512_ROUND1<4>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<8>(vk);
|
vk = VecShiftLeftOctet<8>(vk);
|
||||||
vm = VectorShiftLeftOctet<8>(vm);
|
vm = VecShiftLeftOctet<8>(vm);
|
||||||
SHA512_ROUND1<5>(W,S, vk,vm);
|
SHA512_ROUND1<5>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
vm = VectorLoadMsg64x2(m, offset);
|
vm = VecLoadMsg64x2(m, offset);
|
||||||
SHA512_ROUND1<6>(W,S, vk,vm);
|
SHA512_ROUND1<6>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<8>(vk);
|
vk = VecShiftLeftOctet<8>(vk);
|
||||||
vm = VectorShiftLeftOctet<8>(vm);
|
vm = VecShiftLeftOctet<8>(vm);
|
||||||
SHA512_ROUND1<7>(W,S, vk,vm);
|
SHA512_ROUND1<7>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
vm = VectorLoadMsg64x2(m, offset);
|
vm = VecLoadMsg64x2(m, offset);
|
||||||
SHA512_ROUND1<8>(W,S, vk,vm);
|
SHA512_ROUND1<8>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<8>(vk);
|
vk = VecShiftLeftOctet<8>(vk);
|
||||||
vm = VectorShiftLeftOctet<8>(vm);
|
vm = VecShiftLeftOctet<8>(vm);
|
||||||
SHA512_ROUND1<9>(W,S, vk,vm);
|
SHA512_ROUND1<9>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
vm = VectorLoadMsg64x2(m, offset);
|
vm = VecLoadMsg64x2(m, offset);
|
||||||
SHA512_ROUND1<10>(W,S, vk,vm);
|
SHA512_ROUND1<10>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<8>(vk);
|
vk = VecShiftLeftOctet<8>(vk);
|
||||||
vm = VectorShiftLeftOctet<8>(vm);
|
vm = VecShiftLeftOctet<8>(vm);
|
||||||
SHA512_ROUND1<11>(W,S, vk,vm);
|
SHA512_ROUND1<11>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
vm = VectorLoadMsg64x2(m, offset);
|
vm = VecLoadMsg64x2(m, offset);
|
||||||
SHA512_ROUND1<12>(W,S, vk,vm);
|
SHA512_ROUND1<12>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<8>(vk);
|
vk = VecShiftLeftOctet<8>(vk);
|
||||||
vm = VectorShiftLeftOctet<8>(vm);
|
vm = VecShiftLeftOctet<8>(vm);
|
||||||
SHA512_ROUND1<13>(W,S, vk,vm);
|
SHA512_ROUND1<13>(W,S, vk,vm);
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
vm = VectorLoadMsg64x2(m, offset);
|
vm = VecLoadMsg64x2(m, offset);
|
||||||
SHA512_ROUND1<14>(W,S, vk,vm);
|
SHA512_ROUND1<14>(W,S, vk,vm);
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorShiftLeftOctet<8>(vk);
|
vk = VecShiftLeftOctet<8>(vk);
|
||||||
vm = VectorShiftLeftOctet<8>(vm);
|
vm = VecShiftLeftOctet<8>(vm);
|
||||||
SHA512_ROUND1<15>(W,S, vk,vm);
|
SHA512_ROUND1<15>(W,S, vk,vm);
|
||||||
|
|
||||||
m += 16; // 64-bit words, not bytes
|
m += 16; // 64-bit words, not bytes
|
||||||
@ -1600,44 +1600,44 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
|
|||||||
// Rounds 16-80
|
// Rounds 16-80
|
||||||
for (unsigned int i=16; i<80; i+=16)
|
for (unsigned int i=16; i<80; i+=16)
|
||||||
{
|
{
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
SHA512_ROUND2<0>(W,S, vk);
|
SHA512_ROUND2<0>(W,S, vk);
|
||||||
SHA512_ROUND2<1>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
SHA512_ROUND2<2>(W,S, vk);
|
SHA512_ROUND2<2>(W,S, vk);
|
||||||
SHA512_ROUND2<3>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
SHA512_ROUND2<4>(W,S, vk);
|
SHA512_ROUND2<4>(W,S, vk);
|
||||||
SHA512_ROUND2<5>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
SHA512_ROUND2<6>(W,S, vk);
|
SHA512_ROUND2<6>(W,S, vk);
|
||||||
SHA512_ROUND2<7>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
SHA512_ROUND2<8>(W,S, vk);
|
SHA512_ROUND2<8>(W,S, vk);
|
||||||
SHA512_ROUND2<9>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
SHA512_ROUND2<10>(W,S, vk);
|
SHA512_ROUND2<10>(W,S, vk);
|
||||||
SHA512_ROUND2<11>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
SHA512_ROUND2<12>(W,S, vk);
|
SHA512_ROUND2<12>(W,S, vk);
|
||||||
SHA512_ROUND2<13>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
|
|
||||||
vk = VectorLoad64x2u(k, offset);
|
vk = VecLoad64x2u(k, offset);
|
||||||
SHA512_ROUND2<14>(W,S, vk);
|
SHA512_ROUND2<14>(W,S, vk);
|
||||||
SHA512_ROUND2<15>(W,S, VectorShiftLeftOctet<8>(vk));
|
SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
|
||||||
offset+=16;
|
offset+=16;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1647,10 +1647,10 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
         gh += VectorPack(S[G],S[H]);
     }

-    VectorStore64x2u(ab, state+0, 0);
-    VectorStore64x2u(cd, state+2, 0);
-    VectorStore64x2u(ef, state+4, 0);
-    VectorStore64x2u(gh, state+6, 0);
+    VecStore64x2u(ab, state+0, 0);
+    VecStore64x2u(cd, state+2, 0);
+    VecStore64x2u(ef, state+4, 0);
+    VecStore64x2u(gh, state+6, 0);
 }

 #endif  // CRYPTOPP_POWER8_SHA_AVAILABLE
@@ -548,8 +548,9 @@ using CryptoPP::uint8x16_p;
 using CryptoPP::uint32x4_p;
 using CryptoPP::uint64x2_p;

-using CryptoPP::VectorAnd;
-using CryptoPP::VectorXor;
+using CryptoPP::VecAnd;
+using CryptoPP::VecXor;
+using CryptoPP::VecPermute;

 // Rotate left by bit count
 template<unsigned int C>
@@ -569,8 +570,8 @@ CRYPTOPP_INLINE uint64x2_p RotateRight64(const uint64x2_p val)

 CRYPTOPP_INLINE uint64x2_p SIMON128_f(const uint64x2_p val)
 {
-    return VectorXor(RotateLeft64<2>(val),
-        VectorAnd(RotateLeft64<1>(val), RotateLeft64<8>(val)));
+    return VecXor(RotateLeft64<2>(val),
+        VecAnd(RotateLeft64<1>(val), RotateLeft64<8>(val)));
 }

 CRYPTOPP_INLINE void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
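SIMON128_f is the SIMON round function f(x) = (x<<<2) XOR ((x<<<1) AND (x<<<8)), and each loop iteration below applies two Feistel-style half-rounds, one to each half of the block. A scalar reference sketch with the same update order as the vector loop:

    #include <cstdint>

    static inline uint64_t rotl64(uint64_t x, unsigned n) { return (x << n) | (x >> (64 - n)); }

    // Scalar reference of the SIMON128 round function and one "double round",
    // mirroring the two VecXor updates per loop iteration in the code above.
    static inline uint64_t simon128_f(uint64_t x)
    {
        return rotl64(x, 2) ^ (rotl64(x, 1) & rotl64(x, 8));
    }

    static inline void simon128_double_round(uint64_t &x, uint64_t &y, uint64_t k1, uint64_t k2)
    {
        y ^= simon128_f(x) ^ k1;   // y = y ^ f(x) ^ rk1
        x ^= simon128_f(y) ^ k2;   // x = x ^ f(y) ^ rk2
    }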
@@ -584,22 +585,22 @@ CRYPTOPP_INLINE void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys
 #endif

     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1);
-    uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2);
+    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
+    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

     for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
     {
         const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
-        y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk1);
+        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);

         const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
-        x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk2);
+        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
     }

     if (rounds & 1)
     {
         const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
-        y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk);
+        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
         std::swap(x1, y1);
     }

@@ -612,7 +613,7 @@ CRYPTOPP_INLINE void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys
 #endif

     // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-    block = (uint32x4_p)vec_perm(x1, y1, m3);
+    block = (uint32x4_p)VecPermute(x1, y1, m3);
 }

 CRYPTOPP_INLINE void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
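A minimal, hypothetical caller for the single-block path; the byte-level load/store helpers named here are assumptions for illustration, and the real entry point is the library's block-cipher dispatch rather than this free function:

    // Hypothetical caller sketch: encrypt one 16-byte block in place.
    void Simon128EncryptOneBlock(byte *io, const word64 *subkeys, unsigned int rounds)
    {
        uint32x4_p block = VecLoad(io);            // assumes a byte-oriented VecLoad overload
        SIMON128_Enc_Block(block, subkeys, rounds);
        VecStore(block, io);                       // assumed counterpart store
    }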
@ -626,24 +627,24 @@ CRYPTOPP_INLINE void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||||
uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1);
|
uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
|
||||||
uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2);
|
uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);
|
||||||
|
|
||||||
if (rounds & 1)
|
if (rounds & 1)
|
||||||
{
|
{
|
||||||
std::swap(x1, y1);
|
std::swap(x1, y1);
|
||||||
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
|
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
|
||||||
y1 = VectorXor(VectorXor(y1, rk), SIMON128_f(x1));
|
y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1));
|
||||||
rounds--;
|
rounds--;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||||
{
|
{
|
||||||
const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
|
const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
|
||||||
x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk1);
|
x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1);
|
||||||
|
|
||||||
const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
|
const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
|
||||||
y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk2);
|
y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if (CRYPTOPP_BIG_ENDIAN)
|
#if (CRYPTOPP_BIG_ENDIAN)
|
||||||
@ -655,7 +656,7 @@ CRYPTOPP_INLINE void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||||
block = (uint32x4_p)vec_perm(x1, y1, m3);
|
block = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||||
}
|
}
|
||||||
|
|
||||||
CRYPTOPP_INLINE void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
CRYPTOPP_INLINE void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||||
@ -671,32 +672,32 @@ CRYPTOPP_INLINE void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||||
uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1);
|
uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
|
||||||
uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2);
|
uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
|
||||||
uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1);
|
uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
|
||||||
uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2);
|
uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
|
||||||
uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1);
|
uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
|
||||||
uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2);
|
uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);
|
||||||
|
|
||||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||||
{
|
{
|
||||||
const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
|
const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
|
||||||
y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk1);
|
y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
|
||||||
y2 = VectorXor(VectorXor(y2, SIMON128_f(x2)), rk1);
|
y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk1);
|
||||||
y3 = VectorXor(VectorXor(y3, SIMON128_f(x3)), rk1);
|
y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk1);
|
||||||
|
|
||||||
const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
|
const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
|
||||||
x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk2);
|
x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
|
||||||
x2 = VectorXor(VectorXor(x2, SIMON128_f(y2)), rk2);
|
x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk2);
|
||||||
x3 = VectorXor(VectorXor(x3, SIMON128_f(y3)), rk2);
|
x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk2);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rounds & 1)
|
if (rounds & 1)
|
||||||
{
|
{
|
||||||
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
|
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
|
||||||
y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk);
|
y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
|
||||||
y2 = VectorXor(VectorXor(y2, SIMON128_f(x2)), rk);
|
y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk);
|
||||||
y3 = VectorXor(VectorXor(y3, SIMON128_f(x3)), rk);
|
y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk);
|
||||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -709,12 +710,12 @@ CRYPTOPP_INLINE void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||||
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
|
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||||
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
|
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||||
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
|
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
|
||||||
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
|
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
|
||||||
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
|
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
|
||||||
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
|
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
|
||||||
}
|
}
|
||||||
|
|
||||||
CRYPTOPP_INLINE void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
CRYPTOPP_INLINE void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||||
@ -730,34 +731,34 @@ CRYPTOPP_INLINE void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||||
uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1);
|
uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
|
||||||
uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2);
|
uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
|
||||||
uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1);
|
uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
|
||||||
uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2);
|
uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
|
||||||
uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1);
|
uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
|
||||||
uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2);
|
uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);
|
||||||
|
|
||||||
if (rounds & 1)
|
if (rounds & 1)
|
||||||
{
|
{
|
||||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||||
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
|
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
|
||||||
y1 = VectorXor(VectorXor(y1, rk), SIMON128_f(x1));
|
y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1));
|
||||||
y2 = VectorXor(VectorXor(y2, rk), SIMON128_f(x2));
|
y2 = VecXor(VecXor(y2, rk), SIMON128_f(x2));
|
||||||
y3 = VectorXor(VectorXor(y3, rk), SIMON128_f(x3));
|
y3 = VecXor(VecXor(y3, rk), SIMON128_f(x3));
|
||||||
rounds--;
|
rounds--;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||||
{
|
{
|
||||||
const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
|
const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
|
||||||
x1 = VectorXor(VectorXor(x1, SIMON128_f(y1)), rk1);
|
x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1);
|
||||||
x2 = VectorXor(VectorXor(x2, SIMON128_f(y2)), rk1);
|
x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk1);
|
||||||
x3 = VectorXor(VectorXor(x3, SIMON128_f(y3)), rk1);
|
x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk1);
|
||||||
|
|
||||||
const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
|
const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
|
||||||
y1 = VectorXor(VectorXor(y1, SIMON128_f(x1)), rk2);
|
y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2);
|
||||||
y2 = VectorXor(VectorXor(y2, SIMON128_f(x2)), rk2);
|
y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk2);
|
||||||
y3 = VectorXor(VectorXor(y3, SIMON128_f(x3)), rk2);
|
y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk2);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if (CRYPTOPP_BIG_ENDIAN)
|
#if (CRYPTOPP_BIG_ENDIAN)
|
||||||
@ -769,12 +770,12 @@ CRYPTOPP_INLINE void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||||
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
|
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||||
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
|
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||||
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
|
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
|
||||||
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
|
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
|
||||||
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
|
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
|
||||||
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
|
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // CRYPTOPP_POWER8_AVAILABLE
|
#endif // CRYPTOPP_POWER8_AVAILABLE
|
||||||
173  simon64_simd.cpp
@@ -538,10 +538,11 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
 using CryptoPP::uint8x16_p;
 using CryptoPP::uint32x4_p;

-using CryptoPP::VectorAnd;
-using CryptoPP::VectorXor;
-using CryptoPP::VectorLoad;
-using CryptoPP::VectorLoadBE;
+using CryptoPP::VecAnd;
+using CryptoPP::VecXor;
+using CryptoPP::VecLoad;
+using CryptoPP::VecLoadBE;
+using CryptoPP::VecPermute;

 // Rotate left by bit count
 template<unsigned int C>
@@ -561,8 +562,8 @@ CRYPTOPP_INLINE uint32x4_p RotateRight32(const uint32x4_p val)

 CRYPTOPP_INLINE uint32x4_p SIMON64_f(const uint32x4_p val)
 {
-    return VectorXor(RotateLeft32<2>(val),
-        VectorAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
+    return VecXor(RotateLeft32<2>(val),
+        VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
 }

 CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
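In the SIMON64 rounds that follow, each 32-bit subkey is broadcast to all four lanes before it is XORed in; depending on the compiler the code either splats directly or loads and permutes. A condensed sketch of the two equivalent approaches (the helper name and the preprocessor guard are illustrative, not taken from this diff):

    // Two equivalent ways to broadcast subkeys[i] into every 32-bit lane.
    static inline uint32x4_p BroadcastSubkey32(const word32 *subkeys, int i)
    {
    #if defined(_ARCH_PWR8)   // illustrative guard; the real condition is elided by the hunk
        return vec_splats(subkeys[i]);                       // direct splat
    #else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);                  // lane 0 holds subkeys[i]
        return VecPermute(rk, rk, m);                        // replicate lane 0 (BE numbering)
    #endif
    }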
@ -577,8 +578,8 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||||
uint32x4_p x1 = vec_perm(block0, block1, m1);
|
uint32x4_p x1 = VecPermute(block0, block1, m1);
|
||||||
uint32x4_p y1 = vec_perm(block0, block1, m2);
|
uint32x4_p y1 = VecPermute(block0, block1, m2);
|
||||||
|
|
||||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||||
{
|
{
|
||||||
@ -587,13 +588,13 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
|
|||||||
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
|
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
|
||||||
#else
|
#else
|
||||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||||
uint32x4_p rk1 = VectorLoad(subkeys+i);
|
uint32x4_p rk1 = VecLoad(subkeys+i);
|
||||||
uint32x4_p rk2 = VectorLoad(subkeys+i+1);
|
uint32x4_p rk2 = VecLoad(subkeys+i+1);
|
||||||
rk1 = vec_perm(rk1, rk1, m);
|
rk1 = VecPermute(rk1, rk1, m);
|
||||||
rk2 = vec_perm(rk2, rk2, m);
|
rk2 = VecPermute(rk2, rk2, m);
|
||||||
#endif
|
#endif
|
||||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1);
|
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
|
||||||
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2);
|
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rounds & 1)
|
if (rounds & 1)
|
||||||
@ -602,10 +603,10 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
|
|||||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||||
#else
|
#else
|
||||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||||
uint32x4_p rk = VectorLoad(subkeys+rounds-1);
|
uint32x4_p rk = VecLoad(subkeys+rounds-1);
|
||||||
rk = vec_perm(rk, rk, m);
|
rk = VecPermute(rk, rk, m);
|
||||||
#endif
|
#endif
|
||||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk);
|
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
|
||||||
std::swap(x1, y1);
|
std::swap(x1, y1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -618,8 +619,8 @@ CRYPTOPP_INLINE void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||||
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
|
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||||
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
|
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||||
}
|
}
|
||||||
|
|
||||||
CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
|
CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
|
||||||
@ -634,8 +635,8 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
|
||||||
uint32x4_p x1 = vec_perm(block0, block1, m1);
|
uint32x4_p x1 = VecPermute(block0, block1, m1);
|
||||||
uint32x4_p y1 = vec_perm(block0, block1, m2);
|
uint32x4_p y1 = VecPermute(block0, block1, m2);
|
||||||
|
|
||||||
if (rounds & 1)
|
if (rounds & 1)
|
||||||
{
|
{
|
||||||
@ -644,10 +645,10 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
|
|||||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||||
#else
|
#else
|
||||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||||
uint32x4_p rk = VectorLoad(subkeys+rounds-1);
|
uint32x4_p rk = VecLoad(subkeys+rounds-1);
|
||||||
rk = vec_perm(rk, rk, m);
|
rk = VecPermute(rk, rk, m);
|
||||||
#endif
|
#endif
|
||||||
y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1));
|
y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
|
||||||
rounds--;
|
rounds--;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -658,13 +659,13 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
|
|||||||
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
||||||
#else
|
#else
|
||||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||||
uint32x4_p rk1 = VectorLoad(subkeys+i+1);
|
uint32x4_p rk1 = VecLoad(subkeys+i+1);
|
||||||
uint32x4_p rk2 = VectorLoad(subkeys+i);
|
uint32x4_p rk2 = VecLoad(subkeys+i);
|
||||||
rk1 = vec_perm(rk1, rk1, m);
|
rk1 = VecPermute(rk1, rk1, m);
|
||||||
rk2 = vec_perm(rk2, rk2, m);
|
rk2 = VecPermute(rk2, rk2, m);
|
||||||
#endif
|
#endif
|
||||||
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1);
|
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
|
||||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2);
|
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if (CRYPTOPP_BIG_ENDIAN)
|
#if (CRYPTOPP_BIG_ENDIAN)
|
||||||
@ -676,8 +677,8 @@ CRYPTOPP_INLINE void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
|
||||||
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
|
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||||
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
|
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||||
}
|
}
|
||||||
|
|
||||||
CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||||
@ -693,12 +694,12 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||||
uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1);
|
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
|
||||||
uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2);
|
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
|
||||||
uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1);
|
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
|
||||||
uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2);
|
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
|
||||||
uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1);
|
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
|
||||||
uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2);
|
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
|
||||||
|
|
||||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||||
{
|
{
|
||||||
@ -707,18 +708,18 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
|
|||||||
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
|
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
|
||||||
#else
|
#else
|
||||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||||
uint32x4_p rk1 = VectorLoad(subkeys+i);
|
uint32x4_p rk1 = VecLoad(subkeys+i);
|
||||||
uint32x4_p rk2 = VectorLoad(subkeys+i+1);
|
uint32x4_p rk2 = VecLoad(subkeys+i+1);
|
||||||
rk1 = vec_perm(rk1, rk1, m);
|
rk1 = VecPermute(rk1, rk1, m);
|
||||||
rk2 = vec_perm(rk2, rk2, m);
|
rk2 = VecPermute(rk2, rk2, m);
|
||||||
#endif
|
#endif
|
||||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1);
|
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
|
||||||
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk1);
|
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
|
||||||
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk1);
|
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);
|
||||||
|
|
||||||
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2);
|
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
|
||||||
x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk2);
|
x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
|
||||||
x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk2);
|
x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rounds & 1)
|
if (rounds & 1)
|
||||||
@ -727,12 +728,12 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
|
|||||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||||
#else
|
#else
|
||||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||||
uint32x4_p rk = VectorLoad(subkeys+rounds-1);
|
uint32x4_p rk = VecLoad(subkeys+rounds-1);
|
||||||
rk = vec_perm(rk, rk, m);
|
rk = VecPermute(rk, rk, m);
|
||||||
#endif
|
#endif
|
||||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk);
|
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
|
||||||
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk);
|
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
|
||||||
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk);
|
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
|
||||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -745,12 +746,12 @@ CRYPTOPP_INLINE void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||||
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
|
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||||
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
|
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||||
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
|
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
|
||||||
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
|
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
|
||||||
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
|
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
|
||||||
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
|
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
|
||||||
}
|
}
|
||||||
|
|
||||||
CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
||||||
@ -766,12 +767,12 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||||
uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1);
|
uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
|
||||||
uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2);
|
uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
|
||||||
uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1);
|
uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
|
||||||
uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2);
|
uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
|
||||||
uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1);
|
uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
|
||||||
uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2);
|
uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
|
||||||
|
|
||||||
if (rounds & 1)
|
if (rounds & 1)
|
||||||
{
|
{
|
||||||
@ -781,12 +782,12 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
|
|||||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||||
#else
|
#else
|
||||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||||
uint32x4_p rk = VectorLoad(subkeys+rounds-1);
|
uint32x4_p rk = VecLoad(subkeys+rounds-1);
|
||||||
rk = vec_perm(rk, rk, m);
|
rk = VecPermute(rk, rk, m);
|
||||||
#endif
|
#endif
|
||||||
y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1));
|
y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
|
||||||
y2 = VectorXor(VectorXor(y2, rk), SIMON64_f(x2));
|
y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
|
||||||
y3 = VectorXor(VectorXor(y3, rk), SIMON64_f(x3));
|
y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
|
||||||
rounds--;
|
rounds--;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -797,18 +798,18 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
|
|||||||
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
||||||
#else
|
#else
|
||||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||||
uint32x4_p rk1 = VectorLoad(subkeys+i+1);
|
uint32x4_p rk1 = VecLoad(subkeys+i+1);
|
||||||
uint32x4_p rk2 = VectorLoad(subkeys+i);
|
uint32x4_p rk2 = VecLoad(subkeys+i);
|
||||||
rk1 = vec_perm(rk1, rk1, m);
|
rk1 = VecPermute(rk1, rk1, m);
|
||||||
rk2 = vec_perm(rk2, rk2, m);
|
rk2 = VecPermute(rk2, rk2, m);
|
||||||
#endif
|
#endif
|
||||||
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1);
|
x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
|
||||||
x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk1);
|
x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
|
||||||
x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk1);
|
x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);
|
||||||
|
|
||||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2);
|
y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
|
||||||
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk2);
|
y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
|
||||||
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk2);
|
y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if (CRYPTOPP_BIG_ENDIAN)
|
#if (CRYPTOPP_BIG_ENDIAN)
|
||||||
@ -820,12 +821,12 @@ CRYPTOPP_INLINE void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
|
||||||
block0 = (uint32x4_p)vec_perm(x1, y1, m3);
|
block0 = (uint32x4_p)VecPermute(x1, y1, m3);
|
||||||
block1 = (uint32x4_p)vec_perm(x1, y1, m4);
|
block1 = (uint32x4_p)VecPermute(x1, y1, m4);
|
||||||
block2 = (uint32x4_p)vec_perm(x2, y2, m3);
|
block2 = (uint32x4_p)VecPermute(x2, y2, m3);
|
||||||
block3 = (uint32x4_p)vec_perm(x2, y2, m4);
|
block3 = (uint32x4_p)VecPermute(x2, y2, m4);
|
||||||
block4 = (uint32x4_p)vec_perm(x3, y3, m3);
|
block4 = (uint32x4_p)VecPermute(x3, y3, m3);
|
||||||
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
|
block5 = (uint32x4_p)VecPermute(x3, y3, m4);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
||||||
|
@@ -479,9 +479,10 @@ using CryptoPP::uint8x16_p;
 using CryptoPP::uint32x4_p;
 using CryptoPP::uint64x2_p;

-using CryptoPP::VectorAdd;
-using CryptoPP::VectorSub;
-using CryptoPP::VectorXor;
+using CryptoPP::VecAdd;
+using CryptoPP::VecSub;
+using CryptoPP::VecXor;
+using CryptoPP::VecPermute;

 // Rotate left by bit count
 template<unsigned int C>
@@ -510,19 +511,19 @@ void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r
 #endif

     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1);
-    uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2);
+    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
+    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
         const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

         x1 = RotateRight64<8>(x1);
-        x1 = VectorAdd(x1, y1);
-        x1 = VectorXor(x1, rk);
+        x1 = VecAdd(x1, y1);
+        x1 = VecXor(x1, rk);

         y1 = RotateLeft64<3>(y1);
-        y1 = VectorXor(y1, x1);
+        y1 = VecXor(y1, x1);
     }

 #if (CRYPTOPP_BIG_ENDIAN)
@@ -534,7 +535,7 @@ void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r
 #endif

     // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-    block = (uint32x4_p)vec_perm(x1, y1, m3);
+    block = (uint32x4_p)VecPermute(x1, y1, m3);
 }

 void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
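The SPECK128 round mixes a modular add, two rotates and the subkey XOR; the vector code above performs exactly the textbook update. A scalar reference sketch in the same order:

    #include <cstdint>

    static inline uint64_t rotl64(uint64_t x, unsigned n) { return (x << n) | (x >> (64 - n)); }
    static inline uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

    // Scalar reference of one SPECK128 encryption round, matching the vector
    // update order:  x = ((x >>> 8) + y) ^ k;  y = (y <<< 3) ^ x;
    static inline void speck128_round(uint64_t &x, uint64_t &y, uint64_t k)
    {
        x = rotr64(x, 8);
        x += y;
        x ^= k;
        y = rotl64(y, 3);
        y ^= x;
    }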
@ -548,17 +549,17 @@ void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
|
||||||
uint64x2_p x1 = (uint64x2_p)vec_perm(block, block, m1);
|
uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
|
||||||
uint64x2_p y1 = (uint64x2_p)vec_perm(block, block, m2);
|
uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);
|
||||||
|
|
||||||
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
|
for (int i = static_cast<int>(rounds-1); i >= 0; --i)
|
||||||
{
|
{
|
||||||
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);
|
const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);
|
||||||
|
|
||||||
y1 = VectorXor(y1, x1);
|
y1 = VecXor(y1, x1);
|
||||||
y1 = RotateRight64<3>(y1);
|
y1 = RotateRight64<3>(y1);
|
||||||
x1 = VectorXor(x1, rk);
|
x1 = VecXor(x1, rk);
|
||||||
x1 = VectorSub(x1, y1);
|
x1 = VecSub(x1, y1);
|
||||||
x1 = RotateLeft64<8>(x1);
|
x1 = RotateLeft64<8>(x1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -571,7 +572,7 @@ void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int r
 #endif

 // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-block = (uint32x4_p)vec_perm(x1, y1, m3);
+block = (uint32x4_p)VecPermute(x1, y1, m3);
 }

 void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@@ -587,12 +588,12 @@ void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1);
+uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
-uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2);
+uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
-uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1);
+uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
-uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2);
+uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
-uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1);
+uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
-uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2);
+uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

 for (int i=0; i < static_cast<int>(rounds); ++i)
 {
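The six permutes above gather matching 64-bit halves from pairs of blocks, so x1/y1, x2/y2 and x3/y3 each carry the halves of two blocks and every SIMD operation in the loop advances two blocks at once. A plain-C++ sketch of that regrouping, with illustrative types and field names:

#include <cstdint>

// Illustrative only: regroup two 128-bit blocks [A1 A2] and [B1 B2]
// into x = [A1 B1] and y = [A2 B2], which is what VecPermute with the
// masks m1 and m2 achieves for each pair of input blocks.
struct Block128 { uint64_t w0, w1; };   // w0 = first 64-bit word, w1 = second

static inline void deinterleave(const Block128& a, const Block128& b,
                                uint64_t x[2], uint64_t y[2])
{
    x[0] = a.w0; x[1] = b.w0;   // [A1 B1]
    y[0] = a.w1; y[1] = b.w1;   // [A2 B2]
}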
@@ -601,19 +602,19 @@ void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 x1 = RotateRight64<8>(x1);
 x2 = RotateRight64<8>(x2);
 x3 = RotateRight64<8>(x3);
-x1 = VectorAdd(x1, y1);
+x1 = VecAdd(x1, y1);
-x2 = VectorAdd(x2, y2);
+x2 = VecAdd(x2, y2);
-x3 = VectorAdd(x3, y3);
+x3 = VecAdd(x3, y3);
-x1 = VectorXor(x1, rk);
+x1 = VecXor(x1, rk);
-x2 = VectorXor(x2, rk);
+x2 = VecXor(x2, rk);
-x3 = VectorXor(x3, rk);
+x3 = VecXor(x3, rk);

 y1 = RotateLeft64<3>(y1);
 y2 = RotateLeft64<3>(y2);
 y3 = RotateLeft64<3>(y3);
-y1 = VectorXor(y1, x1);
+y1 = VecXor(y1, x1);
-y2 = VectorXor(y2, x2);
+y2 = VecXor(y2, x2);
-y3 = VectorXor(y3, x3);
+y3 = VecXor(y3, x3);
 }

 #if (CRYPTOPP_BIG_ENDIAN)
@@ -625,12 +626,12 @@ void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-block0 = (uint32x4_p)vec_perm(x1, y1, m3);
+block0 = (uint32x4_p)VecPermute(x1, y1, m3);
-block1 = (uint32x4_p)vec_perm(x1, y1, m4);
+block1 = (uint32x4_p)VecPermute(x1, y1, m4);
-block2 = (uint32x4_p)vec_perm(x2, y2, m3);
+block2 = (uint32x4_p)VecPermute(x2, y2, m3);
-block3 = (uint32x4_p)vec_perm(x2, y2, m4);
+block3 = (uint32x4_p)VecPermute(x2, y2, m4);
-block4 = (uint32x4_p)vec_perm(x3, y3, m3);
+block4 = (uint32x4_p)VecPermute(x3, y3, m3);
-block5 = (uint32x4_p)vec_perm(x3, y3, m4);
+block5 = (uint32x4_p)VecPermute(x3, y3, m4);
 }

 void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@@ -646,30 +647,30 @@ void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-uint64x2_p x1 = (uint64x2_p)vec_perm(block0, block1, m1);
+uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
-uint64x2_p y1 = (uint64x2_p)vec_perm(block0, block1, m2);
+uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
-uint64x2_p x2 = (uint64x2_p)vec_perm(block2, block3, m1);
+uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
-uint64x2_p y2 = (uint64x2_p)vec_perm(block2, block3, m2);
+uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
-uint64x2_p x3 = (uint64x2_p)vec_perm(block4, block5, m1);
+uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
-uint64x2_p y3 = (uint64x2_p)vec_perm(block4, block5, m2);
+uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

 for (int i = static_cast<int>(rounds-1); i >= 0; --i)
 {
 const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

-y1 = VectorXor(y1, x1);
+y1 = VecXor(y1, x1);
-y2 = VectorXor(y2, x2);
+y2 = VecXor(y2, x2);
-y3 = VectorXor(y3, x3);
+y3 = VecXor(y3, x3);
 y1 = RotateRight64<3>(y1);
 y2 = RotateRight64<3>(y2);
 y3 = RotateRight64<3>(y3);

-x1 = VectorXor(x1, rk);
+x1 = VecXor(x1, rk);
-x2 = VectorXor(x2, rk);
+x2 = VecXor(x2, rk);
-x3 = VectorXor(x3, rk);
+x3 = VecXor(x3, rk);
-x1 = VectorSub(x1, y1);
+x1 = VecSub(x1, y1);
-x2 = VectorSub(x2, y2);
+x2 = VecSub(x2, y2);
-x3 = VectorSub(x3, y3);
+x3 = VecSub(x3, y3);
 x1 = RotateLeft64<8>(x1);
 x2 = RotateLeft64<8>(x2);
 x3 = RotateLeft64<8>(x3);
@@ -684,12 +685,12 @@ void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-block0 = (uint32x4_p)vec_perm(x1, y1, m3);
+block0 = (uint32x4_p)VecPermute(x1, y1, m3);
-block1 = (uint32x4_p)vec_perm(x1, y1, m4);
+block1 = (uint32x4_p)VecPermute(x1, y1, m4);
-block2 = (uint32x4_p)vec_perm(x2, y2, m3);
+block2 = (uint32x4_p)VecPermute(x2, y2, m3);
-block3 = (uint32x4_p)vec_perm(x2, y2, m4);
+block3 = (uint32x4_p)VecPermute(x2, y2, m4);
-block4 = (uint32x4_p)vec_perm(x3, y3, m3);
+block4 = (uint32x4_p)VecPermute(x3, y3, m3);
-block5 = (uint32x4_p)vec_perm(x3, y3, m4);
+block5 = (uint32x4_p)VecPermute(x3, y3, m4);
 }

 #endif  // CRYPTOPP_POWER8_AVAILABLE
speck64_simd.cpp (137 changed lines)
@@ -483,10 +483,11 @@ CRYPTOPP_INLINE void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
 using CryptoPP::uint8x16_p;
 using CryptoPP::uint32x4_p;

-using CryptoPP::VectorAdd;
+using CryptoPP::VecAdd;
-using CryptoPP::VectorSub;
+using CryptoPP::VecSub;
-using CryptoPP::VectorXor;
+using CryptoPP::VecXor;
-using CryptoPP::VectorLoad;
+using CryptoPP::VecLoad;
+using CryptoPP::VecPermute;

 // Rotate left by bit count
 template<unsigned int C>
@@ -516,8 +517,8 @@ void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-uint32x4_p x1 = vec_perm(block0, block1, m1);
+uint32x4_p x1 = VecPermute(block0, block1, m1);
-uint32x4_p y1 = vec_perm(block0, block1, m2);
+uint32x4_p y1 = VecPermute(block0, block1, m2);

 for (int i=0; i < static_cast<int>(rounds); ++i)
 {
@@ -526,16 +527,16 @@ void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
 #else
 // subkeys has extra elements so memory backs the last subkey
 const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
-uint32x4_p rk = VectorLoad(subkeys+i);
+uint32x4_p rk = VecLoad(subkeys+i);
-rk = vec_perm(rk, rk, m);
+rk = VecPermute(rk, rk, m);
 #endif

 x1 = RotateRight32<8>(x1);
-x1 = VectorAdd(x1, y1);
+x1 = VecAdd(x1, y1);
-x1 = VectorXor(x1, rk);
+x1 = VecXor(x1, rk);

 y1 = RotateLeft32<3>(y1);
-y1 = VectorXor(y1, x1);
+y1 = VecXor(y1, x1);
 }

 #if (CRYPTOPP_BIG_ENDIAN)
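Two details in the hunk above are worth spelling out. First, on the fallback path the VecLoad plus VecPermute with the byte mask {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3} effectively broadcasts one 32-bit subkey into all four lanes. Second, per 32-bit lane the loop is the ordinary SPECK-64 encryption round, sketched here with illustrative scalar helpers that are not part of the patch:

#include <cstdint>

static inline uint32_t rotr32(uint32_t v, unsigned r) { return (v >> r) | (v << (32 - r)); }
static inline uint32_t rotl32(uint32_t v, unsigned r) { return (v << r) | (v >> (32 - r)); }

// One SPECK-64 encryption round for a single 32-bit word pair,
// mirroring RotateRight32<8>/VecAdd/VecXor/RotateLeft32<3>/VecXor above.
static inline void speck64_enc_round(uint32_t& x, uint32_t& y, uint32_t k)
{
    x = rotr32(x, 8);   // x = ROTR(x, 8)
    x += y;             // x = x + y (mod 2^32)
    x ^= k;             // x = x XOR round key
    y = rotl32(y, 3);   // y = ROTL(y, 3)
    y ^= x;             // y = y XOR x
}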
@@ -547,8 +548,8 @@ void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-block0 = (uint32x4_p)vec_perm(x1, y1, m3);
+block0 = (uint32x4_p)VecPermute(x1, y1, m3);
-block1 = (uint32x4_p)vec_perm(x1, y1, m4);
+block1 = (uint32x4_p)VecPermute(x1, y1, m4);
 }

 void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
@@ -563,8 +564,8 @@ void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-uint32x4_p x1 = vec_perm(block0, block1, m1);
+uint32x4_p x1 = VecPermute(block0, block1, m1);
-uint32x4_p y1 = vec_perm(block0, block1, m2);
+uint32x4_p y1 = VecPermute(block0, block1, m2);

 for (int i = static_cast<int>(rounds-1); i >= 0; --i)
 {
@@ -573,15 +574,15 @@ void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
 #else
 // subkeys has extra elements so memory backs the last subkey
 const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
-uint32x4_p rk = VectorLoad(subkeys+i);
+uint32x4_p rk = VecLoad(subkeys+i);
-rk = vec_perm(rk, rk, m);
+rk = VecPermute(rk, rk, m);
 #endif

-y1 = VectorXor(y1, x1);
+y1 = VecXor(y1, x1);
 y1 = RotateRight32<3>(y1);

-x1 = VectorXor(x1, rk);
+x1 = VecXor(x1, rk);
-x1 = VectorSub(x1, y1);
+x1 = VecSub(x1, y1);
 x1 = RotateLeft32<8>(x1);
 }

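As with the 128-bit variant, decryption runs the same steps backwards; per 32-bit lane the loop above matches this sketch, reusing the rotr32/rotl32 helpers from the encryption sketch (names illustrative only):

static inline void speck64_dec_round(uint32_t& x, uint32_t& y, uint32_t k)
{
    y ^= x;             // undo y = y XOR x
    y = rotr32(y, 3);   // undo ROTL(y, 3)
    x ^= k;             // undo the round-key XOR
    x -= y;             // undo the modular addition
    x = rotl32(x, 8);   // undo ROTR(x, 8)
}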
@@ -594,8 +595,8 @@ void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-block0 = (uint32x4_p)vec_perm(x1, y1, m3);
+block0 = (uint32x4_p)VecPermute(x1, y1, m3);
-block1 = (uint32x4_p)vec_perm(x1, y1, m4);
+block1 = (uint32x4_p)VecPermute(x1, y1, m4);
 }

 void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@@ -611,12 +612,12 @@ void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1);
+uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
-uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2);
+uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
-uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1);
+uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
-uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2);
+uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
-uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1);
+uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
-uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2);
+uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

 for (int i=0; i < static_cast<int>(rounds); ++i)
 {
@@ -625,29 +626,29 @@ void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 #else
 // subkeys has extra elements so memory backs the last subkey
 const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
-uint32x4_p rk = VectorLoad(subkeys+i);
+uint32x4_p rk = VecLoad(subkeys+i);
-rk = vec_perm(rk, rk, m);
+rk = VecPermute(rk, rk, m);
 #endif

 x1 = RotateRight32<8>(x1);
 x2 = RotateRight32<8>(x2);
 x3 = RotateRight32<8>(x3);

-x1 = VectorAdd(x1, y1);
+x1 = VecAdd(x1, y1);
-x2 = VectorAdd(x2, y2);
+x2 = VecAdd(x2, y2);
-x3 = VectorAdd(x3, y3);
+x3 = VecAdd(x3, y3);

-x1 = VectorXor(x1, rk);
+x1 = VecXor(x1, rk);
-x2 = VectorXor(x2, rk);
+x2 = VecXor(x2, rk);
-x3 = VectorXor(x3, rk);
+x3 = VecXor(x3, rk);

 y1 = RotateLeft32<3>(y1);
 y2 = RotateLeft32<3>(y2);
 y3 = RotateLeft32<3>(y3);

-y1 = VectorXor(y1, x1);
+y1 = VecXor(y1, x1);
-y2 = VectorXor(y2, x2);
+y2 = VecXor(y2, x2);
-y3 = VectorXor(y3, x3);
+y3 = VecXor(y3, x3);
 }

 #if (CRYPTOPP_BIG_ENDIAN)
@@ -659,12 +660,12 @@ void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-block0 = (uint32x4_p)vec_perm(x1, y1, m3);
+block0 = (uint32x4_p)VecPermute(x1, y1, m3);
-block1 = (uint32x4_p)vec_perm(x1, y1, m4);
+block1 = (uint32x4_p)VecPermute(x1, y1, m4);
-block2 = (uint32x4_p)vec_perm(x2, y2, m3);
+block2 = (uint32x4_p)VecPermute(x2, y2, m3);
-block3 = (uint32x4_p)vec_perm(x2, y2, m4);
+block3 = (uint32x4_p)VecPermute(x2, y2, m4);
-block4 = (uint32x4_p)vec_perm(x3, y3, m3);
+block4 = (uint32x4_p)VecPermute(x3, y3, m3);
-block5 = (uint32x4_p)vec_perm(x3, y3, m4);
+block5 = (uint32x4_p)VecPermute(x3, y3, m4);
 }

 void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
@@ -680,12 +681,12 @@ void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-uint32x4_p x1 = (uint32x4_p)vec_perm(block0, block1, m1);
+uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
-uint32x4_p y1 = (uint32x4_p)vec_perm(block0, block1, m2);
+uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
-uint32x4_p x2 = (uint32x4_p)vec_perm(block2, block3, m1);
+uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
-uint32x4_p y2 = (uint32x4_p)vec_perm(block2, block3, m2);
+uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
-uint32x4_p x3 = (uint32x4_p)vec_perm(block4, block5, m1);
+uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
-uint32x4_p y3 = (uint32x4_p)vec_perm(block4, block5, m2);
+uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

 for (int i = static_cast<int>(rounds-1); i >= 0; --i)
 {
@@ -694,25 +695,25 @@ void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 #else
 // subkeys has extra elements so memory backs the last subkey
 const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
-uint32x4_p rk = VectorLoad(subkeys+i);
+uint32x4_p rk = VecLoad(subkeys+i);
-rk = vec_perm(rk, rk, m);
+rk = VecPermute(rk, rk, m);
 #endif

-y1 = VectorXor(y1, x1);
+y1 = VecXor(y1, x1);
-y2 = VectorXor(y2, x2);
+y2 = VecXor(y2, x2);
-y3 = VectorXor(y3, x3);
+y3 = VecXor(y3, x3);

 y1 = RotateRight32<3>(y1);
 y2 = RotateRight32<3>(y2);
 y3 = RotateRight32<3>(y3);

-x1 = VectorXor(x1, rk);
+x1 = VecXor(x1, rk);
-x2 = VectorXor(x2, rk);
+x2 = VecXor(x2, rk);
-x3 = VectorXor(x3, rk);
+x3 = VecXor(x3, rk);

-x1 = VectorSub(x1, y1);
+x1 = VecSub(x1, y1);
-x2 = VectorSub(x2, y2);
+x2 = VecSub(x2, y2);
-x3 = VectorSub(x3, y3);
+x3 = VecSub(x3, y3);

 x1 = RotateLeft32<8>(x1);
 x2 = RotateLeft32<8>(x2);
@@ -728,12 +729,12 @@ void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 #endif

 // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-block0 = (uint32x4_p)vec_perm(x1, y1, m3);
+block0 = (uint32x4_p)VecPermute(x1, y1, m3);
-block1 = (uint32x4_p)vec_perm(x1, y1, m4);
+block1 = (uint32x4_p)VecPermute(x1, y1, m4);
-block2 = (uint32x4_p)vec_perm(x2, y2, m3);
+block2 = (uint32x4_p)VecPermute(x2, y2, m3);
-block3 = (uint32x4_p)vec_perm(x2, y2, m4);
+block3 = (uint32x4_p)VecPermute(x2, y2, m4);
-block4 = (uint32x4_p)vec_perm(x3, y3, m3);
+block4 = (uint32x4_p)VecPermute(x3, y3, m3);
-block5 = (uint32x4_p)vec_perm(x3, y3, m4);
+block5 = (uint32x4_p)VecPermute(x3, y3, m4);
 }

 #endif  // CRYPTOPP_ALTIVEC_AVAILABLE
validat1.cpp (48 changed lines)
@@ -1089,44 +1089,44 @@ bool TestAltivecOps()
 const byte st2[16] ={21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6};
 const byte st3[16] ={20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5};

-VectorStore(VectorLoad(src), dest);
+VecStore(VecLoad(src), dest);
 pass1 = (0 == std::memcmp(src, dest, 16)) && pass1;
 CRYPTOPP_ASSERT(pass1);

-VectorStore(VectorLoad(src+1), dest+1);
+VecStore(VecLoad(src+1), dest+1);
 pass1 = (0 == std::memcmp(st1, dest+1, 16)) && pass1;
 CRYPTOPP_ASSERT(pass1);

-VectorStore(VectorLoad(src+2), dest+2);
+VecStore(VecLoad(src+2), dest+2);
 pass1 = (0 == std::memcmp(st2, dest+2, 16)) && pass1;
 CRYPTOPP_ASSERT(pass1);

-VectorStore(VectorLoad(src+3), dest+3);
+VecStore(VecLoad(src+3), dest+3);
 pass1 = (0 == std::memcmp(st3, dest+3, 16)) && pass1;
 CRYPTOPP_ASSERT(pass1);

-VectorStoreBE(VectorLoadBE(src), dest);
+VecStoreBE(VecLoadBE(src), dest);
 pass1 = (0 == std::memcmp(src, dest, 16)) && pass1;
 CRYPTOPP_ASSERT(pass1);

-VectorStoreBE(VectorLoadBE(src+1), dest+1);
+VecStoreBE(VecLoadBE(src+1), dest+1);
 pass1 = (0 == std::memcmp(st1, dest+1, 16)) && pass1;
 CRYPTOPP_ASSERT(pass1);

-VectorStoreBE(VectorLoadBE(src+2), dest+2);
+VecStoreBE(VecLoadBE(src+2), dest+2);
 pass1 = (0 == std::memcmp(st2, dest+2, 16)) && pass1;
 CRYPTOPP_ASSERT(pass1);

-VectorStoreBE(VectorLoadBE(src+3), dest+3);
+VecStoreBE(VecLoadBE(src+3), dest+3);
 pass1 = (0 == std::memcmp(st3, dest+3, 16)) && pass1;
 CRYPTOPP_ASSERT(pass1);

 #if (CRYPTOPP_LITTLE_ENDIAN)
-VectorStore(VectorLoadBE(src), dest);
+VecStore(VecLoadBE(src), dest);
 pass1 = (0 != std::memcmp(src, dest, 16)) && pass1;
 CRYPTOPP_ASSERT(pass1);

-VectorStoreBE(VectorLoad(src), dest);
+VecStoreBE(VecLoad(src), dest);
 pass1 = (0 != std::memcmp(src, dest, 16)) && pass1;
 CRYPTOPP_ASSERT(pass1);
 #endif
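The little-endian-only checks at the end of this hunk expect a mismatch: pairing a big-endian load with a native store (or the reverse) applies the byte reversal only once, so a non-palindromic pattern cannot survive the round trip. A rough stand-alone model of that expectation (not Crypto++ code; it assumes the BE helpers behave like a 16-byte reversal on little-endian machines):

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <cstdint>

int main()
{
    uint8_t src[16], dest[16];
    for (int i = 0; i < 16; ++i)
        src[i] = static_cast<uint8_t>(i + 1);   // illustrative, non-palindromic pattern

    std::memcpy(dest, src, 16);
    std::reverse(dest, dest + 16);   // stand-in for VecStore(VecLoadBE(src), dest) on LE

    // Mirrors the test's expectation: the buffers must differ.
    std::printf("mismatch: %s\n", (std::memcmp(src, dest, 16) != 0) ? "yes" : "no");
    return 0;
}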
@@ -1143,9 +1143,9 @@ bool TestAltivecOps()
 uint8x16_p val = {0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff,
 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff};

-pass2 = (VectorEqual(val, VectorShiftLeftOctet<0>(val))) && pass2;
+pass2 = (VecEqual(val, VecShiftLeftOctet<0>(val))) && pass2;
 CRYPTOPP_ASSERT(pass2);
-pass2 = (VectorEqual(val, VectorShiftRightOctet<0>(val))) && pass2;
+pass2 = (VecEqual(val, VecShiftRightOctet<0>(val))) && pass2;
 CRYPTOPP_ASSERT(pass2);

 uint8x16_p lsh1 = {0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff,
@@ -1153,9 +1153,9 @@ bool TestAltivecOps()
 uint8x16_p rsh1 = {0x00,0xff,0xff,0xff, 0xff,0xff,0xff,0xff,
 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff};

-pass2 = (VectorEqual(lsh1, VectorShiftLeftOctet<1>(val))) && pass2;
+pass2 = (VecEqual(lsh1, VecShiftLeftOctet<1>(val))) && pass2;
 CRYPTOPP_ASSERT(pass2);
-pass2 = (VectorEqual(rsh1, VectorShiftRightOctet<1>(val))) && pass2;
+pass2 = (VecEqual(rsh1, VecShiftRightOctet<1>(val))) && pass2;
 CRYPTOPP_ASSERT(pass2);

 uint8x16_p lsh15 = {0xff,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
@@ -1163,9 +1163,9 @@ bool TestAltivecOps()
 uint8x16_p rsh15 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xff};

-pass2 = (VectorEqual(lsh15, VectorShiftLeftOctet<15>(val))) && pass2;
+pass2 = (VecEqual(lsh15, VecShiftLeftOctet<15>(val))) && pass2;
 CRYPTOPP_ASSERT(pass2);
-pass2 = (VectorEqual(rsh15, VectorShiftRightOctet<15>(val))) && pass2;
+pass2 = (VecEqual(rsh15, VecShiftRightOctet<15>(val))) && pass2;
 CRYPTOPP_ASSERT(pass2);

 uint8x16_p lsh16 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
@@ -1173,9 +1173,9 @@ bool TestAltivecOps()
 uint8x16_p rsh16 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00};

-pass2 = (VectorEqual(lsh16, VectorShiftLeftOctet<16>(val))) && pass2;
+pass2 = (VecEqual(lsh16, VecShiftLeftOctet<16>(val))) && pass2;
 CRYPTOPP_ASSERT(pass2);
-pass2 = (VectorEqual(rsh16, VectorShiftRightOctet<16>(val))) && pass2;
+pass2 = (VecEqual(rsh16, VecShiftRightOctet<16>(val))) && pass2;
 CRYPTOPP_ASSERT(pass2);

 if (!pass2)
@@ -1194,16 +1194,16 @@ bool TestAltivecOps()
 uint8x16_p ex3 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
 0x1f,0x1e,0x1d,0x1c, 0x1b,0x1a,0x19,0x18};

-pass3 = VectorEqual(ex2, VectorGetLow(ex1)) && pass3;
+pass3 = VecEqual(ex2, VecGetLow(ex1)) && pass3;
 CRYPTOPP_ASSERT(pass3);
-pass3 = VectorEqual(ex3, VectorGetHigh(ex1)) && pass3;
+pass3 = VecEqual(ex3, VecGetHigh(ex1)) && pass3;
 CRYPTOPP_ASSERT(pass3);

-uint8x16_p ex4 = VectorShiftRightOctet<8>(VectorShiftLeftOctet<8>(ex1));
+uint8x16_p ex4 = VecShiftRightOctet<8>(VecShiftLeftOctet<8>(ex1));
-pass3 = VectorEqual(ex4, VectorGetLow(ex1)) && pass3;
+pass3 = VecEqual(ex4, VecGetLow(ex1)) && pass3;
 CRYPTOPP_ASSERT(pass3);
-uint8x16_p ex5 = VectorShiftRightOctet<8>(ex1);
+uint8x16_p ex5 = VecShiftRightOctet<8>(ex1);
-pass3 = VectorEqual(ex5, VectorGetHigh(ex1)) && pass3;
+pass3 = VecEqual(ex5, VecGetHigh(ex1)) && pass3;
 CRYPTOPP_ASSERT(pass3);

 if (!pass3)