diff --git a/adv-simd.h b/adv-simd.h
index 9cb2ec19..814f83e6 100644
--- a/adv-simd.h
+++ b/adv-simd.h
@@ -24,6 +24,7 @@
 // * AdvancedProcessBlocks128_4x1_NEON
 // * AdvancedProcessBlocks128_6x2_NEON
 // * AdvancedProcessBlocks64_6x1_ALTIVEC
+// * AdvancedProcessBlocks128_4x1_ALTIVEC
 // * AdvancedProcessBlocks128_6x1_ALTIVEC
 //
 // If an arrangement ends in 2, like 6x2, then the template will handle the
@@ -1781,6 +1782,158 @@ NAMESPACE_END  // CryptoPP
 
 NAMESPACE_BEGIN(CryptoPP)
 
+/// \brief AdvancedProcessBlocks for 1 and 4 blocks
+/// \tparam F1 function to process 1 128-bit block
+/// \tparam F4 function to process 4 128-bit blocks
+/// \tparam W word type of the subkey table
+/// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 and 1 Altivec SIMD
+///   words at a time.
+/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
+///   same word type.
+template <typename F1, typename F4, typename W>
+inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
+        const W *subKeys, size_t rounds, const byte *inBlocks,
+        const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    CRYPTOPP_ASSERT(subKeys);
+    CRYPTOPP_ASSERT(inBlocks);
+    CRYPTOPP_ASSERT(outBlocks);
+    CRYPTOPP_ASSERT(length >= 16);
+
+#if defined(CRYPTOPP_LITTLE_ENDIAN)
+    const uint32x4_p s_one = {1,0,0,0};
+#else
+    const uint32x4_p s_one = {0,0,0,1};
+#endif
+
+    const size_t blockSize = 16;
+    // const size_t vsxBlockSize = 16;
+
+    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
+    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
+    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
+
+    // Clang and Coverity are generating findings using xorBlocks as a flag.
+    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
+    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
+
+    if (flags & BT_ReverseDirection)
+    {
+        inBlocks = PtrAdd(inBlocks, length - blockSize);
+        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
+        outBlocks = PtrAdd(outBlocks, length - blockSize);
+        inIncrement = 0-inIncrement;
+        xorIncrement = 0-xorIncrement;
+        outIncrement = 0-outIncrement;
+    }
+
+    if (flags & BT_AllowParallel)
+    {
+        while (length >= 4*blockSize)
+        {
+            uint32x4_p block0, block1, block2, block3;
+
+            if (flags & BT_InBlockIsCounter)
+            {
+                block0 = VectorLoadBE(inBlocks);
+                block1 = VectorAdd(block0, s_one);
+                block2 = VectorAdd(block1, s_one);
+                block3 = VectorAdd(block2, s_one);
+
+                // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
+                // CTR_ModePolicy::OperateKeystream is wired such that after
+                // returning from this function CTR_ModePolicy will detect wrap
+                // on the last counter byte and increment the next to last byte.
+                // The problem is, with a big-endian load, inBlocks[15] is really
+                // located at index 15. The vector addition using a 32-bit element
+                // generates a carry into inBlocks[14], and then CTR_ModePolicy
+                // increments inBlocks[14] too.
+                //
+                // To find this bug we needed a test case with a ctr of 0xNN...FA.
+                // The last octet is 0xFA, and adding 6 creates the wrap that
+                // triggers the issue. If the last octet was 0xFC, then 4 would
+                // trigger it. We dumb-lucked into the test with SPECK-128. The
+                // test case of interest is the one with IV
+                // 348ECA9766C09F04 826520DE47A212FA.
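+                //
+                // The uint8x16_p casts below make the VectorAdd a byte-wise
+                // add, so the store back to inBlocks bumps only the low-order
+                // counter byte and cannot generate a carry into inBlocks[14].
+                // Carry propagation stays with CTR_ModePolicy, as described
+                // above.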
+                uint8x16_p temp = VectorAdd((uint8x16_p)block3, (uint8x16_p)s_one);
+                VectorStoreBE(temp, const_cast<byte*>(inBlocks));
+            }
+            else
+            {
+                block0 = VectorLoadBE(inBlocks);
+                inBlocks = PtrAdd(inBlocks, inIncrement);
+                block1 = VectorLoadBE(inBlocks);
+                inBlocks = PtrAdd(inBlocks, inIncrement);
+                block2 = VectorLoadBE(inBlocks);
+                inBlocks = PtrAdd(inBlocks, inIncrement);
+                block3 = VectorLoadBE(inBlocks);
+                inBlocks = PtrAdd(inBlocks, inIncrement);
+            }
+
+            if (xorInput)
+            {
+                block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
+                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
+                block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
+                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
+                block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
+                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
+                block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
+                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
+            }
+
+            func4(block0, block1, block2, block3, subKeys, rounds);
+
+            if (xorOutput)
+            {
+                block0 = VectorXor(block0, VectorLoadBE(xorBlocks));
+                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
+                block1 = VectorXor(block1, VectorLoadBE(xorBlocks));
+                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
+                block2 = VectorXor(block2, VectorLoadBE(xorBlocks));
+                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
+                block3 = VectorXor(block3, VectorLoadBE(xorBlocks));
+                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
+            }
+
+            VectorStoreBE(block0, outBlocks);
+            outBlocks = PtrAdd(outBlocks, outIncrement);
+            VectorStoreBE(block1, outBlocks);
+            outBlocks = PtrAdd(outBlocks, outIncrement);
+            VectorStoreBE(block2, outBlocks);
+            outBlocks = PtrAdd(outBlocks, outIncrement);
+            VectorStoreBE(block3, outBlocks);
+            outBlocks = PtrAdd(outBlocks, outIncrement);
+
+            length -= 4*blockSize;
+        }
+    }
+
+    while (length >= blockSize)
+    {
+        uint32x4_p block = VectorLoadBE(inBlocks);
+
+        if (xorInput)
+            block = VectorXor(block, VectorLoadBE(xorBlocks));
+
+        if (flags & BT_InBlockIsCounter)
+            const_cast<byte*>(inBlocks)[15]++;
+
+        func1(block, subKeys, rounds);
+
+        if (xorOutput)
+            block = VectorXor(block, VectorLoadBE(xorBlocks));
+
+        VectorStoreBE(block, outBlocks);
+
+        inBlocks = PtrAdd(inBlocks, inIncrement);
+        outBlocks = PtrAdd(outBlocks, outIncrement);
+        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
+        length -= blockSize;
+    }
+
+    return length;
+}
+
 /// \brief AdvancedProcessBlocks for 1 and 6 blocks
 /// \tparam F1 function to process 1 128-bit block
 /// \tparam F6 function to process 6 128-bit blocks
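For anyone wiring a cipher to the new template, a minimal sketch follows. It
assumes a word32-keyed cipher; the MyCipher_* kernel names and bodies are
hypothetical, but the signatures match how func1 and func4 are invoked in the
template above (the existing 6x1/6x2 entry points are forwarded the same way).

    #include "adv-simd.h"

    NAMESPACE_BEGIN(CryptoPP)

    // Hypothetical single-block kernel: transform one 128-bit vector in place.
    inline void MyCipher_Enc_Block(uint32x4_p &block,
        const word32 *subkeys, size_t rounds)
    {
        // ... cipher rounds over one SIMD word ...
    }

    // Hypothetical 4-block kernel: the same rounds interleaved across four
    // vectors to hide instruction latency.
    inline void MyCipher_Enc_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
        uint32x4_p &block2, uint32x4_p &block3,
        const word32 *subkeys, size_t rounds)
    {
        // ... cipher rounds over four SIMD words ...
    }

    // Forward the cipher's POWER8 entry point to the new template. F1, F4
    // and W are deduced from the arguments.
    size_t MyCipher_Enc_AdvancedProcessBlocks_POWER8(const word32 *subKeys,
        size_t rounds, const byte *inBlocks, const byte *xorBlocks,
        byte *outBlocks, size_t length, word32 flags)
    {
        return AdvancedProcessBlocks128_4x1_ALTIVEC(MyCipher_Enc_Block,
            MyCipher_Enc_4_Blocks, subKeys, rounds, inBlocks, xorBlocks,
            outBlocks, length, flags);
    }

    NAMESPACE_END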