Increase XTS parallel blocks on Aarch64 and PowerPC

This commit is contained in:
Jeffrey Walton 2020-04-11 16:20:25 -04:00
parent 80953e3fb5
commit ce532bdbc9
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 78 additions and 9 deletions

77
xts.cpp
View File

@ -248,37 +248,100 @@ void XTS_ModeBase::ResizeBuffers()
m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
}
// ProcessData processes the data in groups of 12-4-1 blocks, 8-2-1 blocks
// or 4-1 blocks. Which grouping is selected depends on ParallelBlocks in
// the header file. 12-4-1 or 8-2-1 can be used on Aarch64 and PowerPC.
// Intel should use 4-1 due to a lack of registers. The unneeded code paths
// should be removed by the optimizer. The extra gyrations save us 1.8 cpb
// on Aarch64 and 2.1 cpb on PowerPC.
void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
{
// data unit is multiple of 16 bytes
CRYPTOPP_ASSERT(length % BlockSize() == 0);
enum { lastParallelBlock = ParallelBlocks-1 };
const unsigned int blockSize = GetBlockCipher().BlockSize();
const size_t parallelSize = blockSize*ParallelBlocks;
// encrypt the data unit, optimal size at a time
while (length >= parallelSize)
{
// If this fires the GF_Double'ing below is not in sync
CRYPTOPP_ASSERT(ParallelBlocks == 4);
// m_xregister[0] always points to the next tweak.
GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
if (ParallelBlocks > 4)
{
GF_Double(m_xregister+4*blockSize, m_xregister+3*blockSize, blockSize);
GF_Double(m_xregister+5*blockSize, m_xregister+4*blockSize, blockSize);
GF_Double(m_xregister+6*blockSize, m_xregister+5*blockSize, blockSize);
GF_Double(m_xregister+7*blockSize, m_xregister+6*blockSize, blockSize);
}
if (ParallelBlocks > 8)
{
GF_Double(m_xregister+8*blockSize, m_xregister+7*blockSize, blockSize);
GF_Double(m_xregister+9*blockSize, m_xregister+8*blockSize, blockSize);
GF_Double(m_xregister+10*blockSize, m_xregister+9*blockSize, blockSize);
GF_Double(m_xregister+11*blockSize, m_xregister+10*blockSize, blockSize);
}
// merge the tweak into the input block
XorBuffer(m_xworkspace, inString, m_xregister, parallelSize);
// encrypt ParallelBlocks blocks at once, merge the tweaks into the output blocks
GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
outString, parallelSize, BlockTransformation::BT_AllowParallel);
// m_xregister[0] always points to the next tweak.
GF_Double(m_xregister+0, m_xregister+lastParallelBlock*blockSize, blockSize);
inString += parallelSize;
outString += parallelSize;
length -= parallelSize;
}
// encrypt the data unit, 4 blocks at a time
while (ParallelBlocks == 12 && length >= blockSize*4)
{
// m_xregister[0] always points to the next tweak.
GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
// merge the tweak into the input block
XorBuffer(m_xworkspace, inString, m_xregister, parallelSize);
XorBuffer(m_xworkspace, inString, m_xregister, blockSize*4);
// encrypt four blocks at once, merge the tweaks into the output blocks
GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister, outString, parallelSize, BlockTransformation::BT_AllowParallel);
GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
outString, blockSize*4, BlockTransformation::BT_AllowParallel);
// m_xregister[0] always points to the next tweak.
GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
inString += parallelSize;
outString += parallelSize;
length -= parallelSize;
inString += blockSize*4;
outString += blockSize*4;
length -= blockSize*4;
}
// encrypt the data unit, 2 blocks at a time
while (ParallelBlocks == 8 && length >= blockSize*2)
{
// m_xregister[0] always points to the next tweak.
GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
// merge the tweak into the input block
XorBuffer(m_xworkspace, inString, m_xregister, blockSize*2);
// encrypt two blocks at once, merge the tweaks into the output blocks
GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
outString, blockSize*2, BlockTransformation::BT_AllowParallel);
// m_xregister[0] always points to the next tweak.
GF_Double(m_xregister+0, m_xregister+1*blockSize, blockSize);
inString += blockSize*2;
outString += blockSize*2;
length -= blockSize*2;
}
// encrypt the data unit, blocksize at a time

10
xts.h
View File

@ -134,10 +134,16 @@ protected:
{return const_cast<XTS_ModeBase*>(this)->AccessTweakCipher();}
// Buffers are sized based on ParallelBlocks
SecByteBlock m_xregister;
SecByteBlock m_xworkspace;
AlignedSecByteBlock m_xregister;
AlignedSecByteBlock m_xworkspace;
// Intel lacks the SSE registers to run 8 or 12 parallel blocks.
// Do not change this value after compiling the library; changing it
// afterwards has no effect.
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
enum {ParallelBlocks = 4};
#else
enum {ParallelBlocks = 12};
#endif
};
/// \brief XTS block cipher mode of operation implementation