Make XTS mode parallelizable (GH #891)

On CoffeeLake performance increased from 3.4 cpb to 1.75 cpb. On Core2Duo performance increased from 27 cpb to 19 cpb.
This commit is contained in:
Jeffrey Walton 2019-10-13 16:17:37 -04:00
parent c9b8452d57
commit 8e8e95cea2
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 91 additions and 97 deletions

177
xts.cpp
View File

@@ -1,18 +1,11 @@
// xts.cpp - written and placed in the public domain by Jeffrey Walton
//
// The best performance is achieved on machines with AES hardware acceleration.
// However, 64-bit machines without hardware acceleration profit the most with
// separate calls to ProcessBlock followed by XorBuffer rather than a single
// call to AdvancedProcessBlocks. That's because we did not parallelize, and
// XorBuffer uses SSE2 and ASIMD when available. Parallelizing slowed things
// down due to copying m_register for GF_Double. XorBuffer profits without
// AESNI and friends since XorBuffer only uses load, store and xor.
#include "pch.h"
#include "xts.h"
#include "misc.h"
#include "modes.h"
#include "cpu.h"
#if defined(CRYPTOPP_DEBUG)
# include "aes.h"
@@ -40,41 +33,6 @@ ANONYMOUS_NAMESPACE_BEGIN
using namespace CryptoPP;
// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
// base architecture. We can use the SIMD code below without an
// architecture option. No runtime tests are required. Unfortunately,
// we can't use it on Altivec because an architecture switch is required.
// The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
// 16-byte block sizes. count must be a multiple of 16 since SIMD words
// are used.
inline void XorBuffer(byte *buf, const byte *mask, size_t count)
{
CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
CRYPTOPP_UNUSED(count);
#if defined(__SSE2__) || defined(_M_X64)
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
for (size_t i=0; i<count; i+=16)
_mm_storeu_si128(M128_CAST(buf+i), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(mask+i)), _mm_loadu_si128(CONST_M128_CAST(buf+i))));
#else
_mm_storeu_si128(M128_CAST(buf), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(mask)), _mm_loadu_si128(CONST_M128_CAST(buf))));
#endif
#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
for (size_t i=0; i<count; i+=16)
vst1q_u8(buf+i, veorq_u8(vld1q_u8(mask+i), vld1q_u8(buf+i)));
#else
vst1q_u8(buf, veorq_u8(vld1q_u8(mask), vld1q_u8(buf)));
#endif
#else
xorbuf(buf, mask, count);
#endif
}
// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
// base architecture. We can use the SIMD code below without an
// architecture option. No runtime tests are required. Unfortunately,
@@ -85,50 +43,47 @@ inline void XorBuffer(byte *buf, const byte *mask, size_t count)
inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
{
CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
CRYPTOPP_UNUSED(count);
#if defined(__SSE2__) || defined(_M_X64)
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
for (size_t i=0; i<count; i+=16)
_mm_storeu_si128(M128_CAST(output+i), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(input+i)), _mm_loadu_si128(CONST_M128_CAST(mask+i))));
#else
_mm_storeu_si128(M128_CAST(output), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(input)), _mm_loadu_si128(CONST_M128_CAST(mask))));
#endif
_mm_storeu_si128(M128_CAST(output+i),
_mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(input+i)),
_mm_loadu_si128(CONST_M128_CAST(mask+i))));
#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
for (size_t i=0; i<count; i+=16)
vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
#else
vst1q_u8(output, veorq_u8(vld1q_u8(input), vld1q_u8(mask)));
#endif
#else
xorbuf(output, input, mask, count);
#endif
}
inline void XorBuffer(byte *buf, const byte *mask, size_t count)
{
XorBuffer(buf, buf, mask, count);
}
// Borrowed from CMAC, but little-endian representation
inline void GF_Double(byte *k, unsigned int len)
inline void GF_Double(byte *out, const byte* in, unsigned int len)
{
#if defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
word64 carry = 0, x;
for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
{
x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, k+idx);
x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
word64 y = (x >> 63); x = (x << 1) + carry;
PutWord<word64>(false, LITTLE_ENDIAN_ORDER, k+idx, x);
PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
carry = y;
}
#else
word32 carry = 0, x;
for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
{
x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, k+idx);
x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
word32 y = (x >> 31); x = (x << 1) + carry;
PutWord<word32>(false, LITTLE_ENDIAN_ORDER, k+idx, x);
PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
carry = y;
}
#endif
@@ -139,6 +94,7 @@ inline void GF_Double(byte *k, unsigned int len)
CRYPTOPP_ASSERT(len >= 16);
CRYPTOPP_ASSERT(len <= 128);
byte* k = out;
if (carry)
{
switch (len)
@@ -184,6 +140,7 @@ inline void GF_Double(byte *k, unsigned int len)
#else
CRYPTOPP_ASSERT(len == 16);
byte* k = out;
if (carry)
{
k[0] ^= 0x87;
@@ -192,6 +149,11 @@ inline void GF_Double(byte *k, unsigned int len)
#endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
}
inline void GF_Double(byte *inout, unsigned int len)
{
GF_Double(inout, inout, len);
}
#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
using CryptoPP::AES;
@@ -247,7 +209,8 @@ void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &
void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
{
BlockOrientedCipherModeBase::Resynchronize(iv, ivLength);
GetTweakCipher().ProcessBlock(m_register);
std::memcpy(m_xregister, m_register, ivLength);
GetTweakCipher().ProcessBlock(m_xregister);
}
void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
@@ -257,37 +220,61 @@ void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
std::memset(iv+8, 0x00, iv.size()-8);
BlockOrientedCipherModeBase::Resynchronize(iv, iv.size());
GetTweakCipher().ProcessBlock(m_register);
std::memcpy(m_xregister, iv, iv.size());
GetTweakCipher().ProcessBlock(m_xregister);
}
void XTS_ModeBase::ResizeBuffers()
{
BlockOrientedCipherModeBase::ResizeBuffers();
m_workspace.New(GetBlockCipher().BlockSize());
m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
}
void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
{
const unsigned int blockSize = GetBlockCipher().BlockSize();
const size_t parallelSize = blockSize*ParallelBlocks;
size_t i = 0;
// data unit is multiple of 16 bytes
CRYPTOPP_ASSERT(length % blockSize == 0);
// now encrypt the data unit, AES_BLK_BYTES at a time
for (size_t i=0; i<length; i+=blockSize)
// encrypt the data unit, optimal size at a time
for ( ; i+parallelSize<=length; i+=parallelSize)
{
// m_xregister[0] always points to the next tweak.
GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
// merge the tweak into the input block
XorBuffer(m_xworkspace, inString+i, m_xregister, parallelSize);
// encrypt one block, merge the tweak into the output block
GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister, outString+i, parallelSize, BlockTransformation::BT_AllowParallel);
// Multiply T by alpha. m_xregister[0] always points to the next tweak.
GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
}
// encrypt the data unit, blocksize at a time
for ( ; i<length; i+=blockSize)
{
// merge the tweak into the input block
XorBuffer(m_workspace, inString+i, m_register, blockSize);
XorBuffer(m_xworkspace, inString+i, m_xregister, blockSize);
// encrypt one block
GetBlockCipher().ProcessBlock(m_workspace);
GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block
XorBuffer(outString+i, m_workspace, m_register, blockSize);
XorBuffer(outString+i, m_xworkspace, m_xregister, blockSize);
// Multiply T by alpha
GF_Double(m_register, m_register.size());
GF_Double(m_xregister, blockSize);
}
CRYPTOPP_ASSERT(i == length);
}
size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
@@ -310,8 +297,8 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
CRYPTOPP_ASSERT(outLength >= inLength);
const unsigned int blockSize = GetBlockCipher().BlockSize();
const unsigned int blocks = inLength / blockSize;
const unsigned int tail = inLength % blockSize;
const size_t blocks = inLength / blockSize;
const size_t tail = inLength % blockSize;
outLength = inLength;
if (tail == 0)
@@ -327,22 +314,22 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
ProcessData(outString, inString, inLength-head);
outString += head;
inString += head; inLength -= head;
inString += head; inLength -= head;
}
///// handle the full block /////
// merge the tweak into the input block
XorBuffer(m_workspace, inString, m_register, blockSize);
XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
// encrypt one block
GetBlockCipher().ProcessBlock(m_workspace);
GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block
XorBuffer(outString, m_workspace, m_register, blockSize);
XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
// Multiply T by alpha
GF_Double(m_register, m_register.size());
GF_Double(m_xregister, blockSize);
///// handle final partial block /////
@@ -351,20 +338,20 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
const size_t len = inLength-blockSize;
// copy in the final plaintext bytes
std::memcpy(m_workspace, inString, len);
std::memcpy(m_xworkspace, inString, len);
// and copy out the final ciphertext bytes
std::memcpy(outString, outString-blockSize, len);
// "steal" ciphertext to complete the block
std::memcpy(m_workspace+len, outString-blockSize+len, blockSize-len);
std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
// merge the tweak into the input block
XorBuffer(m_workspace, m_register, blockSize);
XorBuffer(m_xworkspace, m_xregister, blockSize);
// encrypt one block
GetBlockCipher().ProcessBlock(m_workspace);
GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the previous output block
XorBuffer(outString-blockSize, m_workspace, m_register, blockSize);
XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
return outLength;
}
@@ -375,8 +362,8 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
CRYPTOPP_ASSERT(outLength >= inLength);
const unsigned int blockSize = GetBlockCipher().BlockSize();
const unsigned int blocks = inLength / blockSize;
const unsigned int tail = inLength % blockSize;
const size_t blocks = inLength / blockSize;
const size_t tail = inLength % blockSize;
outLength = inLength;
if (tail == 0)
@@ -392,12 +379,12 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
ProcessData(outString, inString, inLength-head);
outString += head;
inString += head; inLength -= head;
inString += head; inLength -= head;
}
SecByteBlock poly1(m_register);
SecByteBlock poly2(m_register);
GF_Double(poly2, poly2.size());
#define poly1 (m_xregister+0*blockSize)
#define poly2 (m_xregister+1*blockSize)
GF_Double(poly2, poly1, blockSize);
///// handle final partial block /////
@@ -406,20 +393,20 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
const size_t len = inLength-blockSize;
// merge the tweak into the input block
XorBuffer(m_workspace, inString-blockSize, poly2, blockSize);
XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
// encrypt one block
GetBlockCipher().ProcessBlock(m_workspace);
GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block
XorBuffer(m_workspace, poly2, blockSize);
XorBuffer(m_xworkspace, poly2, blockSize);
// copy in the final plaintext bytes
std::memcpy(outString-blockSize, inString, len);
// and copy out the final ciphertext bytes
std::memcpy(outString, m_workspace, len);
std::memcpy(outString, m_xworkspace, len);
// "steal" ciphertext to complete the block
std::memcpy(outString-blockSize+len, m_workspace+len, blockSize-len);
std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
///// handle the full previous block /////
@@ -427,13 +414,13 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
outString -= blockSize;
// merge the tweak into the input block
XorBuffer(m_workspace, outString, poly1, blockSize);
XorBuffer(m_xworkspace, outString, poly1, blockSize);
// encrypt one block
GetBlockCipher().ProcessBlock(m_workspace);
GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block
XorBuffer(outString, m_workspace, poly1, blockSize);
XorBuffer(outString, m_xworkspace, poly1, blockSize);
return outLength;
}

11
xts.h
View File

@@ -49,6 +49,8 @@ NAMESPACE_BEGIN(CryptoPP)
class CRYPTOPP_NO_VTABLE XTS_ModeBase : public BlockOrientedCipherModeBase
{
public:
virtual ~XTS_ModeBase() {}
std::string AlgorithmName() const
{return GetBlockCipher().AlgorithmName() + "/XTS";}
std::string AlgorithmProvider() const
@@ -70,6 +72,8 @@ public:
/// \return the block size of the cipher, in bytes
unsigned int BlockSize() const
{return GetBlockCipher().BlockSize();}
unsigned int GetOptimalBlockSize() const
{return GetBlockCipher().BlockSize()*ParallelBlocks;}
unsigned int MinLastBlockSize() const
{return GetBlockCipher().BlockSize()+1;}
unsigned int OptimalDataAlignment() const
@@ -102,7 +106,10 @@ protected:
const BlockCipher& GetTweakCipher() const
{return const_cast<XTS_ModeBase*>(this)->AccessTweakCipher();}
SecByteBlock m_workspace;
SecByteBlock m_xregister;
SecByteBlock m_xworkspace;
enum {ParallelBlocks = 4};
};
/// \brief XTS block cipher mode of operation implementation details
@@ -112,7 +119,7 @@ template <class CIPHER>
class CRYPTOPP_NO_VTABLE XTS_Final : public XTS_ModeBase
{
public:
static const char* CRYPTOPP_API StaticAlgorithmName()
CRYPTOPP_STATIC_CONSTEXPR const char* StaticAlgorithmName()
{return "XTS";}
protected: