Make XTS mode parallelizable (GH #891)

On CoffeeLake performance increased from 3.4 cpb to 1.75 cpb. On Core2Duo performance increased from 27 cpb to 19 cpb.
This commit is contained in:
Jeffrey Walton 2019-10-13 16:17:37 -04:00
parent c9b8452d57
commit 8e8e95cea2
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 91 additions and 97 deletions

177
xts.cpp
View File

@@ -1,18 +1,11 @@
// xts.cpp - written and placed in the public domain by Jeffrey Walton // xts.cpp - written and placed in the public domain by Jeffrey Walton
//
// The best performance is achieved on machines with AES hardware acceleration.
// However, 64-bit machines without hardware acceleration profit the most with
// separate calls to ProcessBlock followed by XorBuffer rather than a single
// call to AdvancedProcessBlocks. That's because we did not parallelize, and
// XorBuffer uses SSE2 and ASIMD when available. Parallelizing slowed things
// down due to copying m_register for GF_Double. XorBuffer profits without
// AESNI and friends since XorBuffer only uses load, store and xor.
#include "pch.h" #include "pch.h"
#include "xts.h" #include "xts.h"
#include "misc.h" #include "misc.h"
#include "modes.h" #include "modes.h"
#include "cpu.h"
#if defined(CRYPTOPP_DEBUG) #if defined(CRYPTOPP_DEBUG)
# include "aes.h" # include "aes.h"
@@ -40,41 +33,6 @@ ANONYMOUS_NAMESPACE_BEGIN
using namespace CryptoPP; using namespace CryptoPP;
// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
// base architecture. We can use the SIMD code below without an
// architecture option. No runtime tests are required. Unfortunately,
// we can't use it on Altivec because an architecture switch is required.
// The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
// 16-byte block sizes. count must be a multiple of 16 since SIMD words
// are used.
// In-place XOR helper: buf[i] ^= mask[i] for `count` bytes, processed a
// 16-byte SIMD word at a time. count must be a multiple of 16.
inline void XorBuffer(byte *buf, const byte *mask, size_t count)
{
CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
// count is unreferenced in the single-block (non-wide) builds below
CRYPTOPP_UNUSED(count);
#if defined(__SSE2__) || defined(_M_X64)
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
// Wide block ciphers enabled: loop over all 16-byte words with SSE2
for (size_t i=0; i<count; i+=16)
_mm_storeu_si128(M128_CAST(buf+i), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(mask+i)), _mm_loadu_si128(CONST_M128_CAST(buf+i))));
#else
// Block size is fixed at 16 bytes, so a single SSE2 xor suffices
_mm_storeu_si128(M128_CAST(buf), _mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(mask)), _mm_loadu_si128(CONST_M128_CAST(buf))));
#endif
#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
// Wide block ciphers enabled: loop over all 16-byte words with ASIMD
for (size_t i=0; i<count; i+=16)
vst1q_u8(buf+i, veorq_u8(vld1q_u8(mask+i), vld1q_u8(buf+i)));
#else
// Block size is fixed at 16 bytes, so a single ASIMD xor suffices
vst1q_u8(buf, veorq_u8(vld1q_u8(mask), vld1q_u8(buf)));
#endif
#else
// Portable fallback when no SIMD path is available
xorbuf(buf, mask, count);
#endif
}
// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the // Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
// base architecture. We can use the SIMD code below without an // base architecture. We can use the SIMD code below without an
// architecture option. No runtime tests are required. Unfortunately, // architecture option. No runtime tests are required. Unfortunately,
@@ -85,50 +43,47 @@ inline void XorBuffer(byte *buf, const byte *mask, size_t count)
inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count) inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
{ {
CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0)); CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
CRYPTOPP_UNUSED(count);
#if defined(__SSE2__) || defined(_M_X64) #if defined(__SSE2__) || defined(_M_X64)
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
for (size_t i=0; i<count; i+=16) for (size_t i=0; i<count; i+=16)
_mm_storeu_si128(M128_CAST(output+i), _mm_xor_si128( _mm_storeu_si128(M128_CAST(output+i),
_mm_loadu_si128(CONST_M128_CAST(input+i)), _mm_loadu_si128(CONST_M128_CAST(mask+i)))); _mm_xor_si128(
#else _mm_loadu_si128(CONST_M128_CAST(input+i)),
_mm_storeu_si128(M128_CAST(output), _mm_xor_si128( _mm_loadu_si128(CONST_M128_CAST(mask+i))));
_mm_loadu_si128(CONST_M128_CAST(input)), _mm_loadu_si128(CONST_M128_CAST(mask))));
#endif
#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64) #elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
for (size_t i=0; i<count; i+=16) for (size_t i=0; i<count; i+=16)
vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i))); vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
#else
vst1q_u8(output, veorq_u8(vld1q_u8(input), vld1q_u8(mask)));
#endif
#else #else
xorbuf(output, input, mask, count); xorbuf(output, input, mask, count);
#endif #endif
} }
// In-place overload: buf ^= mask for `count` bytes, forwarding to the
// three-buffer XorBuffer with buf as both output and input.
inline void XorBuffer(byte *buf, const byte *mask, size_t count)
{
XorBuffer(buf, buf, mask, count);
}
// Borrowed from CMAC, but little-endian representation // Borrowed from CMAC, but little-endian representation
inline void GF_Double(byte *k, unsigned int len) inline void GF_Double(byte *out, const byte* in, unsigned int len)
{ {
#if defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__) #if defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
word64 carry = 0, x; word64 carry = 0, x;
for (size_t i=0, idx=0; i<len/8; ++i, idx+=8) for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
{ {
x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, k+idx); x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
word64 y = (x >> 63); x = (x << 1) + carry; word64 y = (x >> 63); x = (x << 1) + carry;
PutWord<word64>(false, LITTLE_ENDIAN_ORDER, k+idx, x); PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
carry = y; carry = y;
} }
#else #else
word32 carry = 0, x; word32 carry = 0, x;
for (size_t i=0, idx=0; i<len/4; ++i, idx+=4) for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
{ {
x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, k+idx); x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
word32 y = (x >> 31); x = (x << 1) + carry; word32 y = (x >> 31); x = (x << 1) + carry;
PutWord<word32>(false, LITTLE_ENDIAN_ORDER, k+idx, x); PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
carry = y; carry = y;
} }
#endif #endif
@@ -139,6 +94,7 @@ inline void GF_Double(byte *k, unsigned int len)
CRYPTOPP_ASSERT(len >= 16); CRYPTOPP_ASSERT(len >= 16);
CRYPTOPP_ASSERT(len <= 128); CRYPTOPP_ASSERT(len <= 128);
byte* k = out;
if (carry) if (carry)
{ {
switch (len) switch (len)
@@ -184,6 +140,7 @@ inline void GF_Double(byte *k, unsigned int len)
#else #else
CRYPTOPP_ASSERT(len == 16); CRYPTOPP_ASSERT(len == 16);
byte* k = out;
if (carry) if (carry)
{ {
k[0] ^= 0x87; k[0] ^= 0x87;
@@ -192,6 +149,11 @@ inline void GF_Double(byte *k, unsigned int len)
#endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS #endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
} }
// In-place overload: doubles `inout` (the "multiply T by alpha" step of
// XTS tweak generation), forwarding to the two-buffer GF_Double with
// inout as both output and input.
inline void GF_Double(byte *inout, unsigned int len)
{
GF_Double(inout, inout, len);
}
#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING) #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
using CryptoPP::AES; using CryptoPP::AES;
@@ -247,7 +209,8 @@ void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &
void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength) void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
{ {
BlockOrientedCipherModeBase::Resynchronize(iv, ivLength); BlockOrientedCipherModeBase::Resynchronize(iv, ivLength);
GetTweakCipher().ProcessBlock(m_register); std::memcpy(m_xregister, m_register, ivLength);
GetTweakCipher().ProcessBlock(m_xregister);
} }
void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order) void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
@@ -257,37 +220,61 @@ void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
std::memset(iv+8, 0x00, iv.size()-8); std::memset(iv+8, 0x00, iv.size()-8);
BlockOrientedCipherModeBase::Resynchronize(iv, iv.size()); BlockOrientedCipherModeBase::Resynchronize(iv, iv.size());
GetTweakCipher().ProcessBlock(m_register); std::memcpy(m_xregister, iv, iv.size());
GetTweakCipher().ProcessBlock(m_xregister);
} }
void XTS_ModeBase::ResizeBuffers() void XTS_ModeBase::ResizeBuffers()
{ {
BlockOrientedCipherModeBase::ResizeBuffers(); BlockOrientedCipherModeBase::ResizeBuffers();
m_workspace.New(GetBlockCipher().BlockSize()); m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
} }
void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length) void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
{ {
const unsigned int blockSize = GetBlockCipher().BlockSize(); const unsigned int blockSize = GetBlockCipher().BlockSize();
const size_t parallelSize = blockSize*ParallelBlocks;
size_t i = 0;
// data unit is multiple of 16 bytes // data unit is multiple of 16 bytes
CRYPTOPP_ASSERT(length % blockSize == 0); CRYPTOPP_ASSERT(length % blockSize == 0);
// now encrypt the data unit, AES_BLK_BYTES at a time // encrypt the data unit, optimal size at a time
for (size_t i=0; i<length; i+=blockSize) for ( ; i+parallelSize<=length; i+=parallelSize)
{
// m_xregister[0] always points to the next tweak.
GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
// merge the tweak into the input block
XorBuffer(m_xworkspace, inString+i, m_xregister, parallelSize);
// encrypt one block, merge the tweak into the output block
GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister, outString+i, parallelSize, BlockTransformation::BT_AllowParallel);
// Multiply T by alpha. m_xregister[0] always points to the next tweak.
GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
}
// encrypt the data unit, blocksize at a time
for ( ; i<length; i+=blockSize)
{ {
// merge the tweak into the input block // merge the tweak into the input block
XorBuffer(m_workspace, inString+i, m_register, blockSize); XorBuffer(m_xworkspace, inString+i, m_xregister, blockSize);
// encrypt one block // encrypt one block
GetBlockCipher().ProcessBlock(m_workspace); GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block // merge the tweak into the output block
XorBuffer(outString+i, m_workspace, m_register, blockSize); XorBuffer(outString+i, m_xworkspace, m_xregister, blockSize);
// Multiply T by alpha // Multiply T by alpha
GF_Double(m_register, m_register.size()); GF_Double(m_xregister, blockSize);
} }
CRYPTOPP_ASSERT(i == length);
} }
size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength) size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
@@ -310,8 +297,8 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
CRYPTOPP_ASSERT(outLength >= inLength); CRYPTOPP_ASSERT(outLength >= inLength);
const unsigned int blockSize = GetBlockCipher().BlockSize(); const unsigned int blockSize = GetBlockCipher().BlockSize();
const unsigned int blocks = inLength / blockSize; const size_t blocks = inLength / blockSize;
const unsigned int tail = inLength % blockSize; const size_t tail = inLength % blockSize;
outLength = inLength; outLength = inLength;
if (tail == 0) if (tail == 0)
@@ -327,22 +314,22 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
ProcessData(outString, inString, inLength-head); ProcessData(outString, inString, inLength-head);
outString += head; outString += head;
inString += head; inLength -= head; inString += head; inLength -= head;
} }
///// handle the full block ///// ///// handle the full block /////
// merge the tweak into the input block // merge the tweak into the input block
XorBuffer(m_workspace, inString, m_register, blockSize); XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
// encrypt one block // encrypt one block
GetBlockCipher().ProcessBlock(m_workspace); GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block // merge the tweak into the output block
XorBuffer(outString, m_workspace, m_register, blockSize); XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
// Multiply T by alpha // Multiply T by alpha
GF_Double(m_register, m_register.size()); GF_Double(m_xregister, blockSize);
///// handle final partial block ///// ///// handle final partial block /////
@@ -351,20 +338,20 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
const size_t len = inLength-blockSize; const size_t len = inLength-blockSize;
// copy in the final plaintext bytes // copy in the final plaintext bytes
std::memcpy(m_workspace, inString, len); std::memcpy(m_xworkspace, inString, len);
// and copy out the final ciphertext bytes // and copy out the final ciphertext bytes
std::memcpy(outString, outString-blockSize, len); std::memcpy(outString, outString-blockSize, len);
// "steal" ciphertext to complete the block // "steal" ciphertext to complete the block
std::memcpy(m_workspace+len, outString-blockSize+len, blockSize-len); std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
// merge the tweak into the input block // merge the tweak into the input block
XorBuffer(m_workspace, m_register, blockSize); XorBuffer(m_xworkspace, m_xregister, blockSize);
// encrypt one block // encrypt one block
GetBlockCipher().ProcessBlock(m_workspace); GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the previous output block // merge the tweak into the previous output block
XorBuffer(outString-blockSize, m_workspace, m_register, blockSize); XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
return outLength; return outLength;
} }
@@ -375,8 +362,8 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
CRYPTOPP_ASSERT(outLength >= inLength); CRYPTOPP_ASSERT(outLength >= inLength);
const unsigned int blockSize = GetBlockCipher().BlockSize(); const unsigned int blockSize = GetBlockCipher().BlockSize();
const unsigned int blocks = inLength / blockSize; const size_t blocks = inLength / blockSize;
const unsigned int tail = inLength % blockSize; const size_t tail = inLength % blockSize;
outLength = inLength; outLength = inLength;
if (tail == 0) if (tail == 0)
@@ -392,12 +379,12 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
ProcessData(outString, inString, inLength-head); ProcessData(outString, inString, inLength-head);
outString += head; outString += head;
inString += head; inLength -= head; inString += head; inLength -= head;
} }
SecByteBlock poly1(m_register); #define poly1 (m_xregister+0*blockSize)
SecByteBlock poly2(m_register); #define poly2 (m_xregister+1*blockSize)
GF_Double(poly2, poly2.size()); GF_Double(poly2, poly1, blockSize);
///// handle final partial block ///// ///// handle final partial block /////
@@ -406,20 +393,20 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
const size_t len = inLength-blockSize; const size_t len = inLength-blockSize;
// merge the tweak into the input block // merge the tweak into the input block
XorBuffer(m_workspace, inString-blockSize, poly2, blockSize); XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
// encrypt one block // encrypt one block
GetBlockCipher().ProcessBlock(m_workspace); GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block // merge the tweak into the output block
XorBuffer(m_workspace, poly2, blockSize); XorBuffer(m_xworkspace, poly2, blockSize);
// copy in the final plaintext bytes // copy in the final plaintext bytes
std::memcpy(outString-blockSize, inString, len); std::memcpy(outString-blockSize, inString, len);
// and copy out the final ciphertext bytes // and copy out the final ciphertext bytes
std::memcpy(outString, m_workspace, len); std::memcpy(outString, m_xworkspace, len);
// "steal" ciphertext to complete the block // "steal" ciphertext to complete the block
std::memcpy(outString-blockSize+len, m_workspace+len, blockSize-len); std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
///// handle the full previous block ///// ///// handle the full previous block /////
@@ -427,13 +414,13 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
outString -= blockSize; outString -= blockSize;
// merge the tweak into the input block // merge the tweak into the input block
XorBuffer(m_workspace, outString, poly1, blockSize); XorBuffer(m_xworkspace, outString, poly1, blockSize);
// encrypt one block // encrypt one block
GetBlockCipher().ProcessBlock(m_workspace); GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block // merge the tweak into the output block
XorBuffer(outString, m_workspace, poly1, blockSize); XorBuffer(outString, m_xworkspace, poly1, blockSize);
return outLength; return outLength;
} }

11
xts.h
View File

@@ -49,6 +49,8 @@ NAMESPACE_BEGIN(CryptoPP)
class CRYPTOPP_NO_VTABLE XTS_ModeBase : public BlockOrientedCipherModeBase class CRYPTOPP_NO_VTABLE XTS_ModeBase : public BlockOrientedCipherModeBase
{ {
public: public:
virtual ~XTS_ModeBase() {}
std::string AlgorithmName() const std::string AlgorithmName() const
{return GetBlockCipher().AlgorithmName() + "/XTS";} {return GetBlockCipher().AlgorithmName() + "/XTS";}
std::string AlgorithmProvider() const std::string AlgorithmProvider() const
@@ -70,6 +72,8 @@ public:
/// \return the block size of the cipher, in bytes /// \return the block size of the cipher, in bytes
unsigned int BlockSize() const unsigned int BlockSize() const
{return GetBlockCipher().BlockSize();} {return GetBlockCipher().BlockSize();}
unsigned int GetOptimalBlockSize() const
{return GetBlockCipher().BlockSize()*ParallelBlocks;}
unsigned int MinLastBlockSize() const unsigned int MinLastBlockSize() const
{return GetBlockCipher().BlockSize()+1;} {return GetBlockCipher().BlockSize()+1;}
unsigned int OptimalDataAlignment() const unsigned int OptimalDataAlignment() const
@@ -102,7 +106,10 @@ protected:
const BlockCipher& GetTweakCipher() const const BlockCipher& GetTweakCipher() const
{return const_cast<XTS_ModeBase*>(this)->AccessTweakCipher();} {return const_cast<XTS_ModeBase*>(this)->AccessTweakCipher();}
SecByteBlock m_workspace; SecByteBlock m_xregister;
SecByteBlock m_xworkspace;
enum {ParallelBlocks = 4};
}; };
/// \brief XTS block cipher mode of operation implementation details /// \brief XTS block cipher mode of operation implementation details
@@ -112,7 +119,7 @@ template <class CIPHER>
class CRYPTOPP_NO_VTABLE XTS_Final : public XTS_ModeBase class CRYPTOPP_NO_VTABLE XTS_Final : public XTS_ModeBase
{ {
public: public:
static const char* CRYPTOPP_API StaticAlgorithmName() CRYPTOPP_STATIC_CONSTEXPR const char* StaticAlgorithmName()
{return "XTS";} {return "XTS";}
protected: protected: