mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-23 09:59:42 +00:00
Make XTS mode parallelizable (GH #891)
On CoffeeLake performance improved from 3.4 cpb to 1.75 cpb. On Core2Duo performance improved from 27 cpb to 19 cpb.
This commit is contained in:
parent
c9b8452d57
commit
8e8e95cea2
177
xts.cpp
177
xts.cpp
@ -1,18 +1,11 @@
|
||||
// xts.cpp - written and placed in the public domain by Jeffrey Walton
|
||||
//
|
||||
// The best performance is achieved on machines with AES hardware acceleration.
|
||||
// However, 64-bit machines without hardware acceleration profit the most with
|
||||
// separate calls to ProcessBlock followed by XorBuffer rather than a single
|
||||
// call to AdvancedProcessBlocks. That's because we did not parallelize, and
|
||||
// XorBuffer uses SSE2 and ASIMD when available. Parallelizing slowed things
|
||||
// down due to copying m_register for GF_Double. XorBuffer profits without
|
||||
// AESNI and friends since XorBuffer only uses load, store and xor.
|
||||
|
||||
#include "pch.h"
|
||||
|
||||
#include "xts.h"
|
||||
#include "misc.h"
|
||||
#include "modes.h"
|
||||
#include "cpu.h"
|
||||
|
||||
#if defined(CRYPTOPP_DEBUG)
|
||||
# include "aes.h"
|
||||
@ -40,41 +33,6 @@ ANONYMOUS_NAMESPACE_BEGIN
|
||||
|
||||
using namespace CryptoPP;
|
||||
|
||||
// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
|
||||
// base architecture. We can use the SIMD code below without an
|
||||
// architecture option. No runtime tests are required. Unfortunately,
|
||||
// we can't use it on Altivec because an architecture switch is required.
|
||||
// The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
|
||||
// 16-byte block sizes. count must be a multiple of 16 since SIMD words
|
||||
// are used.
|
||||
inline void XorBuffer(byte *buf, const byte *mask, size_t count)
|
||||
{
|
||||
CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
|
||||
CRYPTOPP_UNUSED(count);
|
||||
|
||||
#if defined(__SSE2__) || defined(_M_X64)
|
||||
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
|
||||
for (size_t i=0; i<count; i+=16)
|
||||
_mm_storeu_si128(M128_CAST(buf+i), _mm_xor_si128(
|
||||
_mm_loadu_si128(CONST_M128_CAST(mask+i)), _mm_loadu_si128(CONST_M128_CAST(buf+i))));
|
||||
#else
|
||||
_mm_storeu_si128(M128_CAST(buf), _mm_xor_si128(
|
||||
_mm_loadu_si128(CONST_M128_CAST(mask)), _mm_loadu_si128(CONST_M128_CAST(buf))));
|
||||
#endif
|
||||
|
||||
#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
|
||||
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
|
||||
for (size_t i=0; i<count; i+=16)
|
||||
vst1q_u8(buf+i, veorq_u8(vld1q_u8(mask+i), vld1q_u8(buf+i)));
|
||||
#else
|
||||
vst1q_u8(buf, veorq_u8(vld1q_u8(mask), vld1q_u8(buf)));
|
||||
#endif
|
||||
|
||||
#else
|
||||
xorbuf(buf, mask, count);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
|
||||
// base architecture. We can use the SIMD code below without an
|
||||
// architecture option. No runtime tests are required. Unfortunately,
|
||||
@ -85,50 +43,47 @@ inline void XorBuffer(byte *buf, const byte *mask, size_t count)
|
||||
inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
|
||||
{
|
||||
CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
|
||||
CRYPTOPP_UNUSED(count);
|
||||
|
||||
#if defined(__SSE2__) || defined(_M_X64)
|
||||
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
|
||||
for (size_t i=0; i<count; i+=16)
|
||||
_mm_storeu_si128(M128_CAST(output+i), _mm_xor_si128(
|
||||
_mm_loadu_si128(CONST_M128_CAST(input+i)), _mm_loadu_si128(CONST_M128_CAST(mask+i))));
|
||||
#else
|
||||
_mm_storeu_si128(M128_CAST(output), _mm_xor_si128(
|
||||
_mm_loadu_si128(CONST_M128_CAST(input)), _mm_loadu_si128(CONST_M128_CAST(mask))));
|
||||
#endif
|
||||
_mm_storeu_si128(M128_CAST(output+i),
|
||||
_mm_xor_si128(
|
||||
_mm_loadu_si128(CONST_M128_CAST(input+i)),
|
||||
_mm_loadu_si128(CONST_M128_CAST(mask+i))));
|
||||
|
||||
#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
|
||||
#if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
|
||||
for (size_t i=0; i<count; i+=16)
|
||||
vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
|
||||
#else
|
||||
vst1q_u8(output, veorq_u8(vld1q_u8(input), vld1q_u8(mask)));
|
||||
#endif
|
||||
|
||||
#else
|
||||
xorbuf(output, input, mask, count);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void XorBuffer(byte *buf, const byte *mask, size_t count)
|
||||
{
|
||||
XorBuffer(buf, buf, mask, count);
|
||||
}
|
||||
|
||||
// Borrowed from CMAC, but little-endian representation
|
||||
inline void GF_Double(byte *k, unsigned int len)
|
||||
inline void GF_Double(byte *out, const byte* in, unsigned int len)
|
||||
{
|
||||
#if defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
|
||||
word64 carry = 0, x;
|
||||
for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
|
||||
{
|
||||
x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, k+idx);
|
||||
x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
|
||||
word64 y = (x >> 63); x = (x << 1) + carry;
|
||||
PutWord<word64>(false, LITTLE_ENDIAN_ORDER, k+idx, x);
|
||||
PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
|
||||
carry = y;
|
||||
}
|
||||
#else
|
||||
word32 carry = 0, x;
|
||||
for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
|
||||
{
|
||||
x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, k+idx);
|
||||
x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
|
||||
word32 y = (x >> 31); x = (x << 1) + carry;
|
||||
PutWord<word32>(false, LITTLE_ENDIAN_ORDER, k+idx, x);
|
||||
PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
|
||||
carry = y;
|
||||
}
|
||||
#endif
|
||||
@ -139,6 +94,7 @@ inline void GF_Double(byte *k, unsigned int len)
|
||||
CRYPTOPP_ASSERT(len >= 16);
|
||||
CRYPTOPP_ASSERT(len <= 128);
|
||||
|
||||
byte* k = out;
|
||||
if (carry)
|
||||
{
|
||||
switch (len)
|
||||
@ -184,6 +140,7 @@ inline void GF_Double(byte *k, unsigned int len)
|
||||
#else
|
||||
CRYPTOPP_ASSERT(len == 16);
|
||||
|
||||
byte* k = out;
|
||||
if (carry)
|
||||
{
|
||||
k[0] ^= 0x87;
|
||||
@ -192,6 +149,11 @@ inline void GF_Double(byte *k, unsigned int len)
|
||||
#endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
|
||||
}
|
||||
|
||||
inline void GF_Double(byte *inout, unsigned int len)
|
||||
{
|
||||
GF_Double(inout, inout, len);
|
||||
}
|
||||
|
||||
#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
|
||||
|
||||
using CryptoPP::AES;
|
||||
@ -247,7 +209,8 @@ void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &
|
||||
void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
|
||||
{
|
||||
BlockOrientedCipherModeBase::Resynchronize(iv, ivLength);
|
||||
GetTweakCipher().ProcessBlock(m_register);
|
||||
std::memcpy(m_xregister, m_register, ivLength);
|
||||
GetTweakCipher().ProcessBlock(m_xregister);
|
||||
}
|
||||
|
||||
void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
|
||||
@ -257,37 +220,61 @@ void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
|
||||
std::memset(iv+8, 0x00, iv.size()-8);
|
||||
|
||||
BlockOrientedCipherModeBase::Resynchronize(iv, iv.size());
|
||||
GetTweakCipher().ProcessBlock(m_register);
|
||||
std::memcpy(m_xregister, iv, iv.size());
|
||||
GetTweakCipher().ProcessBlock(m_xregister);
|
||||
}
|
||||
|
||||
void XTS_ModeBase::ResizeBuffers()
|
||||
{
|
||||
BlockOrientedCipherModeBase::ResizeBuffers();
|
||||
m_workspace.New(GetBlockCipher().BlockSize());
|
||||
m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
|
||||
m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
|
||||
}
|
||||
|
||||
void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
|
||||
{
|
||||
const unsigned int blockSize = GetBlockCipher().BlockSize();
|
||||
const size_t parallelSize = blockSize*ParallelBlocks;
|
||||
size_t i = 0;
|
||||
|
||||
// data unit is a multiple of the cipher block size
|
||||
CRYPTOPP_ASSERT(length % blockSize == 0);
|
||||
|
||||
// now encrypt the data unit, AES_BLK_BYTES at a time
|
||||
for (size_t i=0; i<length; i+=blockSize)
|
||||
// encrypt the data unit, optimal size at a time
|
||||
for ( ; i+parallelSize<=length; i+=parallelSize)
|
||||
{
|
||||
// m_xregister[0] always points to the next tweak.
|
||||
GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
|
||||
GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
|
||||
GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
|
||||
|
||||
// merge the tweak into the input block
|
||||
XorBuffer(m_xworkspace, inString+i, m_xregister, parallelSize);
|
||||
|
||||
// encrypt the parallel blocks, merging the tweaks into the output blocks
|
||||
GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister, outString+i, parallelSize, BlockTransformation::BT_AllowParallel);
|
||||
|
||||
// Multiply T by alpha. m_xregister[0] always points to the next tweak.
|
||||
GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
|
||||
}
|
||||
|
||||
// encrypt the data unit, blocksize at a time
|
||||
for ( ; i<length; i+=blockSize)
|
||||
{
|
||||
// merge the tweak into the input block
|
||||
XorBuffer(m_workspace, inString+i, m_register, blockSize);
|
||||
XorBuffer(m_xworkspace, inString+i, m_xregister, blockSize);
|
||||
|
||||
// encrypt one block
|
||||
GetBlockCipher().ProcessBlock(m_workspace);
|
||||
GetBlockCipher().ProcessBlock(m_xworkspace);
|
||||
|
||||
// merge the tweak into the output block
|
||||
XorBuffer(outString+i, m_workspace, m_register, blockSize);
|
||||
XorBuffer(outString+i, m_xworkspace, m_xregister, blockSize);
|
||||
|
||||
// Multiply T by alpha
|
||||
GF_Double(m_register, m_register.size());
|
||||
GF_Double(m_xregister, blockSize);
|
||||
}
|
||||
|
||||
CRYPTOPP_ASSERT(i == length);
|
||||
}
|
||||
|
||||
size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
|
||||
@ -310,8 +297,8 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
|
||||
CRYPTOPP_ASSERT(outLength >= inLength);
|
||||
|
||||
const unsigned int blockSize = GetBlockCipher().BlockSize();
|
||||
const unsigned int blocks = inLength / blockSize;
|
||||
const unsigned int tail = inLength % blockSize;
|
||||
const size_t blocks = inLength / blockSize;
|
||||
const size_t tail = inLength % blockSize;
|
||||
outLength = inLength;
|
||||
|
||||
if (tail == 0)
|
||||
@ -327,22 +314,22 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
|
||||
ProcessData(outString, inString, inLength-head);
|
||||
|
||||
outString += head;
|
||||
inString += head; inLength -= head;
|
||||
inString += head; inLength -= head;
|
||||
}
|
||||
|
||||
///// handle the full block /////
|
||||
|
||||
// merge the tweak into the input block
|
||||
XorBuffer(m_workspace, inString, m_register, blockSize);
|
||||
XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
|
||||
|
||||
// encrypt one block
|
||||
GetBlockCipher().ProcessBlock(m_workspace);
|
||||
GetBlockCipher().ProcessBlock(m_xworkspace);
|
||||
|
||||
// merge the tweak into the output block
|
||||
XorBuffer(outString, m_workspace, m_register, blockSize);
|
||||
XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
|
||||
|
||||
// Multiply T by alpha
|
||||
GF_Double(m_register, m_register.size());
|
||||
GF_Double(m_xregister, blockSize);
|
||||
|
||||
///// handle final partial block /////
|
||||
|
||||
@ -351,20 +338,20 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
|
||||
const size_t len = inLength-blockSize;
|
||||
|
||||
// copy in the final plaintext bytes
|
||||
std::memcpy(m_workspace, inString, len);
|
||||
std::memcpy(m_xworkspace, inString, len);
|
||||
// and copy out the final ciphertext bytes
|
||||
std::memcpy(outString, outString-blockSize, len);
|
||||
// "steal" ciphertext to complete the block
|
||||
std::memcpy(m_workspace+len, outString-blockSize+len, blockSize-len);
|
||||
std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
|
||||
|
||||
// merge the tweak into the input block
|
||||
XorBuffer(m_workspace, m_register, blockSize);
|
||||
XorBuffer(m_xworkspace, m_xregister, blockSize);
|
||||
|
||||
// encrypt one block
|
||||
GetBlockCipher().ProcessBlock(m_workspace);
|
||||
GetBlockCipher().ProcessBlock(m_xworkspace);
|
||||
|
||||
// merge the tweak into the previous output block
|
||||
XorBuffer(outString-blockSize, m_workspace, m_register, blockSize);
|
||||
XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
|
||||
|
||||
return outLength;
|
||||
}
|
||||
@ -375,8 +362,8 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
|
||||
CRYPTOPP_ASSERT(outLength >= inLength);
|
||||
|
||||
const unsigned int blockSize = GetBlockCipher().BlockSize();
|
||||
const unsigned int blocks = inLength / blockSize;
|
||||
const unsigned int tail = inLength % blockSize;
|
||||
const size_t blocks = inLength / blockSize;
|
||||
const size_t tail = inLength % blockSize;
|
||||
outLength = inLength;
|
||||
|
||||
if (tail == 0)
|
||||
@ -392,12 +379,12 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
|
||||
ProcessData(outString, inString, inLength-head);
|
||||
|
||||
outString += head;
|
||||
inString += head; inLength -= head;
|
||||
inString += head; inLength -= head;
|
||||
}
|
||||
|
||||
SecByteBlock poly1(m_register);
|
||||
SecByteBlock poly2(m_register);
|
||||
GF_Double(poly2, poly2.size());
|
||||
#define poly1 (m_xregister+0*blockSize)
|
||||
#define poly2 (m_xregister+1*blockSize)
|
||||
GF_Double(poly2, poly1, blockSize);
|
||||
|
||||
///// handle final partial block /////
|
||||
|
||||
@ -406,20 +393,20 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
|
||||
const size_t len = inLength-blockSize;
|
||||
|
||||
// merge the tweak into the input block
|
||||
XorBuffer(m_workspace, inString-blockSize, poly2, blockSize);
|
||||
XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
|
||||
|
||||
// encrypt one block
|
||||
GetBlockCipher().ProcessBlock(m_workspace);
|
||||
GetBlockCipher().ProcessBlock(m_xworkspace);
|
||||
|
||||
// merge the tweak into the output block
|
||||
XorBuffer(m_workspace, poly2, blockSize);
|
||||
XorBuffer(m_xworkspace, poly2, blockSize);
|
||||
|
||||
// copy in the final plaintext bytes
|
||||
std::memcpy(outString-blockSize, inString, len);
|
||||
// and copy out the final ciphertext bytes
|
||||
std::memcpy(outString, m_workspace, len);
|
||||
std::memcpy(outString, m_xworkspace, len);
|
||||
// "steal" ciphertext to complete the block
|
||||
std::memcpy(outString-blockSize+len, m_workspace+len, blockSize-len);
|
||||
std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
|
||||
|
||||
///// handle the full previous block /////
|
||||
|
||||
@ -427,13 +414,13 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
|
||||
outString -= blockSize;
|
||||
|
||||
// merge the tweak into the input block
|
||||
XorBuffer(m_workspace, outString, poly1, blockSize);
|
||||
XorBuffer(m_xworkspace, outString, poly1, blockSize);
|
||||
|
||||
// encrypt one block
|
||||
GetBlockCipher().ProcessBlock(m_workspace);
|
||||
GetBlockCipher().ProcessBlock(m_xworkspace);
|
||||
|
||||
// merge the tweak into the output block
|
||||
XorBuffer(outString, m_workspace, poly1, blockSize);
|
||||
XorBuffer(outString, m_xworkspace, poly1, blockSize);
|
||||
|
||||
return outLength;
|
||||
}
|
||||
|
11
xts.h
11
xts.h
@ -49,6 +49,8 @@ NAMESPACE_BEGIN(CryptoPP)
|
||||
class CRYPTOPP_NO_VTABLE XTS_ModeBase : public BlockOrientedCipherModeBase
|
||||
{
|
||||
public:
|
||||
virtual ~XTS_ModeBase() {}
|
||||
|
||||
std::string AlgorithmName() const
|
||||
{return GetBlockCipher().AlgorithmName() + "/XTS";}
|
||||
std::string AlgorithmProvider() const
|
||||
@ -70,6 +72,8 @@ public:
|
||||
/// \return the block size of the cipher, in bytes
|
||||
unsigned int BlockSize() const
|
||||
{return GetBlockCipher().BlockSize();}
|
||||
unsigned int GetOptimalBlockSize() const
|
||||
{return GetBlockCipher().BlockSize()*ParallelBlocks;}
|
||||
unsigned int MinLastBlockSize() const
|
||||
{return GetBlockCipher().BlockSize()+1;}
|
||||
unsigned int OptimalDataAlignment() const
|
||||
@ -102,7 +106,10 @@ protected:
|
||||
const BlockCipher& GetTweakCipher() const
|
||||
{return const_cast<XTS_ModeBase*>(this)->AccessTweakCipher();}
|
||||
|
||||
SecByteBlock m_workspace;
|
||||
SecByteBlock m_xregister;
|
||||
SecByteBlock m_xworkspace;
|
||||
|
||||
enum {ParallelBlocks = 4};
|
||||
};
|
||||
|
||||
/// \brief XTS block cipher mode of operation implementation details
|
||||
@ -112,7 +119,7 @@ template <class CIPHER>
|
||||
class CRYPTOPP_NO_VTABLE XTS_Final : public XTS_ModeBase
|
||||
{
|
||||
public:
|
||||
static const char* CRYPTOPP_API StaticAlgorithmName()
|
||||
CRYPTOPP_STATIC_CONSTEXPR const char* StaticAlgorithmName()
|
||||
{return "XTS";}
|
||||
|
||||
protected:
|
||||
|
Loading…
Reference in New Issue
Block a user