SSE2 optimizations

This commit is contained in:
weidai 2007-04-15 22:54:31 +00:00
parent 20833349d1
commit bbbd09553b
6 changed files with 771 additions and 194 deletions

View File

@ -3,37 +3,296 @@
#include "pch.h"
#include "panama.h"
#include "misc.h"
#include "cpu.h"
NAMESPACE_BEGIN(CryptoPP)
// Zeroes the entire Panama state block. When SSSE3 assembly support is
// compiled in, word 17 of the state is then set to the result of HasSSSE3().
// NOTE(review): the m_bstart assignment and the first memset look like
// pre-change lines that were flattened together with the post-change memset —
// confirm which lines belong in the final file.
template <class B>
void Panama<B>::Reset()
{
m_bstart = 0;
memset(m_state, 0, m_state.size()*4);
memset(m_state, 0, m_state.SizeInBytes());
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
m_state[17] = HasSSSE3();
#endif
}
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
// Runs Panama iterations on `state` using x86 SSE2 inline assembly. Each pass
// of the loop performs the steps labelled below: gamma/pi, optional keystream
// output, buffer update, theta and sigma. An SSSE3 (palignr) variant of the
// word-shifting steps is selected at run time when bit 0 of ebx is set.
//   count - iteration count; the routine jumps straight to the end when
//           count << 5 is zero (e.g. count == 0)
//   state - state block: bytes 0..63 are loaded into xmm0-xmm3, word 16 into
//           eax and word 17 into ebx; all of them are stored back on exit
//   z     - keystream destination, 32 bytes per iteration; output is skipped
//           entirely when z is NULL
//   y     - optional input XORed into the keystream before it is stored,
//           32 bytes per iteration; skipped when NULL
// z and y are each tested against 0xf: movdqa is used when the low four
// address bits are clear, movdqu otherwise. Accesses to `state` always use
// movdqa.
// NOTE(review): in the GCC variant ecx, edx and edi are written by the asm
// body but are declared only as input operands, and no xmm register appears
// in the clobber list — confirm this is acceptable for the compilers in use.
void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
{
#ifdef __GNUC__
__asm__ __volatile__
(
".intel_syntax noprefix;"
AS1( push ebx)
#else
AS2( mov ecx, count)
AS2( mov esi, state)
AS2( mov edi, z)
AS2( mov edx, y)
#endif
// ecx = count * 32; skip everything if that is zero
AS2( shl ecx, 5)
ASJ( jz, 5, f)
// ebx = state[17] (running offset, advanced by 32 per iteration);
// ecx = value of ebx at which the loop stops, kept on the stack at [esp]
AS2( mov ebx, [esi+4*17])
AS2( add ecx, ebx)
AS1( push ebp)
AS1( push ecx)
AS2( movdqa xmm0, [esi+0*16])
AS2( movdqa xmm1, [esi+1*16])
AS2( movdqa xmm2, [esi+2*16])
AS2( movdqa xmm3, [esi+3*16])
AS2( mov eax, [esi+4*16])
ASL(4)
// gamma and pi
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test ebx, 1)
ASJ( jnz, 6, f)
#endif
AS2( movdqa xmm6, xmm2)
AS2( movss xmm6, xmm3)
ASS( pshufd xmm5, xmm6, 0, 3, 2, 1)
AS2( movd xmm6, eax)
AS2( movdqa xmm7, xmm3)
AS2( movss xmm7, xmm6)
ASS( pshufd xmm6, xmm7, 0, 3, 2, 1)
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
ASJ( jmp, 7, f)
ASL(6)
AS2( movdqa xmm5, xmm3)
AS3( palignr xmm5, xmm2, 4)
AS2( movd xmm6, eax)
AS3( palignr xmm6, xmm3, 4)
ASL(7)
#endif
AS2( movd ecx, xmm2)
AS1( not ecx)
AS2( movd ebp, xmm3)
AS2( or ecx, ebp)
AS2( xor eax, ecx)
// SSE2_Index maps a logical word index to its slot in the reordered state
#define SSE2_Index(i) ASM_MOD(((i)*13+16), 17)
// pi(i): rotate the low dword of xmm7 and store it to its destination slot
#define pi(i) \
AS2( movd ecx, xmm7)\
AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
AS2( mov [esi+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
// pi4: xmm7 = (~x | y) ^ z, then apply pi to each of its four dwords
#define pi4(x, y, z, a, b, c, d) \
AS2( pcmpeqb xmm7, xmm7)\
AS2( pxor xmm7, x)\
AS2( por xmm7, y)\
AS2( pxor xmm7, z)\
pi(a)\
ASS( pshuflw xmm7, xmm7, 1, 0, 3, 2)\
pi(b)\
AS2( punpckhqdq xmm7, xmm7)\
pi(c)\
ASS( pshuflw xmm7, xmm7, 1, 0, 3, 2)\
pi(d)
pi4(xmm1, xmm2, xmm3, 1, 5, 9, 13)
pi4(xmm0, xmm1, xmm2, 2, 6, 10, 14)
pi4(xmm6, xmm0, xmm1, 3, 7, 11, 15)
pi4(xmm5, xmm6, xmm0, 4, 8, 12, 16)
// output keystream and update buffer here to hide partial memory stalls between pi and theta
AS2( movdqa xmm4, xmm3)
AS2( punpcklqdq xmm3, xmm2) // 1 5 2 6
AS2( punpckhdq xmm4, xmm2) // 9 10 13 14
AS2( movdqa xmm2, xmm1)
AS2( punpcklqdq xmm1, xmm0) // 3 7 4 8
AS2( punpckhdq xmm2, xmm0) // 11 12 15 16
// keystream
// no output pointer: skip straight to the buffer update
AS2( test edi, edi)
ASJ( jz, 0, f)
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm2)
AS2( punpckhqdq xmm6, xmm2)
// XOR with the input: aligned path, NULL input (no XOR), or unaligned path
AS2( test edx, 0xf)
ASJ( jnz, 2, f)
AS2( test edx, edx)
ASJ( jz, 1, f)
AS2( pxor xmm4, [edx])
AS2( pxor xmm6, [edx+16])
AS2( add edx, 32)
ASJ( jmp, 1, f)
ASL(2)
AS2( movdqu xmm0, [edx])
AS2( movdqu xmm2, [edx+16])
AS2( pxor xmm4, xmm0)
AS2( pxor xmm6, xmm2)
AS2( add edx, 32)
ASL(1)
// store 32 bytes of output: aligned or unaligned stores
AS2( test edi, 0xf)
ASJ( jnz, 3, f)
AS2( movdqa [edi], xmm4)
AS2( movdqa [edi+16], xmm6)
AS2( add edi, 32)
ASJ( jmp, 0, f)
ASL(3)
AS2( movdqu [edi], xmm4)
AS2( movdqu [edi+16], xmm6)
AS2( add edi, 32)
ASL(0)
// buffer update
AS2( lea ecx, [ebx + 32])
AS2( and ecx, 31*32)
AS2( lea ebp, [ebx + (32-24)*32])
AS2( and ebp, 31*32)
AS2( movdqa xmm0, [esi+20*4+ecx+0*8])
AS2( pxor xmm3, xmm0)
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
AS2( movdqa [esi+20*4+ecx+0*8], xmm3)
AS2( pxor xmm0, [esi+20*4+ebp+2*8])
AS2( movdqa [esi+20*4+ebp+2*8], xmm0)
AS2( movdqa xmm4, [esi+20*4+ecx+2*8])
AS2( pxor xmm1, xmm4)
AS2( movdqa [esi+20*4+ecx+2*8], xmm1)
AS2( pxor xmm4, [esi+20*4+ebp+0*8])
AS2( movdqa [esi+20*4+ebp+0*8], xmm4)
// theta
// reload the words that the pi macro wrote back to memory
AS2( movdqa xmm3, [esi+3*16])
AS2( movdqa xmm2, [esi+2*16])
AS2( movdqa xmm1, [esi+1*16])
AS2( movdqa xmm0, [esi+0*16])
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test ebx, 1)
ASJ( jnz, 8, f)
#endif
AS2( movd xmm6, eax)
AS2( movdqa xmm7, xmm3)
AS2( movss xmm7, xmm6)
AS2( movdqa xmm6, xmm2)
AS2( movss xmm6, xmm3)
AS2( movdqa xmm5, xmm1)
AS2( movss xmm5, xmm2)
AS2( movdqa xmm4, xmm0)
AS2( movss xmm4, xmm1)
ASS( pshufd xmm7, xmm7, 0, 3, 2, 1)
ASS( pshufd xmm6, xmm6, 0, 3, 2, 1)
ASS( pshufd xmm5, xmm5, 0, 3, 2, 1)
ASS( pshufd xmm4, xmm4, 0, 3, 2, 1)
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
ASJ( jmp, 9, f)
ASL(8)
AS2( movd xmm7, eax)
AS3( palignr xmm7, xmm3, 4)
AS2( movq xmm6, xmm3)
AS3( palignr xmm6, xmm2, 4)
AS2( movq xmm5, xmm2)
AS3( palignr xmm5, xmm1, 4)
AS2( movq xmm4, xmm1)
AS3( palignr xmm4, xmm0, 4)
ASL(9)
#endif
AS2( xor eax, 1)
AS2( movd ecx, xmm0)
AS2( xor eax, ecx)
AS2( movd ecx, xmm3)
AS2( xor eax, ecx)
AS2( pxor xmm3, xmm2)
AS2( pxor xmm2, xmm1)
AS2( pxor xmm1, xmm0)
AS2( pxor xmm0, xmm7)
AS2( pxor xmm3, xmm7)
AS2( pxor xmm2, xmm6)
AS2( pxor xmm1, xmm5)
AS2( pxor xmm0, xmm4)
// sigma
AS2( lea ecx, [ebx + (32-4)*32])
AS2( and ecx, 31*32)
AS2( lea ebp, [ebx + 16*32])
AS2( and ebp, 31*32)
AS2( movdqa xmm4, [esi+20*4+ecx+0*16])
AS2( movdqa xmm5, [esi+20*4+ebp+0*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
AS2( pxor xmm3, xmm4)
AS2( pxor xmm2, xmm6)
AS2( movdqa xmm4, [esi+20*4+ecx+1*16])
AS2( movdqa xmm5, [esi+20*4+ebp+1*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
AS2( pxor xmm1, xmm4)
AS2( pxor xmm0, xmm6)
// loop
// advance the offset by 32 and repeat until it equals the stop value at [esp]
AS2( add ebx, 32)
AS2( cmp ebx, [esp])
ASJ( jne, 4, b)
// save state
// restore ebp and drop the two words pushed before the loop
AS2( mov ebp, [esp+4])
AS2( add esp, 8)
AS2( mov [esi+4*17], ebx)
AS2( mov [esi+4*16], eax)
AS2( movdqa [esi+3*16], xmm3)
AS2( movdqa [esi+2*16], xmm2)
AS2( movdqa [esi+1*16], xmm1)
AS2( movdqa [esi+0*16], xmm0)
ASL(5)
#ifdef __GNUC__
AS1( pop ebx)
".att_syntax prefix;"
:
: "c" (count), "S" (state), "D" (z), "d" (y)
: "%eax", "memory", "cc"
);
#endif
}
#endif
template <class B>
void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y)
{
unsigned int bstart = m_bstart;
word32 *const a = m_state;
#define c (a+17)
#define b ((Stage *)(a+34))
word32 bstart = m_state[17];
word32 *const aPtr = m_state;
word32 cPtr[17];
#define bPtr ((byte *)(aPtr+20))
// reorder the state for SSE2
// a and c: 4 8 12 16 | 3 7 11 15 | 2 6 10 14 | 1 5 9 13 | 0
// xmm0 xmm1 xmm2 xmm3 eax
#define a(i) aPtr[((i)*13+16) % 17] // 13 is inverse of 4 mod 17
#define c(i) cPtr[((i)*13+16) % 17]
// b: 0 4 | 1 5 | 2 6 | 3 7
#define b(i, j) b##i[(j)*2%8 + (j)/4]
// output
#define OA(i) z[i] = ConditionalByteReverse(B::ToEnum(), a[i+9])
#define OX(i) z[i] = y[i] ^ ConditionalByteReverse(B::ToEnum(), a[i+9])
#define OA(i) z[i] = ConditionalByteReverse(B::ToEnum(), a(i+9))
#define OX(i) z[i] = y[i] ^ ConditionalByteReverse(B::ToEnum(), a(i+9))
// buffer update
#define US(i) {word32 t=b0[i]; b0[i]=ConditionalByteReverse(B::ToEnum(), p[i])^t; b25[(i+6)%8]^=t;}
#define UL(i) {word32 t=b0[i]; b0[i]=a[i+1]^t; b25[(i+6)%8]^=t;}
#define US(i) {word32 t=b(0,i); b(0,i)=ConditionalByteReverse(B::ToEnum(), p[i])^t; b(25,(i+6)%8)^=t;}
#define UL(i) {word32 t=b(0,i); b(0,i)=a(i+1)^t; b(25,(i+6)%8)^=t;}
// gamma and pi
#define GP(i) c[5*i%17] = rotlFixed(a[i] ^ (a[(i+1)%17] | ~a[(i+2)%17]), ((5*i%17)*((5*i%17)+1)/2)%32)
#define GP(i) c(5*i%17) = rotlFixed(a(i) ^ (a((i+1)%17) | ~a((i+2)%17)), ((5*i%17)*((5*i%17)+1)/2)%32)
// theta and sigma
#define T(i,x) a[i] = c[i] ^ c[(i+1)%17] ^ c[(i+4)%17] ^ x
#define T(i,x) a(i) = c(i) ^ c((i+1)%17) ^ c((i+4)%17) ^ x
#define TS1S(i) T(i+1, ConditionalByteReverse(B::ToEnum(), p[i]))
#define TS1L(i) T(i+1, b4[i])
#define TS2(i) T(i+9, b16[i])
#define TS1L(i) T(i+1, b(4,i))
#define TS2(i) T(i+9, b(16,i))
while (count--)
{
@ -51,12 +310,11 @@ void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *
z += 8;
}
word32 *const b16 = b[(bstart+16) % STAGES];
word32 *const b4 = b[(bstart+4) % STAGES];
bstart = (bstart + STAGES - 1) % STAGES;
word32 *const b0 = b[bstart];
word32 *const b25 = b[(bstart+25) % STAGES];
word32 *const b16 = (word32 *)(bPtr+((bstart+16*32) & 31*32));
word32 *const b4 = (word32 *)(bPtr+((bstart+(32-4)*32) & 31*32));
bstart += 32;
word32 *const b0 = (word32 *)(bPtr+((bstart) & 31*32));
word32 *const b25 = (word32 *)(bPtr+((bstart+(32-25)*32) & 31*32));
if (p)
{
@ -67,8 +325,23 @@ void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *
UL(0); UL(1); UL(2); UL(3); UL(4); UL(5); UL(6); UL(7);
}
GP(0); GP(1); GP(2); GP(3); GP(4); GP(5); GP(6); GP(7);
GP(8); GP(9); GP(10); GP(11); GP(12); GP(13); GP(14); GP(15); GP(16);
GP(0);
GP(1);
GP(2);
GP(3);
GP(4);
GP(5);
GP(6);
GP(7);
GP(8);
GP(9);
GP(10);
GP(11);
GP(12);
GP(13);
GP(14);
GP(15);
GP(16);
T(0,1);
@ -84,18 +357,18 @@ void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *
TS2(0); TS2(1); TS2(2); TS2(3); TS2(4); TS2(5); TS2(6); TS2(7);
}
m_bstart = bstart;
m_state[17] = bstart;
}
// Passes length / BLOCKSIZE whole blocks from `input` to Iterate() and returns
// the number of bytes left unprocessed (length % BLOCKSIZE).
// NOTE(review): two definition header lines appear below (with and without
// the Weak:: qualifier); they look like the old and new versions flattened
// together — confirm which one belongs in the final file.
template <class B>
size_t PanamaHash<B>::HashMultipleBlocks(const word32 *input, size_t length)
size_t Weak::PanamaHash<B>::HashMultipleBlocks(const word32 *input, size_t length)
{
this->Iterate(length / this->BLOCKSIZE, input);
return length % this->BLOCKSIZE;
}
template <class B>
void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size)
void Weak::PanamaHash<B>::TruncatedFinal(byte *hash, size_t size)
{
this->ThrowIfInvalidTruncatedSize(size);
@ -105,8 +378,10 @@ void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size)
this->Iterate(32); // pull
ConditionalByteReverse(B::ToEnum(), this->m_state+9, this->m_state+9, DIGESTSIZE);
memcpy(hash, this->m_state+9, size);
FixedSizeSecBlock<word32, 8> buf;
this->Iterate(1, NULL, buf, NULL);
memcpy(hash, buf, size);
this->Restart(); // reinit for next use
}
@ -114,31 +389,64 @@ void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size)
// Installs the cipher key.
// NOTE(review): this body appears to hold two flattened versions. One resets
// the state, absorbs key[0..31] and (for a 64-byte key) key[32..63] through
// Iterate(), then runs Iterate(32). The other asserts length == 32 and only
// copies the 32 key bytes into m_key. Confirm which lines belong in the
// final file.
template <class B>
void PanamaCipherPolicy<B>::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
{
FixedSizeSecBlock<word32, 8> buf;
this->Reset();
memcpy(buf, key, 32);
this->Iterate(1, buf);
if (length == 64)
memcpy(buf, key+32, 32);
else
memset(buf, 0, 32);
this->Iterate(1, buf);
this->Iterate(32);
// only 32-byte keys are accepted; the key is kept for later use
assert(length==32);
memcpy(m_key, key, 32);
}
// Restarts the generator for a new IV: clears the state, absorbs the stored
// key block, absorbs one 32-byte IV block (all zeros when iv is NULL), then
// runs 32 further iterations with no input and no output.
template <class B>
void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
{
	// fresh state, then the key
	this->Reset();
	this->Iterate(1, m_key);

	// the IV is read in place only when it is present and word32-aligned;
	// otherwise it is staged (or zero-filled) in a local block first
	const bool readIvInPlace = iv && IsAligned<word32>(iv);
	if (readIvInPlace)
	{
		this->Iterate(1, (const word32 *)iv);
	}
	else
	{
		FixedSizeSecBlock<word32, 8> ivBlock;
		if (!iv)
			memset(ivBlock, 0, 32);
		else
			memcpy(ivBlock, iv, 32);
		this->Iterate(1, ivBlock);
	}

	// 32 blank iterations, via the SSE2 routine when it applies
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
	if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
	{
		Panama_SSE2_Pull(32, this->m_state, NULL, NULL);
		return;
	}
#endif
	this->Iterate(32);
}
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
// Reports the data alignment this policy asks for: 16 when the SSE2 assembly
// path is compiled in, B is little-endian and HasSSE2() is true; 1 otherwise.
template <class B>
unsigned int PanamaCipherPolicy<B>::GetAlignment() const
{
	unsigned int alignment = 1;
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
	if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
		alignment = 16;
#endif
	return alignment;
}
#endif
// Produces iterationCount blocks of keystream into `output`, passing `input`
// through to the worker routine. Uses Panama_SSE2_Pull when the SSE2 assembly
// is compiled in, B is little-endian and HasSSE2() is true; otherwise
// Iterate().
// NOTE(review): the unconditional Iterate() call at the top of the body looks
// like a pre-change line flattened together with the new dispatch below it —
// confirm which lines belong in the final file.
template <class B>
void PanamaCipherPolicy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{
this->Iterate(iterationCount, NULL, (word32 *)output, (const word32 *)input);
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
Panama_SSE2_Pull(iterationCount, this->m_state, (word32 *)output, (const word32 *)input);
else
#endif
this->Iterate(iterationCount, NULL, (word32 *)output, (const word32 *)input);
}
// Explicit instantiations of the Panama templates for both byte orders.
// NOTE(review): PanamaHash is listed both with and without the Weak::
// qualifier; these look like old and new lines flattened together — confirm.
template class Panama<BigEndian>;
template class Panama<LittleEndian>;
template class PanamaHash<BigEndian>;
template class PanamaHash<LittleEndian>;
template class Weak::PanamaHash<BigEndian>;
template class Weak::PanamaHash<LittleEndian>;
template class PanamaCipherPolicy<BigEndian>;
template class PanamaCipherPolicy<LittleEndian>;

View File

@ -1,8 +1,6 @@
#ifndef CRYPTOPP_PANAMA_H
#define CRYPTOPP_PANAMA_H
#include "seckey.h"
#include "secblock.h"
#include "strciphr.h"
#include "iterhash.h"
@ -20,10 +18,10 @@ protected:
typedef word32 Stage[8];
CRYPTOPP_CONSTANT(STAGES = 32)
FixedSizeSecBlock<word32, 17*2 + 32*sizeof(Stage)> m_state;
unsigned int m_bstart;
FixedSizeAlignedSecBlock<word32, 20 + 8*32> m_state;
};
namespace Weak {
/// <a href="http://www.weidai.com/scan-mirror/md.html#Panama">Panama Hash</a>
template <class B = LittleEndian>
class PanamaHash : protected Panama<B>, public AlgorithmImpl<IteratedHash<word32, NativeByteOrder, 32>, PanamaHash<B> >
@ -39,7 +37,9 @@ protected:
void Init() {Panama<B>::Reset();}
void HashEndianCorrectedBlock(const word32 *data) {this->Iterate(1, data);} // push
size_t HashMultipleBlocks(const word32 *input, size_t length);
word32* StateBuf() {return NULL;}
};
}
//! MAC construction using a hermetic hash function
template <class T_Hash, class T_Info = T_Hash>
@ -94,6 +94,7 @@ protected:
SecByteBlock m_key;
};
namespace Weak {
/// Panama MAC
template <class B = LittleEndian>
class PanamaMAC : public HermeticHashFunctionMAC<PanamaHash<B> >
@ -103,10 +104,11 @@ public:
PanamaMAC(const byte *key, unsigned int length)
{this->SetKey(key, length);}
};
}
//! algorithm info: key/IV traits (inherited from the base class) and the algorithm name for the Panama cipher
//! NOTE(review): two base-class lines appear below (VariableKeyLength and FixedKeyLength); they look like the old and new versions flattened together — confirm which one belongs in the final file.
template <class B>
struct PanamaCipherInfo : public VariableKeyLength<32, 32, 64, 32, SimpleKeyingInterface::NOT_RESYNCHRONIZABLE>
struct PanamaCipherInfo : public FixedKeyLength<32, SimpleKeyingInterface::UNIQUE_IV, 32>
{
//! returns "Panama-BE" when B is big-endian, "Panama-LE" otherwise
static const char * StaticAlgorithmName() {return B::ToEnum() == BIG_ENDIAN_ORDER ? "Panama-BE" : "Panama-LE";}
};
@ -121,9 +123,15 @@ protected:
void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length);
void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount);
bool IsRandomAccess() const {return false;}
void CipherResynchronize(byte *keystreamBuffer, const byte *iv);
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
unsigned int GetAlignment() const;
#endif
FixedSizeSecBlock<word32, 8> m_key;
};
//! <a href="http://www.weidai.com/scan-mirror/cs.html#Panama">Panama Stream Cipher</a>
//! <a href="http://www.cryptolounge.org/wiki/PANAMA">Panama Stream Cipher</a>
template <class B = LittleEndian>
struct PanamaCipher : public PanamaCipherInfo<B>, public SymmetricCipherDocumentation
{

367
salsa.cpp
View File

@ -4,6 +4,9 @@
#include "salsa.h"
#include "misc.h"
#include "argnames.h"
#include "cpu.h"
#include <emmintrin.h>
NAMESPACE_BEGIN(CryptoPP)
@ -14,11 +17,13 @@ void Salsa20_TestInstantiations()
// Writes an 8-byte IV to `IV` as two little-endian 32-bit words: the first is
// a state word plus one, the second is another state word plus the carry out
// of that increment (j6 == 0 after the add).
// NOTE(review): j6/j7 are declared twice and written with both
// UnalignedPutWord and PutWord, reading m_state[6]/[7] in one version and
// m_state[14]/[11] in the other. These look like old and new lines flattened
// together — confirm which lines belong in the final file.
void Salsa20_Policy::CipherGetNextIV(byte *IV)
{
word32 j6 = m_state[6] + 1;
word32 j7 = m_state[7] + (j6 == 0);
word32 j6, j7;
UnalignedPutWord(LITTLE_ENDIAN_ORDER, IV, j6);
UnalignedPutWord(LITTLE_ENDIAN_ORDER, IV+4, j7);
j6 = m_state[14] + 1;
j7 = m_state[11] + (j6 == 0);
PutWord(false, LITTLE_ENDIAN_ORDER, IV, j6);
PutWord(false, LITTLE_ENDIAN_ORDER, IV+4, j7);
}
void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
@ -28,112 +33,304 @@ void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key,
if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
throw InvalidRounds(StaticAlgorithmName(), m_rounds);
GetUserKey(LITTLE_ENDIAN_ORDER, m_state+1, 4, key, 16);
GetUserKey(LITTLE_ENDIAN_ORDER, m_state+11, 4, key + length - 16, 16);
// m_state is reordered for SSE2
GetBlock<word32, LittleEndian, false> get1(key);
get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
GetBlock<word32, LittleEndian, false> get2(key + length - 16);
get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
// m_state[0,5,10,15] forms "expand 16-byte k" or "expand 32-byte k"
// "expand 16-byte k" or "expand 32-byte k"
m_state[0] = 0x61707865;
m_state[5] = (length == 16) ? 0x3120646e : 0x3320646e;
m_state[10] = (length == 16) ? 0x79622d36 : 0x79622d32;
m_state[15] = 0x6b206574;
m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
m_state[3] = 0x6b206574;
}
// Loads a new IV into the state and resets the block counter: two
// little-endian words are read from IV into m_state[14] and m_state[11], and
// m_state[8] / m_state[5] (the counter words incremented by OperateKeystream)
// are set to zero.
// NOTE(review): the GetUserKey call writing at m_state+6 looks like a
// pre-change line flattened together with the GetBlock-based lines — confirm
// which lines belong in the final file.
void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV)
{
GetUserKey(LITTLE_ENDIAN_ORDER, m_state+6, 4, IV, 8);
GetBlock<word32, LittleEndian, false> get(IV);
get(m_state[14])(m_state[11]);
m_state[8] = m_state[5] = 0;
}
// Sets the block counter to iterationCount: the low 32 bits go to m_state[8]
// and the bits above 32 (via SafeRightShift<32>) to the high counter word.
// NOTE(review): the high half is assigned to both m_state[9] and m_state[5];
// these look like old and new lines flattened together — confirm which one
// belongs in the final file.
void Salsa20_Policy::SeekToIteration(lword iterationCount)
{
m_state[8] = (word32)iterationCount;
m_state[9] = (word32)SafeRightShift<32>(iterationCount);
m_state[5] = (word32)SafeRightShift<32>(iterationCount);
}
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
// Reports the data alignment this policy asks for: 16 when SSE2 intrinsics are
// compiled in and HasSSE2() is true, otherwise 1.
unsigned int Salsa20_Policy::GetAlignment() const
{
	unsigned int alignment = 1;
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
	if (HasSSE2())
		alignment = 16;
#endif
	return alignment;
}
// Reports the preferred processing granularity: four iterations' worth of
// bytes when SSE2 intrinsics are compiled in and HasSSE2() is true, otherwise
// a single iteration's worth.
unsigned int Salsa20_Policy::GetOptimalBlockSize() const
{
	unsigned int blockSize = BYTES_PER_ITERATION;
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
	if (HasSSE2())
		blockSize = 4*BYTES_PER_ITERATION;
#endif
	return blockSize;
}
#endif
// Generates iterationCount Salsa20 blocks and, depending on `operation`,
// writes them to `output` or XORs them with `input` (through the
// CRYPTOPP_KEYSTREAM_OUTPUT_* macros). When SSE2 intrinsics are compiled in
// and HasSSE2() is true there are two vector paths: a loop that produces four
// blocks per pass while iterationCount >= 4, and a one-block-per-pass loop
// that is entered only when !IsP4(). Whatever is left of iterationCount is
// handled by the scalar loop at the end. The block counter is kept in
// m_state[8] (low word) with carry into m_state[5].
// NOTE(review): this body interleaves what look like pre-change lines (the
// j0..j15 copies, the first scalar QUARTER_ROUND macro and its calls, the
// keystreamOutput chain, the j8/j9 and m_state[9] counter handling) with the
// post-change code — confirm which lines belong in the final file.
void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{
KeystreamOutput<LittleEndian> keystreamOutput(operation, output, input);
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
word32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
j0 = m_state[0];
j1 = m_state[1];
j2 = m_state[2];
j3 = m_state[3];
j4 = m_state[4];
j5 = m_state[5];
j6 = m_state[6];
j7 = m_state[7];
j8 = m_state[8];
j9 = m_state[9];
j10 = m_state[10];
j11 = m_state[11];
j12 = m_state[12];
j13 = m_state[13];
j14 = m_state[14];
j15 = m_state[15];
for (size_t iteration = 0; iteration < iterationCount; ++iteration)
int i;
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
if (HasSSE2())
{
x0 = j0;
x1 = j1;
x2 = j2;
x3 = j3;
x4 = j4;
x5 = j5;
x6 = j6;
x7 = j7;
x8 = j8;
x9 = j9;
x10 = j10;
x11 = j11;
x12 = j12;
x13 = j13;
x14 = j14;
x15 = j15;
// view the 16-word state as four 128-bit rows
__m128i *s = (__m128i *)m_state.data();
for (int i=m_rounds; i>0; i-=2)
// four-blocks-at-a-time path
if (iterationCount >= 4)
{
#define QUARTER_ROUND(a, b, c, d) \
b = b ^ rotlFixed(a + d, 7); \
c = c ^ rotlFixed(b + a, 9); \
d = d ^ rotlFixed(c + b, 13); \
a = a ^ rotlFixed(d + c, 18);
// broadcast each state word across a vector; slots 5 and 8 (the
// counter words) are filled per pass inside the loop below
__m128i ss[16];
ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0));
ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1));
ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2));
ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3));
ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0));
ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2));
ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3));
ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1));
ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2));
ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3));
ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0));
ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1));
ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2));
ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3));
QUARTER_ROUND(x0, x4, x8, x12)
QUARTER_ROUND(x5, x9, x13, x1)
QUARTER_ROUND(x10, x14, x2, x6)
QUARTER_ROUND(x15, x3, x7, x11)
do
{
// give each of the four lanes its own counter value, advancing the
// 64-bit counter in m_state[8]/m_state[5] once per lane
word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]);
for (i=0; i<4; i++)
{
countersLo[i] = m_state[8];
countersHi[i] = m_state[5];
if (++m_state[8] == 0)
++m_state[5];
}
QUARTER_ROUND(x0, x1, x2, x3)
QUARTER_ROUND(x5, x6, x7, x4)
QUARTER_ROUND(x10, x11, x8, x9)
QUARTER_ROUND(x15, x12, x13, x14)
__m128i x0 = ss[0];
__m128i x1 = ss[1];
__m128i x2 = ss[2];
__m128i x3 = ss[3];
__m128i x4 = ss[4];
__m128i x5 = ss[5];
__m128i x6 = ss[6];
__m128i x7 = ss[7];
__m128i x8 = ss[8];
__m128i x9 = ss[9];
__m128i x10 = ss[10];
__m128i x11 = ss[11];
__m128i x12 = ss[12];
__m128i x13 = ss[13];
__m128i x14 = ss[14];
__m128i x15 = ss[15];
for (i=m_rounds; i>0; i-=2)
{
#define SSE2_QUARTER_ROUND(a, b, d, i) {\
__m128i t = _mm_add_epi32(a, d); \
b = _mm_xor_si128(b, _mm_slli_epi32(t, i)); \
b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));}
#define QUARTER_ROUND(a, b, c, d) \
SSE2_QUARTER_ROUND(a, b, d, 7) \
SSE2_QUARTER_ROUND(b, c, a, 9) \
SSE2_QUARTER_ROUND(c, d, b, 13) \
SSE2_QUARTER_ROUND(d, a, c, 18)
QUARTER_ROUND(x0, x4, x8, x12)
QUARTER_ROUND(x1, x5, x9, x13)
QUARTER_ROUND(x2, x6, x10, x14)
QUARTER_ROUND(x3, x7, x11, x15)
QUARTER_ROUND(x0, x13, x10, x7)
QUARTER_ROUND(x1, x14, x11, x4)
QUARTER_ROUND(x2, x15, x8, x5)
QUARTER_ROUND(x3, x12, x9, x6)
#undef QUARTER_ROUND
}
// feed-forward: add the input words back in
x0 = _mm_add_epi32(x0, ss[0]);
x1 = _mm_add_epi32(x1, ss[1]);
x2 = _mm_add_epi32(x2, ss[2]);
x3 = _mm_add_epi32(x3, ss[3]);
x4 = _mm_add_epi32(x4, ss[4]);
x5 = _mm_add_epi32(x5, ss[5]);
x6 = _mm_add_epi32(x6, ss[6]);
x7 = _mm_add_epi32(x7, ss[7]);
x8 = _mm_add_epi32(x8, ss[8]);
x9 = _mm_add_epi32(x9, ss[9]);
x10 = _mm_add_epi32(x10, ss[10]);
x11 = _mm_add_epi32(x11, ss[11]);
x12 = _mm_add_epi32(x12, ss[12]);
x13 = _mm_add_epi32(x13, ss[13]);
x14 = _mm_add_epi32(x14, ss[14]);
x15 = _mm_add_epi32(x15, ss[15]);
#define OUTPUT_4(x, a, b, c, d, e, f, g, h) {\
__m128i t0 = _mm_unpacklo_epi32(a, b);\
__m128i t1 = _mm_unpacklo_epi32(c, d);\
__m128i t2 = _mm_unpacklo_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\
t2 = _mm_unpackhi_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\
t0 = _mm_unpackhi_epi32(a, b);\
t1 = _mm_unpackhi_epi32(c, d);\
t2 = _mm_unpacklo_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\
t2 = _mm_unpackhi_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)}
#define SALSA_OUTPUT(x) \
OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\
OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\
OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\
OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15)
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION)
#undef SALSA_OUTPUT
} while ((iterationCount-=4) >= 4);
}
keystreamOutput (x0 + j0)
(x1 + j1)
(x2 + j2)
(x3 + j3)
(x4 + j4)
(x5 + j5)
(x6 + j6)
(x7 + j7)
(x8 + j8)
(x9 + j9)
(x10 + j10)
(x11 + j11)
(x12 + j12)
(x13 + j13)
(x14 + j14)
(x15 + j15);
// one-block-at-a-time SSE2 path, skipped when IsP4() is true
if (!IsP4()) while (iterationCount)
{
--iterationCount;
__m128i x0 = s[0];
__m128i x1 = s[1];
__m128i x2 = s[2];
__m128i x3 = s[3];
if (++j8 == 0)
++j9;
for (i=m_rounds; i>0; i-=2)
{
SSE2_QUARTER_ROUND(x0, x1, x3, 7)
SSE2_QUARTER_ROUND(x1, x2, x0, 9)
SSE2_QUARTER_ROUND(x2, x3, x1, 13)
SSE2_QUARTER_ROUND(x3, x0, x2, 18)
x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3));
x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1));
SSE2_QUARTER_ROUND(x0, x3, x1, 7)
SSE2_QUARTER_ROUND(x3, x2, x0, 9)
SSE2_QUARTER_ROUND(x2, x1, x3, 13)
SSE2_QUARTER_ROUND(x1, x0, x2, 18)
x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1));
x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3));
}
x0 = _mm_add_epi32(x0, s[0]);
x1 = _mm_add_epi32(x1, s[1]);
x2 = _mm_add_epi32(x2, s[2]);
x3 = _mm_add_epi32(x3, s[3]);
// advance the 64-bit block counter
if (++m_state[8] == 0)
++m_state[5];
CRYPTOPP_ALIGN_DATA(16) static const word32 masks[8] CRYPTOPP_SECTION_ALIGN16 =
{0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 0, 0xffffffff, 0};
// rearrange the four rows into the four 16-byte output vectors k0..k3
__m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32));
k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3));
__m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32));
k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3));
__m128i maskLo32 = ((__m128i*)masks)[1], maskHi32 = ((__m128i*)masks)[0];
__m128i k20 = _mm_or_si128(_mm_and_si128(x2, maskLo32), _mm_and_si128(x1, maskHi32));
__m128i k31 = _mm_or_si128(_mm_and_si128(x3, maskLo32), _mm_and_si128(x2, maskHi32));
__m128i k0 = _mm_unpackhi_epi64(k02, k20);
__m128i k1 = _mm_unpackhi_epi64(k13, k31);
__m128i k2 = _mm_unpacklo_epi64(k20, k02);
__m128i k3 = _mm_unpacklo_epi64(k31, k13);
#define SSE2_OUTPUT(x) {\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)}
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION);
}
}
#endif
m_state[8] = j8;
m_state[9] = j9;
// scalar path: handles every iteration not consumed above
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
while (iterationCount--)
{
x0 = m_state[0];
x1 = m_state[1];
x2 = m_state[2];
x3 = m_state[3];
x4 = m_state[4];
x5 = m_state[5];
x6 = m_state[6];
x7 = m_state[7];
x8 = m_state[8];
x9 = m_state[9];
x10 = m_state[10];
x11 = m_state[11];
x12 = m_state[12];
x13 = m_state[13];
x14 = m_state[14];
x15 = m_state[15];
for (i=m_rounds; i>0; i-=2)
{
#define QUARTER_ROUND(a, b, c, d) \
b = b ^ rotlFixed(a + d, 7); \
c = c ^ rotlFixed(b + a, 9); \
d = d ^ rotlFixed(c + b, 13); \
a = a ^ rotlFixed(d + c, 18);
QUARTER_ROUND(x0, x4, x8, x12)
QUARTER_ROUND(x1, x5, x9, x13)
QUARTER_ROUND(x2, x6, x10, x14)
QUARTER_ROUND(x3, x7, x11, x15)
QUARTER_ROUND(x0, x13, x10, x7)
QUARTER_ROUND(x1, x14, x11, x4)
QUARTER_ROUND(x2, x15, x8, x5)
QUARTER_ROUND(x3, x12, x9, x6)
}
// output position k takes the word from a permuted state slot (0, 13, 10, 7, ...)
#define SALSA_OUTPUT(x) {\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
// advance the 64-bit block counter
if (++m_state[8] == 0)
++m_state[5];
}
}
NAMESPACE_END

10
salsa.h
View File

@ -8,7 +8,7 @@
NAMESPACE_BEGIN(CryptoPP)
//! Salsa20 algorithm info: key/IV traits (inherited from the base class) and the algorithm name
//! NOTE(review): two base-class lines appear below (STRUCTURED_IV and UNIQUE_IV); they look like the old and new versions flattened together — confirm which one belongs in the final file.
struct Salsa20_Info : public VariableKeyLength<32, 16, 32, 16, SimpleKeyingInterface::STRUCTURED_IV, 8>
struct Salsa20_Info : public VariableKeyLength<32, 16, 32, 16, SimpleKeyingInterface::UNIQUE_IV, 8>
{
//! returns the fixed name "Salsa20"
static const char *StaticAlgorithmName() {return "Salsa20";}
};
@ -22,13 +22,17 @@ protected:
void CipherResynchronize(byte *keystreamBuffer, const byte *IV);
bool IsRandomAccess() const {return true;}
void SeekToIteration(lword iterationCount);
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
unsigned int GetAlignment() const;
unsigned int GetOptimalBlockSize() const;
#endif
private:
FixedSizeAlignedSecBlock<word32, 16> m_state;
int m_rounds;
FixedSizeSecBlock<word32, 16> m_state;
};
//! Salsa20, variable rounds: 8, 12 or 20 (default 20)
/// <a href="http://www.cryptolounge.org/wiki/Salsa20">Salsa20</a>, variable rounds: 8, 12 or 20 (default 20)
struct Salsa20 : public Salsa20_Info, public SymmetricCipherDocumentation
{
typedef SymmetricCipherFinal<ConcretePolicyHolder<Salsa20_Policy, AdditiveCipherTemplate<> >, Salsa20_Info> Encryption;

View File

@ -37,6 +37,57 @@ byte AdditiveCipherTemplate<S>::GenerateByte()
return *(KeystreamBufferEnd()-m_leftOver--);
}
// Writes the next `length` bytes of keystream to outString.
//  1. Bytes still buffered from an earlier call (the last m_leftOver bytes
//     before KeystreamBufferEnd()) are handed out first.
//  2. As many whole iterations as fit are generated directly into outString.
//  3. A trailing partial iteration is generated into the internal buffer; the
//     bytes not handed out are recorded in m_leftOver for the next call.
template <class S>
void AdditiveCipherTemplate<S>::GenerateBlock(byte *outString, size_t length)
{
	// step 1: drain leftovers from the previous call
	if (m_leftOver > 0)
	{
		size_t len = STDMIN(m_leftOver, length);
		memcpy(outString, KeystreamBufferEnd()-m_leftOver, len);
		length -= len;
		m_leftOver -= len;
		outString += len;

		if (!length)
			return;
	}
	assert(m_leftOver == 0);

	PolicyInterface &policy = this->AccessPolicy();
	unsigned int bytesPerIteration = policy.GetBytesPerIteration();

	// step 2: whole iterations go straight to the caller's buffer
	if (length >= bytesPerIteration)
	{
		size_t iterations = length / bytesPerIteration;
		policy.WriteKeystream(outString, iterations);
		outString += iterations * bytesPerIteration;
		length -= iterations * bytesPerIteration;

		if (!length)
			return;
	}

	unsigned int bufferByteSize = GetBufferByteSize(policy);
	unsigned int bufferIterations = policy.GetIterationsToBuffer();

	// NOTE(review): at this point length < bytesPerIteration, so this loop
	// looks unreachable whenever bufferByteSize >= bytesPerIteration — confirm
	// and consider removing it.
	while (length >= bufferByteSize)
	{
		policy.WriteKeystream(m_buffer, bufferIterations);
		memcpy(outString, KeystreamBufferBegin(), bufferByteSize);
		length -= bufferByteSize;
		outString += bufferByteSize;
	}

	// step 3: partial iteration
	if (length > 0)
	{
		// Leftovers are always read back from KeystreamBufferEnd()-m_leftOver,
		// so the freshly generated bytes are placed in the final
		// bufferByteSize bytes of the buffer and every byte not handed out is
		// counted. Recording only bytesPerIteration - length here would make
		// the next call skip keystream whenever more than one iteration is
		// buffered at a time.
		byte *fresh = KeystreamBufferEnd() - bufferByteSize;
		policy.WriteKeystream(fresh, bufferIterations);
		memcpy(outString, fresh, length);
		m_leftOver = bufferByteSize - length;
	}
}
template <class S>
void AdditiveCipherTemplate<S>::ProcessData(byte *outString, const byte *inString, size_t length)
{
@ -48,29 +99,26 @@ void AdditiveCipherTemplate<S>::ProcessData(byte *outString, const byte *inStrin
m_leftOver -= len;
inString += len;
outString += len;
if (!length)
return;
}
if (!length)
return;
assert(m_leftOver == 0);
PolicyInterface &policy = this->AccessPolicy();
unsigned int bytesPerIteration = policy.GetBytesPerIteration();
unsigned int alignment = policy.GetAlignment();
if (policy.CanOperateKeystream() && length >= bytesPerIteration && IsAlignedOn(outString, alignment))
if (policy.CanOperateKeystream() && length >= bytesPerIteration)
{
if (IsAlignedOn(inString, alignment))
policy.OperateKeystream(XOR_KEYSTREAM, outString, inString, length / bytesPerIteration);
else
{
memcpy(outString, inString, length);
policy.OperateKeystream(XOR_KEYSTREAM_INPLACE, outString, outString, length / bytesPerIteration);
}
inString += length - length % bytesPerIteration;
outString += length - length % bytesPerIteration;
length %= bytesPerIteration;
size_t iterations = length / bytesPerIteration;
unsigned int alignment = policy.GetAlignment();
KeystreamOperation operation = KeystreamOperation((IsAlignedOn(inString, alignment) * 2) | (int)IsAlignedOn(outString, alignment));
policy.OperateKeystream(operation, outString, inString, iterations);
inString += iterations * bytesPerIteration;
outString += iterations * bytesPerIteration;
length -= iterations * bytesPerIteration;
if (!length)
return;

View File

@ -53,14 +53,23 @@ protected:
POLICY_INTERFACE & AccessPolicy() {return *this;}
};
// NOTE(review): the one-line KeystreamOperation enum directly below looks like
// the pre-change definition flattened together with the flag-based one that
// follows — confirm which one belongs in the final file.
enum KeystreamOperation {WRITE_KEYSTREAM, XOR_KEYSTREAM, XOR_KEYSTREAM_INPLACE};
// Bit flags describing one keystream operation. The CRYPTOPP_KEYSTREAM_OUTPUT_*
// macros test them as follows: INPUT_NULL means the keystream is stored
// without being XORed with an input; OUTPUT_ALIGNED / INPUT_ALIGNED select the
// aligned store / load forms.
enum KeystreamOperationFlags {OUTPUT_ALIGNED=1, INPUT_ALIGNED=2, INPUT_NULL = 4};
// Every combination of the flags above that is given a name: two "write"
// variants (INPUT_NULL set) and four "xor" variants (INPUT_NULL clear).
enum KeystreamOperation {
WRITE_KEYSTREAM = INPUT_NULL,
WRITE_KEYSTREAM_ALIGNED = INPUT_NULL | OUTPUT_ALIGNED,
XOR_KEYSTREAM = 0,
XOR_KEYSTREAM_INPUT_ALIGNED = INPUT_ALIGNED,
XOR_KEYSTREAM_OUTPUT_ALIGNED= OUTPUT_ALIGNED,
XOR_KEYSTREAM_BOTH_ALIGNED = OUTPUT_ALIGNED | INPUT_ALIGNED};
struct CRYPTOPP_DLL CRYPTOPP_NO_VTABLE AdditiveCipherAbstractPolicy
{
virtual unsigned int GetAlignment() const =0;
virtual unsigned int GetAlignment() const {return 1;}
virtual unsigned int GetBytesPerIteration() const =0;
virtual unsigned int GetOptimalBlockSize() const {return GetBytesPerIteration();}
virtual unsigned int GetIterationsToBuffer() const =0;
virtual void WriteKeystream(byte *keystreamBuffer, size_t iterationCount) =0;
virtual void WriteKeystream(byte *keystream, size_t iterationCount)
{OperateKeystream(KeystreamOperation(INPUT_NULL | (KeystreamOperationFlags)IsAlignedOn(keystream, GetAlignment())), keystream, NULL, iterationCount);}
virtual bool CanOperateKeystream() const {return false;}
virtual void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) {assert(false);}
virtual void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length) =0;
@ -74,59 +83,62 @@ template <typename WT, unsigned int W, unsigned int X = 1, class BASE = Additive
struct CRYPTOPP_NO_VTABLE AdditiveCipherConcretePolicy : public BASE
{
typedef WT WordType;
CRYPTOPP_CONSTANT(BYTES_PER_ITERATION = sizeof(WordType) * W);
unsigned int GetAlignment() const {return sizeof(WordType);}
unsigned int GetBytesPerIteration() const {return sizeof(WordType) * W;}
#if !(CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64)
unsigned int GetAlignment() const {return GetAlignmentOf<WordType>();}
#endif
unsigned int GetBytesPerIteration() const {return BYTES_PER_ITERATION;}
unsigned int GetIterationsToBuffer() const {return X;}
void WriteKeystream(byte *buffer, size_t iterationCount)
{OperateKeystream(WRITE_KEYSTREAM, buffer, NULL, iterationCount);}
bool CanOperateKeystream() const {return true;}
virtual void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) =0;
template <class B>
struct KeystreamOutput
{
KeystreamOutput(KeystreamOperation operation, byte *output, const byte *input)
: m_operation(operation), m_output(output), m_input(input) {}
inline KeystreamOutput & operator()(WordType keystreamWord)
{
assert(IsAligned<WordType>(m_input));
assert(IsAligned<WordType>(m_output));
if (!NativeByteOrderIs(B::ToEnum()))
keystreamWord = ByteReverse(keystreamWord);
if (m_operation == WRITE_KEYSTREAM)
*(WordType*)m_output = keystreamWord;
else if (m_operation == XOR_KEYSTREAM)
{
*(WordType*)m_output = keystreamWord ^ *(WordType*)m_input;
m_input += sizeof(WordType);
}
else if (m_operation == XOR_KEYSTREAM_INPLACE)
*(WordType*)m_output ^= keystreamWord;
m_output += sizeof(WordType);
return *this;
}
KeystreamOperation m_operation;
byte *m_output;
const byte *m_input;
};
};
// use these to implement OperateKeystream
// All three macros expect `output` and `input` to be in scope where they are
// expanded; the WORD form also expects `WordType`, and the SWITCH form also
// expects `operation`.
//
// CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, b, i, a): stores keystream word `a` at
// word index i of `output` via PutWord with byte order b. x is a
// KeystreamOperation value: when its INPUT_NULL bit is set `a` is stored as
// is, otherwise it is first XORed with word i of `input` (read via GetWord).
// The OUTPUT_ALIGNED / INPUT_ALIGNED bits of x are passed to PutWord / GetWord
// as their first argument.
#define CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, b, i, a) \
PutWord(bool(x & OUTPUT_ALIGNED), b, output+i*sizeof(WordType), (x & INPUT_NULL) ? a : a ^ GetWord<WordType>(bool(x & INPUT_ALIGNED), b, input+i*sizeof(WordType)));
// CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, i, a): the 128-bit counterpart. Stores `a`
// (optionally XORed with the i-th __m128i of `input`) to the i-th __m128i of
// `output`, using _mm_load_si128 / _mm_store_si128 when the matching ALIGNED
// bit of x is set and the unaligned loadu / storeu forms otherwise.
#define CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, i, a) {\
__m128i t = (x & INPUT_NULL) ? a : _mm_xor_si128(a, (x & INPUT_ALIGNED) ? _mm_load_si128((__m128i *)input+i) : _mm_loadu_si128((__m128i *)input+i));\
if (x & OUTPUT_ALIGNED) _mm_store_si128((__m128i *)output+i, t);\
else _mm_storeu_si128((__m128i *)output+i, t);}
// CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(x, y): switches on `operation` and expands
// macro x once per listed KeystreamOperation value, passing that value as a
// literal argument. Afterwards `input` is advanced by y in the XOR cases only,
// and `output` is advanced by y in every case. There is no default case.
#define CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(x, y) \
switch (operation) \
{ \
case WRITE_KEYSTREAM: \
x(WRITE_KEYSTREAM) \
break; \
case XOR_KEYSTREAM: \
x(XOR_KEYSTREAM) \
input += y; \
break; \
case XOR_KEYSTREAM_INPUT_ALIGNED: \
x(XOR_KEYSTREAM_INPUT_ALIGNED) \
input += y; \
break; \
case XOR_KEYSTREAM_OUTPUT_ALIGNED: \
x(XOR_KEYSTREAM_OUTPUT_ALIGNED) \
input += y; \
break; \
case WRITE_KEYSTREAM_ALIGNED: \
x(WRITE_KEYSTREAM_ALIGNED) \
break; \
case XOR_KEYSTREAM_BOTH_ALIGNED: \
x(XOR_KEYSTREAM_BOTH_ALIGNED) \
input += y; \
break; \
} \
output += y;
template <class BASE = AbstractPolicyHolder<AdditiveCipherAbstractPolicy, TwoBases<SymmetricCipher, RandomNumberGenerator> > >
class CRYPTOPP_NO_VTABLE AdditiveCipherTemplate : public BASE
{
public:
byte GenerateByte();
void GenerateBlock(byte *output, size_t size);
void ProcessData(byte *outString, const byte *inString, size_t length);
void GetNextIV(byte *iv) {this->AccessPolicy().CipherGetNextIV(iv);}
void Resynchronize(const byte *iv);
unsigned int OptimalBlockSize() const {return this->GetPolicy().GetBytesPerIteration();}
unsigned int OptimalBlockSize() const {return this->GetPolicy().GetOptimalBlockSize();}
unsigned int GetOptimalNextBlockSize() const {return (unsigned int)this->m_leftOver;}
unsigned int OptimalDataAlignment() const {return this->GetPolicy().GetAlignment();}
bool IsSelfInverting() const {return true;}