mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-27 03:40:22 +00:00
SSE2 optimizations
This commit is contained in:
parent
20833349d1
commit
bbbd09553b
392
panama.cpp
392
panama.cpp
@ -3,37 +3,296 @@
|
||||
#include "pch.h"
|
||||
#include "panama.h"
|
||||
#include "misc.h"
|
||||
#include "cpu.h"
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
template <class B>
|
||||
void Panama<B>::Reset()
|
||||
{
|
||||
m_bstart = 0;
|
||||
memset(m_state, 0, m_state.size()*4);
|
||||
memset(m_state, 0, m_state.SizeInBytes());
|
||||
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
|
||||
m_state[17] = HasSSSE3();
|
||||
#endif
|
||||
}
|
||||
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
|
||||
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
|
||||
|
||||
void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
|
||||
{
|
||||
#ifdef __GNUC__
|
||||
__asm__ __volatile__
|
||||
(
|
||||
".intel_syntax noprefix;"
|
||||
AS1( push ebx)
|
||||
#else
|
||||
AS2( mov ecx, count)
|
||||
AS2( mov esi, state)
|
||||
AS2( mov edi, z)
|
||||
AS2( mov edx, y)
|
||||
#endif
|
||||
AS2( shl ecx, 5)
|
||||
ASJ( jz, 5, f)
|
||||
AS2( mov ebx, [esi+4*17])
|
||||
AS2( add ecx, ebx)
|
||||
|
||||
AS1( push ebp)
|
||||
AS1( push ecx)
|
||||
|
||||
AS2( movdqa xmm0, [esi+0*16])
|
||||
AS2( movdqa xmm1, [esi+1*16])
|
||||
AS2( movdqa xmm2, [esi+2*16])
|
||||
AS2( movdqa xmm3, [esi+3*16])
|
||||
AS2( mov eax, [esi+4*16])
|
||||
|
||||
ASL(4)
|
||||
// gamma and pi
|
||||
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
|
||||
AS2( test ebx, 1)
|
||||
ASJ( jnz, 6, f)
|
||||
#endif
|
||||
AS2( movdqa xmm6, xmm2)
|
||||
AS2( movss xmm6, xmm3)
|
||||
ASS( pshufd xmm5, xmm6, 0, 3, 2, 1)
|
||||
AS2( movd xmm6, eax)
|
||||
AS2( movdqa xmm7, xmm3)
|
||||
AS2( movss xmm7, xmm6)
|
||||
ASS( pshufd xmm6, xmm7, 0, 3, 2, 1)
|
||||
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
|
||||
ASJ( jmp, 7, f)
|
||||
ASL(6)
|
||||
AS2( movdqa xmm5, xmm3)
|
||||
AS3( palignr xmm5, xmm2, 4)
|
||||
AS2( movd xmm6, eax)
|
||||
AS3( palignr xmm6, xmm3, 4)
|
||||
ASL(7)
|
||||
#endif
|
||||
|
||||
AS2( movd ecx, xmm2)
|
||||
AS1( not ecx)
|
||||
AS2( movd ebp, xmm3)
|
||||
AS2( or ecx, ebp)
|
||||
AS2( xor eax, ecx)
|
||||
|
||||
#define SSE2_Index(i) ASM_MOD(((i)*13+16), 17)
|
||||
|
||||
#define pi(i) \
|
||||
AS2( movd ecx, xmm7)\
|
||||
AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
|
||||
AS2( mov [esi+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
|
||||
|
||||
#define pi4(x, y, z, a, b, c, d) \
|
||||
AS2( pcmpeqb xmm7, xmm7)\
|
||||
AS2( pxor xmm7, x)\
|
||||
AS2( por xmm7, y)\
|
||||
AS2( pxor xmm7, z)\
|
||||
pi(a)\
|
||||
ASS( pshuflw xmm7, xmm7, 1, 0, 3, 2)\
|
||||
pi(b)\
|
||||
AS2( punpckhqdq xmm7, xmm7)\
|
||||
pi(c)\
|
||||
ASS( pshuflw xmm7, xmm7, 1, 0, 3, 2)\
|
||||
pi(d)
|
||||
|
||||
pi4(xmm1, xmm2, xmm3, 1, 5, 9, 13)
|
||||
pi4(xmm0, xmm1, xmm2, 2, 6, 10, 14)
|
||||
pi4(xmm6, xmm0, xmm1, 3, 7, 11, 15)
|
||||
pi4(xmm5, xmm6, xmm0, 4, 8, 12, 16)
|
||||
|
||||
// output keystream and update buffer here to hide partial memory stalls between pi and theta
|
||||
AS2( movdqa xmm4, xmm3)
|
||||
AS2( punpcklqdq xmm3, xmm2) // 1 5 2 6
|
||||
AS2( punpckhdq xmm4, xmm2) // 9 10 13 14
|
||||
AS2( movdqa xmm2, xmm1)
|
||||
AS2( punpcklqdq xmm1, xmm0) // 3 7 4 8
|
||||
AS2( punpckhdq xmm2, xmm0) // 11 12 15 16
|
||||
|
||||
// keystream
|
||||
AS2( test edi, edi)
|
||||
ASJ( jz, 0, f)
|
||||
AS2( movdqa xmm6, xmm4)
|
||||
AS2( punpcklqdq xmm4, xmm2)
|
||||
AS2( punpckhqdq xmm6, xmm2)
|
||||
AS2( test edx, 0xf)
|
||||
ASJ( jnz, 2, f)
|
||||
AS2( test edx, edx)
|
||||
ASJ( jz, 1, f)
|
||||
AS2( pxor xmm4, [edx])
|
||||
AS2( pxor xmm6, [edx+16])
|
||||
AS2( add edx, 32)
|
||||
ASJ( jmp, 1, f)
|
||||
ASL(2)
|
||||
AS2( movdqu xmm0, [edx])
|
||||
AS2( movdqu xmm2, [edx+16])
|
||||
AS2( pxor xmm4, xmm0)
|
||||
AS2( pxor xmm6, xmm2)
|
||||
AS2( add edx, 32)
|
||||
ASL(1)
|
||||
AS2( test edi, 0xf)
|
||||
ASJ( jnz, 3, f)
|
||||
AS2( movdqa [edi], xmm4)
|
||||
AS2( movdqa [edi+16], xmm6)
|
||||
AS2( add edi, 32)
|
||||
ASJ( jmp, 0, f)
|
||||
ASL(3)
|
||||
AS2( movdqu [edi], xmm4)
|
||||
AS2( movdqu [edi+16], xmm6)
|
||||
AS2( add edi, 32)
|
||||
ASL(0)
|
||||
|
||||
// buffer update
|
||||
AS2( lea ecx, [ebx + 32])
|
||||
AS2( and ecx, 31*32)
|
||||
AS2( lea ebp, [ebx + (32-24)*32])
|
||||
AS2( and ebp, 31*32)
|
||||
|
||||
AS2( movdqa xmm0, [esi+20*4+ecx+0*8])
|
||||
AS2( pxor xmm3, xmm0)
|
||||
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
|
||||
AS2( movdqa [esi+20*4+ecx+0*8], xmm3)
|
||||
AS2( pxor xmm0, [esi+20*4+ebp+2*8])
|
||||
AS2( movdqa [esi+20*4+ebp+2*8], xmm0)
|
||||
|
||||
AS2( movdqa xmm4, [esi+20*4+ecx+2*8])
|
||||
AS2( pxor xmm1, xmm4)
|
||||
AS2( movdqa [esi+20*4+ecx+2*8], xmm1)
|
||||
AS2( pxor xmm4, [esi+20*4+ebp+0*8])
|
||||
AS2( movdqa [esi+20*4+ebp+0*8], xmm4)
|
||||
|
||||
// theta
|
||||
AS2( movdqa xmm3, [esi+3*16])
|
||||
AS2( movdqa xmm2, [esi+2*16])
|
||||
AS2( movdqa xmm1, [esi+1*16])
|
||||
AS2( movdqa xmm0, [esi+0*16])
|
||||
|
||||
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
|
||||
AS2( test ebx, 1)
|
||||
ASJ( jnz, 8, f)
|
||||
#endif
|
||||
AS2( movd xmm6, eax)
|
||||
AS2( movdqa xmm7, xmm3)
|
||||
AS2( movss xmm7, xmm6)
|
||||
AS2( movdqa xmm6, xmm2)
|
||||
AS2( movss xmm6, xmm3)
|
||||
AS2( movdqa xmm5, xmm1)
|
||||
AS2( movss xmm5, xmm2)
|
||||
AS2( movdqa xmm4, xmm0)
|
||||
AS2( movss xmm4, xmm1)
|
||||
ASS( pshufd xmm7, xmm7, 0, 3, 2, 1)
|
||||
ASS( pshufd xmm6, xmm6, 0, 3, 2, 1)
|
||||
ASS( pshufd xmm5, xmm5, 0, 3, 2, 1)
|
||||
ASS( pshufd xmm4, xmm4, 0, 3, 2, 1)
|
||||
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
|
||||
ASJ( jmp, 9, f)
|
||||
ASL(8)
|
||||
AS2( movd xmm7, eax)
|
||||
AS3( palignr xmm7, xmm3, 4)
|
||||
AS2( movq xmm6, xmm3)
|
||||
AS3( palignr xmm6, xmm2, 4)
|
||||
AS2( movq xmm5, xmm2)
|
||||
AS3( palignr xmm5, xmm1, 4)
|
||||
AS2( movq xmm4, xmm1)
|
||||
AS3( palignr xmm4, xmm0, 4)
|
||||
ASL(9)
|
||||
#endif
|
||||
|
||||
AS2( xor eax, 1)
|
||||
AS2( movd ecx, xmm0)
|
||||
AS2( xor eax, ecx)
|
||||
AS2( movd ecx, xmm3)
|
||||
AS2( xor eax, ecx)
|
||||
|
||||
AS2( pxor xmm3, xmm2)
|
||||
AS2( pxor xmm2, xmm1)
|
||||
AS2( pxor xmm1, xmm0)
|
||||
AS2( pxor xmm0, xmm7)
|
||||
AS2( pxor xmm3, xmm7)
|
||||
AS2( pxor xmm2, xmm6)
|
||||
AS2( pxor xmm1, xmm5)
|
||||
AS2( pxor xmm0, xmm4)
|
||||
|
||||
// sigma
|
||||
AS2( lea ecx, [ebx + (32-4)*32])
|
||||
AS2( and ecx, 31*32)
|
||||
AS2( lea ebp, [ebx + 16*32])
|
||||
AS2( and ebp, 31*32)
|
||||
|
||||
AS2( movdqa xmm4, [esi+20*4+ecx+0*16])
|
||||
AS2( movdqa xmm5, [esi+20*4+ebp+0*16])
|
||||
AS2( movdqa xmm6, xmm4)
|
||||
AS2( punpcklqdq xmm4, xmm5)
|
||||
AS2( punpckhqdq xmm6, xmm5)
|
||||
AS2( pxor xmm3, xmm4)
|
||||
AS2( pxor xmm2, xmm6)
|
||||
|
||||
AS2( movdqa xmm4, [esi+20*4+ecx+1*16])
|
||||
AS2( movdqa xmm5, [esi+20*4+ebp+1*16])
|
||||
AS2( movdqa xmm6, xmm4)
|
||||
AS2( punpcklqdq xmm4, xmm5)
|
||||
AS2( punpckhqdq xmm6, xmm5)
|
||||
AS2( pxor xmm1, xmm4)
|
||||
AS2( pxor xmm0, xmm6)
|
||||
|
||||
// loop
|
||||
AS2( add ebx, 32)
|
||||
AS2( cmp ebx, [esp])
|
||||
ASJ( jne, 4, b)
|
||||
|
||||
// save state
|
||||
AS2( mov ebp, [esp+4])
|
||||
AS2( add esp, 8)
|
||||
AS2( mov [esi+4*17], ebx)
|
||||
AS2( mov [esi+4*16], eax)
|
||||
AS2( movdqa [esi+3*16], xmm3)
|
||||
AS2( movdqa [esi+2*16], xmm2)
|
||||
AS2( movdqa [esi+1*16], xmm1)
|
||||
AS2( movdqa [esi+0*16], xmm0)
|
||||
ASL(5)
|
||||
|
||||
#ifdef __GNUC__
|
||||
AS1( pop ebx)
|
||||
".att_syntax prefix;"
|
||||
:
|
||||
: "c" (count), "S" (state), "D" (z), "d" (y)
|
||||
: "%eax", "memory", "cc"
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <class B>
|
||||
void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y)
|
||||
{
|
||||
unsigned int bstart = m_bstart;
|
||||
word32 *const a = m_state;
|
||||
#define c (a+17)
|
||||
#define b ((Stage *)(a+34))
|
||||
word32 bstart = m_state[17];
|
||||
word32 *const aPtr = m_state;
|
||||
word32 cPtr[17];
|
||||
|
||||
#define bPtr ((byte *)(aPtr+20))
|
||||
|
||||
// reorder the state for SSE2
|
||||
// a and c: 4 8 12 16 | 3 7 11 15 | 2 6 10 14 | 1 5 9 13 | 0
|
||||
// xmm0 xmm1 xmm2 xmm3 eax
|
||||
#define a(i) aPtr[((i)*13+16) % 17] // 13 is inverse of 4 mod 17
|
||||
#define c(i) cPtr[((i)*13+16) % 17]
|
||||
// b: 0 4 | 1 5 | 2 6 | 3 7
|
||||
#define b(i, j) b##i[(j)*2%8 + (j)/4]
|
||||
|
||||
// output
|
||||
#define OA(i) z[i] = ConditionalByteReverse(B::ToEnum(), a[i+9])
|
||||
#define OX(i) z[i] = y[i] ^ ConditionalByteReverse(B::ToEnum(), a[i+9])
|
||||
#define OA(i) z[i] = ConditionalByteReverse(B::ToEnum(), a(i+9))
|
||||
#define OX(i) z[i] = y[i] ^ ConditionalByteReverse(B::ToEnum(), a(i+9))
|
||||
// buffer update
|
||||
#define US(i) {word32 t=b0[i]; b0[i]=ConditionalByteReverse(B::ToEnum(), p[i])^t; b25[(i+6)%8]^=t;}
|
||||
#define UL(i) {word32 t=b0[i]; b0[i]=a[i+1]^t; b25[(i+6)%8]^=t;}
|
||||
#define US(i) {word32 t=b(0,i); b(0,i)=ConditionalByteReverse(B::ToEnum(), p[i])^t; b(25,(i+6)%8)^=t;}
|
||||
#define UL(i) {word32 t=b(0,i); b(0,i)=a(i+1)^t; b(25,(i+6)%8)^=t;}
|
||||
// gamma and pi
|
||||
#define GP(i) c[5*i%17] = rotlFixed(a[i] ^ (a[(i+1)%17] | ~a[(i+2)%17]), ((5*i%17)*((5*i%17)+1)/2)%32)
|
||||
#define GP(i) c(5*i%17) = rotlFixed(a(i) ^ (a((i+1)%17) | ~a((i+2)%17)), ((5*i%17)*((5*i%17)+1)/2)%32)
|
||||
// theta and sigma
|
||||
#define T(i,x) a[i] = c[i] ^ c[(i+1)%17] ^ c[(i+4)%17] ^ x
|
||||
#define T(i,x) a(i) = c(i) ^ c((i+1)%17) ^ c((i+4)%17) ^ x
|
||||
#define TS1S(i) T(i+1, ConditionalByteReverse(B::ToEnum(), p[i]))
|
||||
#define TS1L(i) T(i+1, b4[i])
|
||||
#define TS2(i) T(i+9, b16[i])
|
||||
#define TS1L(i) T(i+1, b(4,i))
|
||||
#define TS2(i) T(i+9, b(16,i))
|
||||
|
||||
while (count--)
|
||||
{
|
||||
@ -51,12 +310,11 @@ void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *
|
||||
z += 8;
|
||||
}
|
||||
|
||||
word32 *const b16 = b[(bstart+16) % STAGES];
|
||||
word32 *const b4 = b[(bstart+4) % STAGES];
|
||||
bstart = (bstart + STAGES - 1) % STAGES;
|
||||
word32 *const b0 = b[bstart];
|
||||
word32 *const b25 = b[(bstart+25) % STAGES];
|
||||
|
||||
word32 *const b16 = (word32 *)(bPtr+((bstart+16*32) & 31*32));
|
||||
word32 *const b4 = (word32 *)(bPtr+((bstart+(32-4)*32) & 31*32));
|
||||
bstart += 32;
|
||||
word32 *const b0 = (word32 *)(bPtr+((bstart) & 31*32));
|
||||
word32 *const b25 = (word32 *)(bPtr+((bstart+(32-25)*32) & 31*32));
|
||||
|
||||
if (p)
|
||||
{
|
||||
@ -67,8 +325,23 @@ void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *
|
||||
UL(0); UL(1); UL(2); UL(3); UL(4); UL(5); UL(6); UL(7);
|
||||
}
|
||||
|
||||
GP(0); GP(1); GP(2); GP(3); GP(4); GP(5); GP(6); GP(7);
|
||||
GP(8); GP(9); GP(10); GP(11); GP(12); GP(13); GP(14); GP(15); GP(16);
|
||||
GP(0);
|
||||
GP(1);
|
||||
GP(2);
|
||||
GP(3);
|
||||
GP(4);
|
||||
GP(5);
|
||||
GP(6);
|
||||
GP(7);
|
||||
GP(8);
|
||||
GP(9);
|
||||
GP(10);
|
||||
GP(11);
|
||||
GP(12);
|
||||
GP(13);
|
||||
GP(14);
|
||||
GP(15);
|
||||
GP(16);
|
||||
|
||||
T(0,1);
|
||||
|
||||
@ -84,18 +357,18 @@ void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *
|
||||
|
||||
TS2(0); TS2(1); TS2(2); TS2(3); TS2(4); TS2(5); TS2(6); TS2(7);
|
||||
}
|
||||
m_bstart = bstart;
|
||||
m_state[17] = bstart;
|
||||
}
|
||||
|
||||
template <class B>
|
||||
size_t PanamaHash<B>::HashMultipleBlocks(const word32 *input, size_t length)
|
||||
size_t Weak::PanamaHash<B>::HashMultipleBlocks(const word32 *input, size_t length)
|
||||
{
|
||||
this->Iterate(length / this->BLOCKSIZE, input);
|
||||
return length % this->BLOCKSIZE;
|
||||
}
|
||||
|
||||
template <class B>
|
||||
void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size)
|
||||
void Weak::PanamaHash<B>::TruncatedFinal(byte *hash, size_t size)
|
||||
{
|
||||
this->ThrowIfInvalidTruncatedSize(size);
|
||||
|
||||
@ -105,8 +378,10 @@ void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size)
|
||||
|
||||
this->Iterate(32); // pull
|
||||
|
||||
ConditionalByteReverse(B::ToEnum(), this->m_state+9, this->m_state+9, DIGESTSIZE);
|
||||
memcpy(hash, this->m_state+9, size);
|
||||
FixedSizeSecBlock<word32, 8> buf;
|
||||
this->Iterate(1, NULL, buf, NULL);
|
||||
|
||||
memcpy(hash, buf, size);
|
||||
|
||||
this->Restart(); // reinit for next use
|
||||
}
|
||||
@ -114,31 +389,64 @@ void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size)
|
||||
template <class B>
|
||||
void PanamaCipherPolicy<B>::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length)
|
||||
{
|
||||
FixedSizeSecBlock<word32, 8> buf;
|
||||
|
||||
this->Reset();
|
||||
memcpy(buf, key, 32);
|
||||
this->Iterate(1, buf);
|
||||
if (length == 64)
|
||||
memcpy(buf, key+32, 32);
|
||||
else
|
||||
memset(buf, 0, 32);
|
||||
this->Iterate(1, buf);
|
||||
|
||||
this->Iterate(32);
|
||||
assert(length==32);
|
||||
memcpy(m_key, key, 32);
|
||||
}
|
||||
|
||||
template <class B>
|
||||
void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
|
||||
{
|
||||
this->Reset();
|
||||
this->Iterate(1, m_key);
|
||||
if (iv && IsAligned<word32>(iv))
|
||||
this->Iterate(1, (const word32 *)iv);
|
||||
else
|
||||
{
|
||||
FixedSizeSecBlock<word32, 8> buf;
|
||||
if (iv)
|
||||
memcpy(buf, iv, 32);
|
||||
else
|
||||
memset(buf, 0, 32);
|
||||
this->Iterate(1, buf);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
|
||||
Panama_SSE2_Pull(32, this->m_state, NULL, NULL);
|
||||
else
|
||||
#endif
|
||||
this->Iterate(32);
|
||||
}
|
||||
|
||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
|
||||
template <class B>
|
||||
unsigned int PanamaCipherPolicy<B>::GetAlignment() const
|
||||
{
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
|
||||
return 16;
|
||||
else
|
||||
#endif
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <class B>
|
||||
void PanamaCipherPolicy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
|
||||
{
|
||||
this->Iterate(iterationCount, NULL, (word32 *)output, (const word32 *)input);
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
|
||||
Panama_SSE2_Pull(iterationCount, this->m_state, (word32 *)output, (const word32 *)input);
|
||||
else
|
||||
#endif
|
||||
this->Iterate(iterationCount, NULL, (word32 *)output, (const word32 *)input);
|
||||
}
|
||||
|
||||
template class Panama<BigEndian>;
|
||||
template class Panama<LittleEndian>;
|
||||
|
||||
template class PanamaHash<BigEndian>;
|
||||
template class PanamaHash<LittleEndian>;
|
||||
template class Weak::PanamaHash<BigEndian>;
|
||||
template class Weak::PanamaHash<LittleEndian>;
|
||||
|
||||
template class PanamaCipherPolicy<BigEndian>;
|
||||
template class PanamaCipherPolicy<LittleEndian>;
|
||||
|
20
panama.h
20
panama.h
@ -1,8 +1,6 @@
|
||||
#ifndef CRYPTOPP_PANAMA_H
|
||||
#define CRYPTOPP_PANAMA_H
|
||||
|
||||
#include "seckey.h"
|
||||
#include "secblock.h"
|
||||
#include "strciphr.h"
|
||||
#include "iterhash.h"
|
||||
|
||||
@ -20,10 +18,10 @@ protected:
|
||||
typedef word32 Stage[8];
|
||||
CRYPTOPP_CONSTANT(STAGES = 32)
|
||||
|
||||
FixedSizeSecBlock<word32, 17*2 + 32*sizeof(Stage)> m_state;
|
||||
unsigned int m_bstart;
|
||||
FixedSizeAlignedSecBlock<word32, 20 + 8*32> m_state;
|
||||
};
|
||||
|
||||
namespace Weak {
|
||||
/// <a href="http://www.weidai.com/scan-mirror/md.html#Panama">Panama Hash</a>
|
||||
template <class B = LittleEndian>
|
||||
class PanamaHash : protected Panama<B>, public AlgorithmImpl<IteratedHash<word32, NativeByteOrder, 32>, PanamaHash<B> >
|
||||
@ -39,7 +37,9 @@ protected:
|
||||
void Init() {Panama<B>::Reset();}
|
||||
void HashEndianCorrectedBlock(const word32 *data) {this->Iterate(1, data);} // push
|
||||
size_t HashMultipleBlocks(const word32 *input, size_t length);
|
||||
word32* StateBuf() {return NULL;}
|
||||
};
|
||||
}
|
||||
|
||||
//! MAC construction using a hermetic hash function
|
||||
template <class T_Hash, class T_Info = T_Hash>
|
||||
@ -94,6 +94,7 @@ protected:
|
||||
SecByteBlock m_key;
|
||||
};
|
||||
|
||||
namespace Weak {
|
||||
/// Panama MAC
|
||||
template <class B = LittleEndian>
|
||||
class PanamaMAC : public HermeticHashFunctionMAC<PanamaHash<B> >
|
||||
@ -103,10 +104,11 @@ public:
|
||||
PanamaMAC(const byte *key, unsigned int length)
|
||||
{this->SetKey(key, length);}
|
||||
};
|
||||
}
|
||||
|
||||
//! algorithm info
|
||||
template <class B>
|
||||
struct PanamaCipherInfo : public VariableKeyLength<32, 32, 64, 32, SimpleKeyingInterface::NOT_RESYNCHRONIZABLE>
|
||||
struct PanamaCipherInfo : public FixedKeyLength<32, SimpleKeyingInterface::UNIQUE_IV, 32>
|
||||
{
|
||||
static const char * StaticAlgorithmName() {return B::ToEnum() == BIG_ENDIAN_ORDER ? "Panama-BE" : "Panama-LE";}
|
||||
};
|
||||
@ -121,9 +123,15 @@ protected:
|
||||
void CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length);
|
||||
void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount);
|
||||
bool IsRandomAccess() const {return false;}
|
||||
void CipherResynchronize(byte *keystreamBuffer, const byte *iv);
|
||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
|
||||
unsigned int GetAlignment() const;
|
||||
#endif
|
||||
|
||||
FixedSizeSecBlock<word32, 8> m_key;
|
||||
};
|
||||
|
||||
//! <a href="http://www.weidai.com/scan-mirror/cs.html#Panama">Panama Stream Cipher</a>
|
||||
//! <a href="http://www.cryptolounge.org/wiki/PANAMA">Panama Stream Cipher</a>
|
||||
template <class B = LittleEndian>
|
||||
struct PanamaCipher : public PanamaCipherInfo<B>, public SymmetricCipherDocumentation
|
||||
{
|
||||
|
367
salsa.cpp
367
salsa.cpp
@ -4,6 +4,9 @@
|
||||
#include "salsa.h"
|
||||
#include "misc.h"
|
||||
#include "argnames.h"
|
||||
#include "cpu.h"
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
@ -14,11 +17,13 @@ void Salsa20_TestInstantiations()
|
||||
|
||||
// Writes the current IV plus one (as a 64-bit little-endian value) to IV.
//
// In the SSE2-friendly state layout the IV words live at m_state[14] (low)
// and m_state[11] (high), the same slots CipherResynchronize fills. The
// removed lines read m_state[6]/m_state[7], which now hold key words, and
// redeclared j6/j7.
//
//   IV  receives 8 bytes; NOTE(review): the `false` passed to PutWord
//       presumably means "do not assume alignment" - confirm against misc.h
void Salsa20_Policy::CipherGetNextIV(byte *IV)
{
	word32 j6, j7;

	j6 = m_state[14] + 1;
	j7 = m_state[11] + (j6 == 0);	// carry into the high word

	PutWord(false, LITTLE_ENDIAN_ORDER, IV, j6);
	PutWord(false, LITTLE_ENDIAN_ORDER, IV+4, j7);
}
|
||||
|
||||
void Salsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length)
|
||||
@ -28,112 +33,304 @@ void Salsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key,
|
||||
if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
|
||||
throw InvalidRounds(StaticAlgorithmName(), m_rounds);
|
||||
|
||||
GetUserKey(LITTLE_ENDIAN_ORDER, m_state+1, 4, key, 16);
|
||||
GetUserKey(LITTLE_ENDIAN_ORDER, m_state+11, 4, key + length - 16, 16);
|
||||
// m_state is reordered for SSE2
|
||||
GetBlock<word32, LittleEndian, false> get1(key);
|
||||
get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
|
||||
GetBlock<word32, LittleEndian, false> get2(key + length - 16);
|
||||
get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
|
||||
|
||||
// m_state[0,5,10,15] forms "expand 16-byte k" or "expand 32-byte k"
|
||||
// "expand 16-byte k" or "expand 32-byte k"
|
||||
m_state[0] = 0x61707865;
|
||||
m_state[5] = (length == 16) ? 0x3120646e : 0x3320646e;
|
||||
m_state[10] = (length == 16) ? 0x79622d36 : 0x79622d32;
|
||||
m_state[15] = 0x6b206574;
|
||||
m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
|
||||
m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
|
||||
m_state[3] = 0x6b206574;
|
||||
}
|
||||
|
||||
// Loads a new 8-byte IV and rewinds the block counter to zero.
//
// Uses the reordered state layout: IV words go to m_state[14] and
// m_state[11], the 64-bit block counter is m_state[8] (low) / m_state[5]
// (high). The removed GetUserKey() call wrote m_state[6..9], which in this
// layout would overwrite key words.
//
//   keystreamBuffer  unused
//   IV               8 bytes, read as two little-endian words
void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV)
{
	GetBlock<word32, LittleEndian, false> get(IV);
	get(m_state[14])(m_state[11]);
	m_state[8] = m_state[5] = 0;
}
|
||||
|
||||
// Positions the keystream at block number iterationCount.
//
// The 64-bit block counter is split across m_state[8] (low 32 bits) and
// m_state[5] (high 32 bits), the same pair OperateKeystream increments.
// The removed store to m_state[9] is dropped: that slot holds a key word in
// the reordered layout.
void Salsa20_Policy::SeekToIteration(lword iterationCount)
{
	m_state[8] = (word32)iterationCount;
	m_state[5] = (word32)SafeRightShift<32>(iterationCount);
}
|
||||
|
||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
|
||||
unsigned int Salsa20_Policy::GetAlignment() const
|
||||
{
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
|
||||
if (HasSSE2())
|
||||
return 16;
|
||||
else
|
||||
#endif
|
||||
return 1;
|
||||
}
|
||||
|
||||
unsigned int Salsa20_Policy::GetOptimalBlockSize() const
|
||||
{
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
|
||||
if (HasSSE2())
|
||||
return 4*BYTES_PER_ITERATION;
|
||||
else
|
||||
#endif
|
||||
return BYTES_PER_ITERATION;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Produces iterationCount 64-byte Salsa20 blocks and writes them (optionally
// combined with `input`, as selected by `operation`) to `output`.
//
// Three code paths, tried in this order:
//   1. SSE2, four blocks at a time, while at least 4 iterations remain.
//   2. SSE2, one block at a time, for the remainder - skipped on Pentium 4.
//   3. Portable scalar code for whatever is still left (and for builds or
//      CPUs without SSE2).
//
// m_state is stored in the reordered layout set up by CipherSetKey; the
// 64-bit block counter is m_state[8] (low) / m_state[5] (high) and is
// advanced once per generated block on every path.
//
// NOTE(review): the SSE2 paths read m_state through __m128i pointers, which
// presumes m_state is 16-byte aligned - confirm against its declaration.
void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{
	int i;
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
	if (HasSSE2())
	{
		__m128i *s = (__m128i *)m_state.data();

		if (iterationCount >= 4)
		{
			// Broadcast each state word across a vector so that lane k of
			// every ss[] entry belongs to block k. ss[8] and ss[5] (the
			// counter words) are filled per batch below.
			__m128i ss[16];
			ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0));
			ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1));
			ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2));
			ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3));
			ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0));
			ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2));
			ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3));
			ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1));
			ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2));
			ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3));
			ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0));
			ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1));
			ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2));
			ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3));

			do
			{
				// Give each of the four lanes its own consecutive counter value.
				word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]);
				for (i=0; i<4; i++)
				{
					countersLo[i] = m_state[8];
					countersHi[i] = m_state[5];
					if (++m_state[8] == 0)
						++m_state[5];
				}

				__m128i x0 = ss[0];
				__m128i x1 = ss[1];
				__m128i x2 = ss[2];
				__m128i x3 = ss[3];
				__m128i x4 = ss[4];
				__m128i x5 = ss[5];
				__m128i x6 = ss[6];
				__m128i x7 = ss[7];
				__m128i x8 = ss[8];
				__m128i x9 = ss[9];
				__m128i x10 = ss[10];
				__m128i x11 = ss[11];
				__m128i x12 = ss[12];
				__m128i x13 = ss[13];
				__m128i x14 = ss[14];
				__m128i x15 = ss[15];

				for (i=m_rounds; i>0; i-=2)
				{
					// b ^= rotl(a + d, i), built from two shifts (SSE2 has no rotate)
					#define SSE2_QUARTER_ROUND(a, b, d, i)	{\
						__m128i t = _mm_add_epi32(a, d);	\
						b = _mm_xor_si128(b, _mm_slli_epi32(t, i));	\
						b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));}

					#define QUARTER_ROUND(a, b, c, d)	\
						SSE2_QUARTER_ROUND(a, b, d, 7)	\
						SSE2_QUARTER_ROUND(b, c, a, 9)	\
						SSE2_QUARTER_ROUND(c, d, b, 13)	\
						SSE2_QUARTER_ROUND(d, a, c, 18)

					QUARTER_ROUND(x0, x4, x8, x12)
					QUARTER_ROUND(x1, x5, x9, x13)
					QUARTER_ROUND(x2, x6, x10, x14)
					QUARTER_ROUND(x3, x7, x11, x15)

					QUARTER_ROUND(x0, x13, x10, x7)
					QUARTER_ROUND(x1, x14, x11, x4)
					QUARTER_ROUND(x2, x15, x8, x5)
					QUARTER_ROUND(x3, x12, x9, x6)

					#undef QUARTER_ROUND
				}

				// feed-forward: add the input state to the round output
				x0 = _mm_add_epi32(x0, ss[0]);
				x1 = _mm_add_epi32(x1, ss[1]);
				x2 = _mm_add_epi32(x2, ss[2]);
				x3 = _mm_add_epi32(x3, ss[3]);
				x4 = _mm_add_epi32(x4, ss[4]);
				x5 = _mm_add_epi32(x5, ss[5]);
				x6 = _mm_add_epi32(x6, ss[6]);
				x7 = _mm_add_epi32(x7, ss[7]);
				x8 = _mm_add_epi32(x8, ss[8]);
				x9 = _mm_add_epi32(x9, ss[9]);
				x10 = _mm_add_epi32(x10, ss[10]);
				x11 = _mm_add_epi32(x11, ss[11]);
				x12 = _mm_add_epi32(x12, ss[12]);
				x13 = _mm_add_epi32(x13, ss[13]);
				x14 = _mm_add_epi32(x14, ss[14]);
				x15 = _mm_add_epi32(x15, ss[15]);

				// Transpose four word-vectors so each output vector holds four
				// consecutive words of one block.
				#define OUTPUT_4(x, a, b, c, d, e, f, g, h)	{\
					__m128i t0 = _mm_unpacklo_epi32(a, b);\
					__m128i t1 = _mm_unpacklo_epi32(c, d);\
					__m128i t2 = _mm_unpacklo_epi64(t0, t1);\
					CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\
					t2 = _mm_unpackhi_epi64(t0, t1);\
					CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\
					t0 = _mm_unpackhi_epi32(a, b);\
					t1 = _mm_unpackhi_epi32(c, d);\
					t2 = _mm_unpacklo_epi64(t0, t1);\
					CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\
					t2 = _mm_unpackhi_epi64(t0, t1);\
					CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)}

				#define SALSA_OUTPUT(x)	\
					OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\
					OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\
					OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\
					OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15)

				CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION)

				#undef SALSA_OUTPUT
			} while ((iterationCount-=4) >= 4);
		}

		// Single-block SSE2 path; on a P4 the remaining blocks fall through
		// to the scalar loop below instead.
		if (!IsP4()) while (iterationCount)
		{
			--iterationCount;
			__m128i x0 = s[0];
			__m128i x1 = s[1];
			__m128i x2 = s[2];
			__m128i x3 = s[3];

			for (i=m_rounds; i>0; i-=2)
			{
				SSE2_QUARTER_ROUND(x0, x1, x3, 7)
				SSE2_QUARTER_ROUND(x1, x2, x0, 9)
				SSE2_QUARTER_ROUND(x2, x3, x1, 13)
				SSE2_QUARTER_ROUND(x3, x0, x2, 18)

				x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3));
				x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
				x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1));

				SSE2_QUARTER_ROUND(x0, x3, x1, 7)
				SSE2_QUARTER_ROUND(x3, x2, x0, 9)
				SSE2_QUARTER_ROUND(x2, x1, x3, 13)
				SSE2_QUARTER_ROUND(x1, x0, x2, 18)

				x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1));
				x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
				x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3));
			}

			x0 = _mm_add_epi32(x0, s[0]);
			x1 = _mm_add_epi32(x1, s[1]);
			x2 = _mm_add_epi32(x2, s[2]);
			x3 = _mm_add_epi32(x3, s[3]);

			if (++m_state[8] == 0)
				++m_state[5];

			// Undo the state reordering so the block is emitted in standard
			// Salsa20 word order.
			CRYPTOPP_ALIGN_DATA(16) static const word32 masks[8] CRYPTOPP_SECTION_ALIGN16 =
				{0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 0, 0xffffffff, 0};

			__m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32));
			k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3));
			__m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32));
			k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3));
			__m128i maskLo32 = ((__m128i*)masks)[1], maskHi32 = ((__m128i*)masks)[0];
			__m128i k20 = _mm_or_si128(_mm_and_si128(x2, maskLo32), _mm_and_si128(x1, maskHi32));
			__m128i k31 = _mm_or_si128(_mm_and_si128(x3, maskLo32), _mm_and_si128(x2, maskHi32));

			__m128i k0 = _mm_unpackhi_epi64(k02, k20);
			__m128i k1 = _mm_unpackhi_epi64(k13, k31);
			__m128i k2 = _mm_unpacklo_epi64(k20, k02);
			__m128i k3 = _mm_unpacklo_epi64(k31, k13);

			#define SSE2_OUTPUT(x)	{\
				CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\
				CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\
				CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\
				CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)}

			CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION);
		}
	}
#endif

	// Portable path: also handles anything the SSE2 paths left over.
	word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;

	while (iterationCount--)
	{
		x0 = m_state[0];
		x1 = m_state[1];
		x2 = m_state[2];
		x3 = m_state[3];
		x4 = m_state[4];
		x5 = m_state[5];
		x6 = m_state[6];
		x7 = m_state[7];
		x8 = m_state[8];
		x9 = m_state[9];
		x10 = m_state[10];
		x11 = m_state[11];
		x12 = m_state[12];
		x13 = m_state[13];
		x14 = m_state[14];
		x15 = m_state[15];

		for (i=m_rounds; i>0; i-=2)
		{
			#define QUARTER_ROUND(a, b, c, d)	\
				b = b ^ rotlFixed(a + d, 7);	\
				c = c ^ rotlFixed(b + a, 9);	\
				d = d ^ rotlFixed(c + b, 13);	\
				a = a ^ rotlFixed(d + c, 18);

			QUARTER_ROUND(x0, x4, x8, x12)
			QUARTER_ROUND(x1, x5, x9, x13)
			QUARTER_ROUND(x2, x6, x10, x14)
			QUARTER_ROUND(x3, x7, x11, x15)

			QUARTER_ROUND(x0, x13, x10, x7)
			QUARTER_ROUND(x1, x14, x11, x4)
			QUARTER_ROUND(x2, x15, x8, x5)
			QUARTER_ROUND(x3, x12, x9, x6)
		}

		// Output position n takes the reordered state word that corresponds
		// to standard Salsa20 word n.
		#define SALSA_OUTPUT(x)	{\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
			CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}

		CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);

		if (++m_state[8] == 0)
			++m_state[5];
	}
}
|
||||
|
||||
NAMESPACE_END
|
||||
|
10
salsa.h
10
salsa.h
@ -8,7 +8,7 @@
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
//! _
|
||||
struct Salsa20_Info : public VariableKeyLength<32, 16, 32, 16, SimpleKeyingInterface::STRUCTURED_IV, 8>
|
||||
struct Salsa20_Info : public VariableKeyLength<32, 16, 32, 16, SimpleKeyingInterface::UNIQUE_IV, 8>
|
||||
{
|
||||
static const char *StaticAlgorithmName() {return "Salsa20";}
|
||||
};
|
||||
@ -22,13 +22,17 @@ protected:
|
||||
void CipherResynchronize(byte *keystreamBuffer, const byte *IV);
|
||||
bool IsRandomAccess() const {return true;}
|
||||
void SeekToIteration(lword iterationCount);
|
||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
|
||||
unsigned int GetAlignment() const;
|
||||
unsigned int GetOptimalBlockSize() const;
|
||||
#endif
|
||||
|
||||
private:
|
||||
FixedSizeAlignedSecBlock<word32, 16> m_state;
|
||||
int m_rounds;
|
||||
FixedSizeSecBlock<word32, 16> m_state;
|
||||
};
|
||||
|
||||
//! Salsa20, variable rounds: 8, 12 or 20 (default 20)
|
||||
/// <a href="http://www.cryptolounge.org/wiki/Salsa20">Salsa20</a>, variable rounds: 8, 12 or 20 (default 20)
|
||||
struct Salsa20 : public Salsa20_Info, public SymmetricCipherDocumentation
|
||||
{
|
||||
typedef SymmetricCipherFinal<ConcretePolicyHolder<Salsa20_Policy, AdditiveCipherTemplate<> >, Salsa20_Info> Encryption;
|
||||
|
80
strciphr.cpp
80
strciphr.cpp
@ -37,6 +37,57 @@ byte AdditiveCipherTemplate<S>::GenerateByte()
|
||||
return *(KeystreamBufferEnd()-m_leftOver--);
|
||||
}
|
||||
|
||||
template <class S>
|
||||
void AdditiveCipherTemplate<S>::GenerateBlock(byte *outString, size_t length)
|
||||
{
|
||||
if (m_leftOver > 0)
|
||||
{
|
||||
size_t len = STDMIN(m_leftOver, length);
|
||||
memcpy(outString, KeystreamBufferEnd()-m_leftOver, len);
|
||||
length -= len;
|
||||
m_leftOver -= len;
|
||||
outString += len;
|
||||
|
||||
if (!length)
|
||||
return;
|
||||
}
|
||||
assert(m_leftOver == 0);
|
||||
|
||||
PolicyInterface &policy = this->AccessPolicy();
|
||||
unsigned int bytesPerIteration = policy.GetBytesPerIteration();
|
||||
|
||||
if (length >= bytesPerIteration)
|
||||
{
|
||||
size_t iterations = length / bytesPerIteration;
|
||||
|
||||
policy.WriteKeystream(outString, iterations);
|
||||
|
||||
outString += iterations * bytesPerIteration;
|
||||
length -= iterations * bytesPerIteration;
|
||||
|
||||
if (!length)
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned int bufferByteSize = GetBufferByteSize(policy);
|
||||
unsigned int bufferIterations = policy.GetIterationsToBuffer();
|
||||
|
||||
while (length >= bufferByteSize)
|
||||
{
|
||||
policy.WriteKeystream(m_buffer, bufferIterations);
|
||||
memcpy(outString, KeystreamBufferBegin(), bufferByteSize);
|
||||
length -= bufferByteSize;
|
||||
outString += bufferByteSize;
|
||||
}
|
||||
|
||||
if (length > 0)
|
||||
{
|
||||
policy.WriteKeystream(m_buffer, bufferIterations);
|
||||
memcpy(outString, KeystreamBufferBegin(), length);
|
||||
m_leftOver = bytesPerIteration - length;
|
||||
}
|
||||
}
|
||||
|
||||
template <class S>
|
||||
void AdditiveCipherTemplate<S>::ProcessData(byte *outString, const byte *inString, size_t length)
|
||||
{
|
||||
@ -48,29 +99,26 @@ void AdditiveCipherTemplate<S>::ProcessData(byte *outString, const byte *inStrin
|
||||
m_leftOver -= len;
|
||||
inString += len;
|
||||
outString += len;
|
||||
|
||||
if (!length)
|
||||
return;
|
||||
}
|
||||
|
||||
if (!length)
|
||||
return;
|
||||
|
||||
assert(m_leftOver == 0);
|
||||
|
||||
PolicyInterface &policy = this->AccessPolicy();
|
||||
unsigned int bytesPerIteration = policy.GetBytesPerIteration();
|
||||
unsigned int alignment = policy.GetAlignment();
|
||||
|
||||
if (policy.CanOperateKeystream() && length >= bytesPerIteration && IsAlignedOn(outString, alignment))
|
||||
if (policy.CanOperateKeystream() && length >= bytesPerIteration)
|
||||
{
|
||||
if (IsAlignedOn(inString, alignment))
|
||||
policy.OperateKeystream(XOR_KEYSTREAM, outString, inString, length / bytesPerIteration);
|
||||
else
|
||||
{
|
||||
memcpy(outString, inString, length);
|
||||
policy.OperateKeystream(XOR_KEYSTREAM_INPLACE, outString, outString, length / bytesPerIteration);
|
||||
}
|
||||
inString += length - length % bytesPerIteration;
|
||||
outString += length - length % bytesPerIteration;
|
||||
length %= bytesPerIteration;
|
||||
size_t iterations = length / bytesPerIteration;
|
||||
unsigned int alignment = policy.GetAlignment();
|
||||
KeystreamOperation operation = KeystreamOperation((IsAlignedOn(inString, alignment) * 2) | (int)IsAlignedOn(outString, alignment));
|
||||
|
||||
policy.OperateKeystream(operation, outString, inString, iterations);
|
||||
|
||||
inString += iterations * bytesPerIteration;
|
||||
outString += iterations * bytesPerIteration;
|
||||
length -= iterations * bytesPerIteration;
|
||||
|
||||
if (!length)
|
||||
return;
|
||||
|
96
strciphr.h
96
strciphr.h
@ -53,14 +53,23 @@ protected:
|
||||
POLICY_INTERFACE & AccessPolicy() {return *this;}
|
||||
};
|
||||
|
||||
enum KeystreamOperation {WRITE_KEYSTREAM, XOR_KEYSTREAM, XOR_KEYSTREAM_INPLACE};
|
||||
//! Bit flags describing the buffers handed to OperateKeystream.
/*! The numeric values are relied upon: callers build a KeystreamOperation by
	or-ing alignment test results together, so the bit positions must not change. */
enum KeystreamOperationFlags
{
	OUTPUT_ALIGNED = 1 << 0,	// output may be written with aligned stores
	INPUT_ALIGNED = 1 << 1,		// input may be read with aligned loads
	INPUT_NULL = 1 << 2			// no input: emit the raw keystream instead of xor-ing
};

//! Every flag combination OperateKeystream is expected to handle.
enum KeystreamOperation
{
	XOR_KEYSTREAM = 0,
	XOR_KEYSTREAM_OUTPUT_ALIGNED = OUTPUT_ALIGNED,
	XOR_KEYSTREAM_INPUT_ALIGNED = INPUT_ALIGNED,
	XOR_KEYSTREAM_BOTH_ALIGNED = OUTPUT_ALIGNED | INPUT_ALIGNED,
	WRITE_KEYSTREAM = INPUT_NULL,
	WRITE_KEYSTREAM_ALIGNED = INPUT_NULL | OUTPUT_ALIGNED
};
|
||||
|
||||
struct CRYPTOPP_DLL CRYPTOPP_NO_VTABLE AdditiveCipherAbstractPolicy
|
||||
{
|
||||
virtual unsigned int GetAlignment() const =0;
|
||||
virtual unsigned int GetAlignment() const {return 1;}
|
||||
virtual unsigned int GetBytesPerIteration() const =0;
|
||||
virtual unsigned int GetOptimalBlockSize() const {return GetBytesPerIteration();}
|
||||
virtual unsigned int GetIterationsToBuffer() const =0;
|
||||
virtual void WriteKeystream(byte *keystreamBuffer, size_t iterationCount) =0;
|
||||
virtual void WriteKeystream(byte *keystream, size_t iterationCount)
|
||||
{OperateKeystream(KeystreamOperation(INPUT_NULL | (KeystreamOperationFlags)IsAlignedOn(keystream, GetAlignment())), keystream, NULL, iterationCount);}
|
||||
virtual bool CanOperateKeystream() const {return false;}
|
||||
virtual void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) {assert(false);}
|
||||
virtual void CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length) =0;
|
||||
@ -74,59 +83,62 @@ template <typename WT, unsigned int W, unsigned int X = 1, class BASE = Additive
|
||||
struct CRYPTOPP_NO_VTABLE AdditiveCipherConcretePolicy : public BASE
|
||||
{
|
||||
typedef WT WordType;
|
||||
CRYPTOPP_CONSTANT(BYTES_PER_ITERATION = sizeof(WordType) * W);
|
||||
|
||||
unsigned int GetAlignment() const {return sizeof(WordType);}
|
||||
unsigned int GetBytesPerIteration() const {return sizeof(WordType) * W;}
|
||||
#if !(CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64)
|
||||
unsigned int GetAlignment() const {return GetAlignmentOf<WordType>();}
|
||||
#endif
|
||||
unsigned int GetBytesPerIteration() const {return BYTES_PER_ITERATION;}
|
||||
unsigned int GetIterationsToBuffer() const {return X;}
|
||||
void WriteKeystream(byte *buffer, size_t iterationCount)
|
||||
{OperateKeystream(WRITE_KEYSTREAM, buffer, NULL, iterationCount);}
|
||||
bool CanOperateKeystream() const {return true;}
|
||||
virtual void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) =0;
|
||||
|
||||
template <class B>
|
||||
struct KeystreamOutput
|
||||
{
|
||||
KeystreamOutput(KeystreamOperation operation, byte *output, const byte *input)
|
||||
: m_operation(operation), m_output(output), m_input(input) {}
|
||||
|
||||
inline KeystreamOutput & operator()(WordType keystreamWord)
|
||||
{
|
||||
assert(IsAligned<WordType>(m_input));
|
||||
assert(IsAligned<WordType>(m_output));
|
||||
|
||||
if (!NativeByteOrderIs(B::ToEnum()))
|
||||
keystreamWord = ByteReverse(keystreamWord);
|
||||
|
||||
if (m_operation == WRITE_KEYSTREAM)
|
||||
*(WordType*)m_output = keystreamWord;
|
||||
else if (m_operation == XOR_KEYSTREAM)
|
||||
{
|
||||
*(WordType*)m_output = keystreamWord ^ *(WordType*)m_input;
|
||||
m_input += sizeof(WordType);
|
||||
}
|
||||
else if (m_operation == XOR_KEYSTREAM_INPLACE)
|
||||
*(WordType*)m_output ^= keystreamWord;
|
||||
|
||||
m_output += sizeof(WordType);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
KeystreamOperation m_operation;
|
||||
byte *m_output;
|
||||
const byte *m_input;
|
||||
};
|
||||
};
|
||||
|
||||
// use these to implement OperateKeystream
|
||||
// Emit keystream word a as word number i of the current iteration.
//   x - KeystreamOperation constant; its flag bits select the variant
//   b - byte order passed through to PutWord/GetWord
//   i - word index within the iteration
//   a - keystream word expression
// Relies on output, input and WordType being visible where the macro is expanded.
// When INPUT_NULL is set the word is stored as is, otherwise it is xor-ed with the
// matching input word first. Every macro parameter is parenthesized so that an
// expression argument (e.g. an index like n+1) cannot re-associate with the
// surrounding operators.
#define CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, b, i, a)	\
	PutWord(bool((x) & OUTPUT_ALIGNED), b, output+(i)*sizeof(WordType), ((x) & INPUT_NULL) ? (a) : (a) ^ GetWord<WordType>(bool((x) & INPUT_ALIGNED), b, input+(i)*sizeof(WordType)));
|
||||
// Emit the 128-bit keystream value a as 16-byte lane number i of the current iteration.
//   x - KeystreamOperation constant; its flag bits select the variant
//   i - lane index (units of __m128i)
//   a - __m128i keystream expression
// Relies on output and input being visible where the macro is expanded.
// INPUT_NULL skips the xor with input; INPUT_ALIGNED / OUTPUT_ALIGNED choose the
// aligned load/store intrinsics over the unaligned ones. Macro parameters are
// parenthesized so expression arguments cannot mis-bind.
#define CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, i, a)	{\
	__m128i t = ((x) & INPUT_NULL) ? (a) : _mm_xor_si128((a), ((x) & INPUT_ALIGNED) ? _mm_load_si128((__m128i *)input+(i)) : _mm_loadu_si128((__m128i *)input+(i)));\
	if ((x) & OUTPUT_ALIGNED) _mm_store_si128((__m128i *)output+(i), t);\
	else _mm_storeu_si128((__m128i *)output+(i), t);}
|
||||
// Dispatch on the run-time value of operation and expand x once per case with the
// matching KeystreamOperation constant (presumably so the flag tests inside the
// output macros become compile-time constants - confirm with the generated code).
//   x - function-like macro taking a KeystreamOperation constant
//   y - number of bytes produced per expansion
// Relies on operation, input and output being visible where the macro is expanded.
// After the selected case, input (when one is used) and output are advanced by y.
// An operation value outside the enum used to fall through silently while output
// was still advanced; it now trips an assert in debug builds.
#define CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(x, y)	\
	switch (operation)	\
	{	\
		case WRITE_KEYSTREAM:	\
			x(WRITE_KEYSTREAM)	\
			break;	\
		case XOR_KEYSTREAM:	\
			x(XOR_KEYSTREAM)	\
			input += (y);	\
			break;	\
		case XOR_KEYSTREAM_INPUT_ALIGNED:	\
			x(XOR_KEYSTREAM_INPUT_ALIGNED)	\
			input += (y);	\
			break;	\
		case XOR_KEYSTREAM_OUTPUT_ALIGNED:	\
			x(XOR_KEYSTREAM_OUTPUT_ALIGNED)	\
			input += (y);	\
			break;	\
		case WRITE_KEYSTREAM_ALIGNED:	\
			x(WRITE_KEYSTREAM_ALIGNED)	\
			break;	\
		case XOR_KEYSTREAM_BOTH_ALIGNED:	\
			x(XOR_KEYSTREAM_BOTH_ALIGNED)	\
			input += (y);	\
			break;	\
		default:	\
			assert(false);	\
			break;	\
	}	\
	output += (y);
|
||||
|
||||
template <class BASE = AbstractPolicyHolder<AdditiveCipherAbstractPolicy, TwoBases<SymmetricCipher, RandomNumberGenerator> > >
|
||||
class CRYPTOPP_NO_VTABLE AdditiveCipherTemplate : public BASE
|
||||
{
|
||||
public:
|
||||
byte GenerateByte();
|
||||
void GenerateBlock(byte *output, size_t size);
|
||||
void ProcessData(byte *outString, const byte *inString, size_t length);
|
||||
void GetNextIV(byte *iv) {this->AccessPolicy().CipherGetNextIV(iv);}
|
||||
void Resynchronize(const byte *iv);
|
||||
unsigned int OptimalBlockSize() const {return this->GetPolicy().GetBytesPerIteration();}
|
||||
unsigned int OptimalBlockSize() const {return this->GetPolicy().GetOptimalBlockSize();}
|
||||
unsigned int GetOptimalNextBlockSize() const {return (unsigned int)this->m_leftOver;}
|
||||
unsigned int OptimalDataAlignment() const {return this->GetPolicy().GetAlignment();}
|
||||
bool IsSelfInverting() const {return true;}
|
||||
|
Loading…
Reference in New Issue
Block a user