mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-23 09:59:42 +00:00
Add Intel and ARM intrinsics
Win32 and Win64 benefited from the Intel intrinsics. A32 and Aarch64 benefited from the ARM intrinsics. The intrinsics shaved 150 to 350 cycles from key setup. The intrinsics slightly slowed down modern GCC, and did not appear to affect old GCC. As such, Intel intrinsics were only enabled for Microsoft compilers. We were not able to improve encryption and decryption. In fact, some of the attempted macro conversions and intrinsics slowed things down considerably. For example, GCC 5.4 on x86_64 went from 120 MB/s to about 70 MB/s when we tried to improve code around the Key XOR Layer (ARIA_KXL).
This commit is contained in:
parent
f44e705c16
commit
59767be52e
268
aria.cpp
268
aria.cpp
@ -14,7 +14,11 @@
|
||||
#include "misc.h"
|
||||
#include "cpu.h"
|
||||
|
||||
#include <stdio.h>
|
||||
// Enable SSE intrinsics for Visual Studio. It reduces key schedule setup by 150
|
||||
// to 200 cycles. GCC does fine on its own, and the intrinsics slow it down slightly.
|
||||
#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE && _MSC_VER
|
||||
# define CRYPTOPP_ENABLE_ARIA_INTRINSICS 1
|
||||
#endif
|
||||
|
||||
ANONYMOUS_NAMESPACE_BEGIN
|
||||
|
||||
@ -313,46 +317,148 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||
// w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t is a 16 byte temp area.
|
||||
word32 *w0 = m_w.data(), *w1 = m_w.data()+8, *w2 = m_w.data()+12, *w3 = m_w.data()+16, *t = m_w.data()+20;
|
||||
|
||||
w0[0] = LoadWord<true>(mk,0); w0[1] = LoadWord<true>(mk,1);
|
||||
w0[2] = LoadWord<true>(mk,2); w0[3] = LoadWord<true>(mk,3);
|
||||
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||
if (HasSSSE3())
|
||||
{
|
||||
// 7 SSE instructions
|
||||
const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
|
||||
const __m128i w = _mm_shuffle_epi8(_mm_load_si128((const __m128i*)mk), m);
|
||||
_mm_store_si128((__m128i*)w0, w);
|
||||
|
||||
t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1];
|
||||
t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3];
|
||||
_mm_store_si128((__m128i*)t, _mm_xor_si128(w,
|
||||
_mm_load_si128((const __m128i*)KRK[q])));
|
||||
}
|
||||
else
|
||||
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||
{
|
||||
// 27 integer instructions
|
||||
w0[0] = LoadWord<true>(mk,0); w0[1] = LoadWord<true>(mk,1);
|
||||
w0[2] = LoadWord<true>(mk,2); w0[3] = LoadWord<true>(mk,3);
|
||||
|
||||
t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1];
|
||||
t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3];
|
||||
}
|
||||
|
||||
// 24 integer instructions
|
||||
ARIA_FO;
|
||||
|
||||
if (keyBits > 128)
|
||||
if (keyBits == 256)
|
||||
{
|
||||
w1[0] = LoadWord<true>(mk,4);
|
||||
w1[1] = LoadWord<true>(mk,5);
|
||||
|
||||
if (keyBits > 192)
|
||||
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||
if (HasSSSE3())
|
||||
{
|
||||
// 3 SSE instructions
|
||||
const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(w1),
|
||||
_mm_shuffle_epi8(_mm_load_si128((const __m128i*)(mk+16)), m));
|
||||
}
|
||||
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||
{
|
||||
// 14 integer instructions
|
||||
w1[0] = LoadWord<true>(mk,4);
|
||||
w1[1] = LoadWord<true>(mk,5);
|
||||
w1[2] = LoadWord<true>(mk,6);
|
||||
w1[3] = LoadWord<true>(mk,7);
|
||||
}
|
||||
else
|
||||
{
|
||||
w1[2]=w1[3]=0;
|
||||
}
|
||||
}
|
||||
else if (keyBits == 192)
|
||||
{
|
||||
w1[0] = LoadWord<true>(mk,4);
|
||||
w1[1] = LoadWord<true>(mk,5);
|
||||
w1[2] = w1[3] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
w1[0]=w1[1]=w1[2]=w1[3]=0;
|
||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
|
||||
if (HasSSE2())
|
||||
{
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(w1), _mm_setzero_si128());
|
||||
}
|
||||
else
|
||||
#endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
|
||||
{
|
||||
w1[0]=w1[1]=w1[2]=w1[3]=0;
|
||||
}
|
||||
}
|
||||
|
||||
w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
|
||||
t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
|
||||
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||
if (HasSSSE3())
|
||||
{
|
||||
// 4 integer, 7 SSE instructions
|
||||
const __m128i x = _mm_xor_si128(
|
||||
_mm_load_si128((const __m128i*)(w1)),
|
||||
_mm_load_si128((const __m128i*)(t)));
|
||||
|
||||
_mm_store_si128((__m128i*)(w1), x);
|
||||
_mm_store_si128((__m128i*)(t), x);
|
||||
|
||||
q = (q==2) ? 0 : (q+1);
|
||||
const __m128i y = _mm_xor_si128(
|
||||
_mm_load_si128((const __m128i*)(t)),
|
||||
_mm_load_si128((const __m128i*)(KRK[q])));
|
||||
|
||||
_mm_store_si128((__m128i*)(t), y);
|
||||
}
|
||||
else
|
||||
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||
{
|
||||
// 23 integer instructions
|
||||
w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
|
||||
t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
|
||||
|
||||
q = (q==2) ? 0 : (q+1);
|
||||
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
|
||||
}
|
||||
|
||||
q = (q==2) ? 0 : (q+1);
|
||||
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
|
||||
ARIA_FE;
|
||||
t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
|
||||
w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
|
||||
|
||||
q = (q==2) ? 0 : (q+1);
|
||||
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
|
||||
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||
if (HasSSSE3())
|
||||
{
|
||||
// 4 integer, 7 SSE instructions
|
||||
const __m128i x = _mm_xor_si128(
|
||||
_mm_load_si128((const __m128i*)(w0)),
|
||||
_mm_load_si128((const __m128i*)(t)));
|
||||
|
||||
_mm_store_si128((__m128i*)(w2), x);
|
||||
_mm_store_si128((__m128i*)(t), x);
|
||||
|
||||
q = (q==2) ? 0 : (q+1);
|
||||
const __m128i y = _mm_xor_si128(
|
||||
_mm_load_si128((const __m128i*)(t)),
|
||||
_mm_load_si128((const __m128i*)(KRK[q])));
|
||||
|
||||
_mm_store_si128((__m128i*)(t), y);
|
||||
}
|
||||
else
|
||||
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||
{
|
||||
// 23 integer instructions
|
||||
t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
|
||||
w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
|
||||
|
||||
q = (q==2) ? 0 : (q+1);
|
||||
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
|
||||
}
|
||||
|
||||
ARIA_FO;
|
||||
w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
|
||||
|
||||
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||
if (HasSSSE3())
|
||||
{
|
||||
// 3 SSE instructions
|
||||
const __m128i x = _mm_xor_si128(
|
||||
_mm_load_si128((const __m128i*)(w1)),
|
||||
_mm_load_si128((const __m128i*)(t)));
|
||||
|
||||
_mm_store_si128((__m128i*)(w3), x);
|
||||
}
|
||||
else
|
||||
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||
{
|
||||
// 14 integer instructions
|
||||
w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
|
||||
}
|
||||
|
||||
#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
|
||||
if (HasNEON())
|
||||
@ -384,7 +490,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
|
||||
{
|
||||
ARIA_GSRK<19>(w0, w1, rk + 0);
|
||||
ARIA_GSRK<19>(w1, w2, rk + 16);
|
||||
@ -423,6 +529,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||
rk = m_rk.data();
|
||||
r = R; q = Q;
|
||||
|
||||
// 32 integer instructions
|
||||
a=reinterpret_cast<word32*>(rk); z=a+r*4;
|
||||
t[0]=a[0]; t[1]=a[1]; t[2]=a[2]; t[3]=a[3];
|
||||
a[0]=z[0]; a[1]=z[1]; a[2]=z[2]; a[3]=z[3];
|
||||
@ -432,89 +539,92 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||
for (; a<z; a+=4, z-=4)
|
||||
{
|
||||
ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
|
||||
ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])
|
||||
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
|
||||
s0=t[0]; s1=t[1]; s2=t[2]; s3=t[3];
|
||||
ARIA_M1(z[0],t[0]); ARIA_M1(z[1],t[1]); ARIA_M1(z[2],t[2]); ARIA_M1(z[3],t[3]);
|
||||
ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])
|
||||
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
|
||||
a[0]=t[0]; a[1]=t[1]; a[2]=t[2]; a[3]=t[3];
|
||||
z[0]=s0; z[1]=s1; z[2]=s2; z[3]=s3;
|
||||
}
|
||||
ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
|
||||
ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])
|
||||
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
|
||||
z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
|
||||
}
|
||||
}
|
||||
|
||||
void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||
{
|
||||
const byte *i=inBlock, *x=xorBlock;
|
||||
byte *o = outBlock;
|
||||
|
||||
const byte *rk = reinterpret_cast<const byte*>(m_rk.data());
|
||||
word32 *t = const_cast<word32*>(m_w.data()+20);
|
||||
|
||||
t[0] = LoadWord<true>(i,0); t[1] = LoadWord<true>(i,1);
|
||||
t[2] = LoadWord<true>(i,2); t[3] = LoadWord<true>(i,3);
|
||||
// Visual Studio is generating bad code within the SSSE3 code block. It is
|
||||
// providing a NULL pointer or a pointer set to a constant like 0x1000.
|
||||
// It looks like some leftover garbage in the XMM register rather than
|
||||
// the pointer loaded into the integer register for the non-SSE code path.
|
||||
t[0] = LoadWord<true>(inBlock,0); t[1] = LoadWord<true>(inBlock,1);
|
||||
t[2] = LoadWord<true>(inBlock,2); t[3] = LoadWord<true>(inBlock,3);
|
||||
|
||||
if (m_rounds > 12) {
|
||||
ARIA_KXL rk+= 16; ARIA_FO
|
||||
ARIA_KXL rk+= 16; ARIA_FE
|
||||
ARIA_KXL; rk+= 16; ARIA_FO;
|
||||
ARIA_KXL; rk+= 16; ARIA_FE;
|
||||
}
|
||||
|
||||
if (m_rounds > 14) {
|
||||
ARIA_KXL rk+= 16; ARIA_FO
|
||||
ARIA_KXL rk+= 16; ARIA_FE
|
||||
ARIA_KXL; rk+= 16; ARIA_FO;
|
||||
ARIA_KXL; rk+= 16; ARIA_FE;
|
||||
}
|
||||
|
||||
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
|
||||
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
|
||||
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
|
||||
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
|
||||
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
|
||||
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16;
|
||||
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
|
||||
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
|
||||
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
|
||||
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
|
||||
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
|
||||
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16;
|
||||
|
||||
#ifdef IS_LITTLE_ENDIAN
|
||||
o[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3];
|
||||
o[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8) ^ rk[ 2];
|
||||
o[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ) ^ rk[ 1];
|
||||
o[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ) ^ rk[ 0];
|
||||
o[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ) ^ rk[ 7];
|
||||
o[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8) ^ rk[ 6];
|
||||
o[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ) ^ rk[ 5];
|
||||
o[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ) ^ rk[ 4];
|
||||
o[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ) ^ rk[11];
|
||||
o[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8) ^ rk[10];
|
||||
o[10] = (byte)(S1[ARIA_BRF(t[2],1)] ) ^ rk[ 9];
|
||||
o[11] = (byte)(S2[ARIA_BRF(t[2],0)] ) ^ rk[ 8];
|
||||
o[12] = (byte)(X1[ARIA_BRF(t[3],3)] ) ^ rk[15];
|
||||
o[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8) ^ rk[14];
|
||||
o[14] = (byte)(S1[ARIA_BRF(t[3],1)] ) ^ rk[13];
|
||||
o[15] = (byte)(S2[ARIA_BRF(t[3],0)] ) ^ rk[12];
|
||||
outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3];
|
||||
outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8) ^ rk[ 2];
|
||||
outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ) ^ rk[ 1];
|
||||
outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ) ^ rk[ 0];
|
||||
outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ) ^ rk[ 7];
|
||||
outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8) ^ rk[ 6];
|
||||
outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ) ^ rk[ 5];
|
||||
outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ) ^ rk[ 4];
|
||||
outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ) ^ rk[11];
|
||||
outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8) ^ rk[10];
|
||||
outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] ) ^ rk[ 9];
|
||||
outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] ) ^ rk[ 8];
|
||||
outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] ) ^ rk[15];
|
||||
outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8) ^ rk[14];
|
||||
outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ) ^ rk[13];
|
||||
outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ) ^ rk[12];
|
||||
#else
|
||||
#define ARIA_WORD(X,Y) (((word32 *)(X))[Y])
|
||||
o[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
|
||||
o[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
|
||||
o[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
|
||||
o[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
|
||||
o[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
|
||||
o[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
|
||||
o[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
|
||||
o[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
|
||||
o[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
|
||||
o[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
|
||||
o[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
|
||||
o[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
|
||||
o[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
|
||||
o[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
|
||||
o[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
|
||||
o[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
|
||||
ARIA_WORD(o,0)^=LoadWord<true>(rk,0); ARIA_WORD(o,1)^=LoadWord<true>(rk,1);
|
||||
ARIA_WORD(o,2)^=LoadWord<true>(rk,2); ARIA_WORD(o,3)^=LoadWord<true>(rk,3);
|
||||
outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
|
||||
outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
|
||||
outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
|
||||
outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
|
||||
outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
|
||||
outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
|
||||
outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
|
||||
outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
|
||||
outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
|
||||
outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
|
||||
outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
|
||||
outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
|
||||
outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
|
||||
outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
|
||||
outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
|
||||
outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
|
||||
ARIA_WORD(outBlock,0)^=LoadWord<true>(rk,0);
|
||||
ARIA_WORD(outBlock,1)^=LoadWord<true>(rk,1);
|
||||
ARIA_WORD(outBlock,2)^=LoadWord<true>(rk,2);
|
||||
ARIA_WORD(outBlock,3)^=LoadWord<true>(rk,3);
|
||||
#endif
|
||||
|
||||
if (x)
|
||||
if (xorBlock)
|
||||
for (unsigned int n=0; n<16; ++n)
|
||||
o[n] ^= x[n];
|
||||
outBlock[n] ^= xorBlock[n];
|
||||
}
|
||||
|
||||
NAMESPACE_END
|
||||
|
8
config.h
8
config.h
@ -402,7 +402,7 @@ NAMESPACE_END
|
||||
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
|
||||
#endif
|
||||
|
||||
#if !defined(CRYPTOPP_DISABLE_SSE3) && (_MSC_VER >= 1500 || (defined(__SSE3__) && defined(__SSSE3__)))
|
||||
#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSSE3__) && defined(__SSSE3__)))
|
||||
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 1
|
||||
#else
|
||||
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
|
||||
@ -423,6 +423,12 @@ NAMESPACE_END
|
||||
#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
|
||||
#endif
|
||||
|
||||
#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || defined(__GNUC__) || (defined(__SSSE3__) && defined(__SSSE3__)))
|
||||
#define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 1
|
||||
#else
|
||||
#define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 0
|
||||
#endif
|
||||
|
||||
// Intrinsics available in GCC 4.3 (http://gcc.gnu.org/gcc-4.3/changes.html) and
|
||||
// MSVC 2008 (http://msdn.microsoft.com/en-us/library/bb892950%28v=vs.90%29.aspx)
|
||||
// SunCC could generate SSE4 at 12.1, but the intrinsics are missing until 12.4.
|
||||
|
Loading…
Reference in New Issue
Block a user