Add Intel and ARM intrinsics

Win32 and Win64 benefited from the Intel intrinsics. A32 and Aarch64 benefited from the ARM intrinsics. The intrinsics shaved 150 to 350 cycles from key setup.

The intrinsics slowed modern GCC down a small bit, and did not appear to affect old GCC. As such, Intel intrinsics were only enabled for Microsoft compilers.

We were not able to improve encryption and decryption. In fact, some of the attempted macro conversions and intrinsics attempts slowed things down considerably. For example, GCC 5.4 on x86_64 went from 120 MB/s to about 70 MB/s when we tried to improve code around the Key XOR Layer (ARIA_KXL).
This commit is contained in:
Jeffrey Walton 2017-04-12 23:28:41 -04:00
parent f44e705c16
commit 59767be52e
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 196 additions and 80 deletions

268
aria.cpp
View File

@ -14,7 +14,11 @@
#include "misc.h"
#include "cpu.h"
#include <stdio.h>
// Enable SSE intrinsics for Visual Studio. It reduces key schedule setup by 150
// to 200 cycles. GCC does fine on its own, and it slows things down a small bit.
#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE && _MSC_VER
# define CRYPTOPP_ENABLE_ARIA_INTRINSICS 1
#endif
ANONYMOUS_NAMESPACE_BEGIN
@ -313,46 +317,148 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
// w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t is a 16 byte temp area.
word32 *w0 = m_w.data(), *w1 = m_w.data()+8, *w2 = m_w.data()+12, *w3 = m_w.data()+16, *t = m_w.data()+20;
w0[0] = LoadWord<true>(mk,0); w0[1] = LoadWord<true>(mk,1);
w0[2] = LoadWord<true>(mk,2); w0[3] = LoadWord<true>(mk,3);
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
if (HasSSSE3())
{
// 7 SSE instructions
const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
const __m128i w = _mm_shuffle_epi8(_mm_load_si128((const __m128i*)mk), m);
_mm_store_si128((__m128i*)w0, w);
t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1];
t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3];
_mm_store_si128((__m128i*)t, _mm_xor_si128(w,
_mm_load_si128((const __m128i*)KRK[q])));
}
else
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
{
// 27 integer instructions
w0[0] = LoadWord<true>(mk,0); w0[1] = LoadWord<true>(mk,1);
w0[2] = LoadWord<true>(mk,2); w0[3] = LoadWord<true>(mk,3);
t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1];
t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3];
}
// 24 integer instructions
ARIA_FO;
if (keyBits > 128)
if (keyBits == 256)
{
w1[0] = LoadWord<true>(mk,4);
w1[1] = LoadWord<true>(mk,5);
if (keyBits > 192)
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
if (HasSSSE3())
{
// 3 SSE instructions
const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
_mm_store_si128(reinterpret_cast<__m128i*>(w1),
_mm_shuffle_epi8(_mm_load_si128((const __m128i*)(mk+16)), m));
}
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
{
// 14 integer instructions
w1[0] = LoadWord<true>(mk,4);
w1[1] = LoadWord<true>(mk,5);
w1[2] = LoadWord<true>(mk,6);
w1[3] = LoadWord<true>(mk,7);
}
else
{
w1[2]=w1[3]=0;
}
}
else if (keyBits == 192)
{
w1[0] = LoadWord<true>(mk,4);
w1[1] = LoadWord<true>(mk,5);
w1[2] = w1[3] = 0;
}
else
{
w1[0]=w1[1]=w1[2]=w1[3]=0;
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
if (HasSSE2())
{
_mm_store_si128(reinterpret_cast<__m128i*>(w1), _mm_setzero_si128());
}
else
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
{
w1[0]=w1[1]=w1[2]=w1[3]=0;
}
}
w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
if (HasSSSE3())
{
// 4 integer, 7 SSE instructions
const __m128i x = _mm_xor_si128(
_mm_load_si128((const __m128i*)(w1)),
_mm_load_si128((const __m128i*)(t)));
_mm_store_si128((__m128i*)(w1), x);
_mm_store_si128((__m128i*)(t), x);
q = (q==2) ? 0 : (q+1);
const __m128i y = _mm_xor_si128(
_mm_load_si128((const __m128i*)(t)),
_mm_load_si128((const __m128i*)(KRK[q])));
_mm_store_si128((__m128i*)(t), y);
}
else
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
{
// 23 integer instructions
w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
q = (q==2) ? 0 : (q+1);
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
}
q = (q==2) ? 0 : (q+1);
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
ARIA_FE;
t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
q = (q==2) ? 0 : (q+1);
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
if (HasSSSE3())
{
// 4 integer, 7 SSE instructions
const __m128i x = _mm_xor_si128(
_mm_load_si128((const __m128i*)(w0)),
_mm_load_si128((const __m128i*)(t)));
_mm_store_si128((__m128i*)(w2), x);
_mm_store_si128((__m128i*)(t), x);
q = (q==2) ? 0 : (q+1);
const __m128i y = _mm_xor_si128(
_mm_load_si128((const __m128i*)(t)),
_mm_load_si128((const __m128i*)(KRK[q])));
_mm_store_si128((__m128i*)(t), y);
}
else
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
{
// 23 integer instructions
t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
q = (q==2) ? 0 : (q+1);
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
}
ARIA_FO;
w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
if (HasSSSE3())
{
// 3 SSE instructions
const __m128i x = _mm_xor_si128(
_mm_load_si128((const __m128i*)(w1)),
_mm_load_si128((const __m128i*)(t)));
_mm_store_si128((__m128i*)(w3), x);
}
else
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
{
// 14 integer instructions
w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
}
#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
if (HasNEON())
@ -384,7 +490,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
}
}
else
#endif
#endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
{
ARIA_GSRK<19>(w0, w1, rk + 0);
ARIA_GSRK<19>(w1, w2, rk + 16);
@ -423,6 +529,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
rk = m_rk.data();
r = R; q = Q;
// 32 integer intructions
a=reinterpret_cast<word32*>(rk); z=a+r*4;
t[0]=a[0]; t[1]=a[1]; t[2]=a[2]; t[3]=a[3];
a[0]=z[0]; a[1]=z[1]; a[2]=z[2]; a[3]=z[3];
@ -432,89 +539,92 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
for (; a<z; a+=4, z-=4)
{
ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
s0=t[0]; s1=t[1]; s2=t[2]; s3=t[3];
ARIA_M1(z[0],t[0]); ARIA_M1(z[1],t[1]); ARIA_M1(z[2],t[2]); ARIA_M1(z[3],t[3]);
ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
a[0]=t[0]; a[1]=t[1]; a[2]=t[2]; a[3]=t[3];
z[0]=s0; z[1]=s1; z[2]=s2; z[3]=s3;
}
ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
}
}
void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
const byte *i=inBlock, *x=xorBlock;
byte *o = outBlock;
const byte *rk = reinterpret_cast<const byte*>(m_rk.data());
word32 *t = const_cast<word32*>(m_w.data()+20);
t[0] = LoadWord<true>(i,0); t[1] = LoadWord<true>(i,1);
t[2] = LoadWord<true>(i,2); t[3] = LoadWord<true>(i,3);
// Visual Studio is generating bad code within the SSSE3 code block. It is
// providing a NULL pointer or a pointer set to a constant like 0x1000.
// It looks like some leftover garbage in the XMM register rather than
// the pointer loaded into the integer register for the non-SSE code path.
t[0] = LoadWord<true>(inBlock,0); t[1] = LoadWord<true>(inBlock,1);
t[2] = LoadWord<true>(inBlock,2); t[3] = LoadWord<true>(inBlock,3);
if (m_rounds > 12) {
ARIA_KXL rk+= 16; ARIA_FO
ARIA_KXL rk+= 16; ARIA_FE
ARIA_KXL; rk+= 16; ARIA_FO;
ARIA_KXL; rk+= 16; ARIA_FE;
}
if (m_rounds > 14) {
ARIA_KXL rk+= 16; ARIA_FO
ARIA_KXL rk+= 16; ARIA_FE
ARIA_KXL; rk+= 16; ARIA_FO;
ARIA_KXL; rk+= 16; ARIA_FE;
}
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16;
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16;
#ifdef IS_LITTLE_ENDIAN
o[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3];
o[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8) ^ rk[ 2];
o[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ) ^ rk[ 1];
o[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ) ^ rk[ 0];
o[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ) ^ rk[ 7];
o[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8) ^ rk[ 6];
o[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ) ^ rk[ 5];
o[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ) ^ rk[ 4];
o[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ) ^ rk[11];
o[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8) ^ rk[10];
o[10] = (byte)(S1[ARIA_BRF(t[2],1)] ) ^ rk[ 9];
o[11] = (byte)(S2[ARIA_BRF(t[2],0)] ) ^ rk[ 8];
o[12] = (byte)(X1[ARIA_BRF(t[3],3)] ) ^ rk[15];
o[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8) ^ rk[14];
o[14] = (byte)(S1[ARIA_BRF(t[3],1)] ) ^ rk[13];
o[15] = (byte)(S2[ARIA_BRF(t[3],0)] ) ^ rk[12];
outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3];
outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8) ^ rk[ 2];
outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ) ^ rk[ 1];
outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ) ^ rk[ 0];
outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ) ^ rk[ 7];
outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8) ^ rk[ 6];
outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ) ^ rk[ 5];
outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ) ^ rk[ 4];
outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ) ^ rk[11];
outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8) ^ rk[10];
outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] ) ^ rk[ 9];
outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] ) ^ rk[ 8];
outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] ) ^ rk[15];
outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8) ^ rk[14];
outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ) ^ rk[13];
outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ) ^ rk[12];
#else
#define ARIA_WORD(X,Y) (((word32 *)(X))[Y])
o[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
o[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
o[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
o[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
o[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
o[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
o[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
o[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
o[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
o[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
o[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
o[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
o[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
o[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
o[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
o[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
ARIA_WORD(o,0)^=LoadWord<true>(rk,0); ARIA_WORD(o,1)^=LoadWord<true>(rk,1);
ARIA_WORD(o,2)^=LoadWord<true>(rk,2); ARIA_WORD(o,3)^=LoadWord<true>(rk,3);
outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
ARIA_WORD(outBlock,0)^=LoadWord<true>(rk,0);
ARIA_WORD(outBlock,1)^=LoadWord<true>(rk,1);
ARIA_WORD(outBlock,2)^=LoadWord<true>(rk,2);
ARIA_WORD(outBlock,3)^=LoadWord<true>(rk,3);
#endif
if (x)
if (xorBlock)
for (unsigned int n=0; n<16; ++n)
o[n] ^= x[n];
outBlock[n] ^= xorBlock[n];
}
NAMESPACE_END

View File

@ -402,7 +402,7 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
#endif
#if !defined(CRYPTOPP_DISABLE_SSE3) && (_MSC_VER >= 1500 || (defined(__SSE3__) && defined(__SSSE3__)))
#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSSE3__) && defined(__SSSE3__)))
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 1
#else
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
@ -423,6 +423,12 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
#endif
#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || defined(__GNUC__) || (defined(__SSSE3__) && defined(__SSSE3__)))
#define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 1
#else
#define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 0
#endif
// Intrinsics availible in GCC 4.3 (http://gcc.gnu.org/gcc-4.3/changes.html) and
// MSVC 2008 (http://msdn.microsoft.com/en-us/library/bb892950%28v=vs.90%29.aspx)
// SunCC could generate SSE4 at 12.1, but the intrinsics are missing until 12.4.