mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2025-01-20 00:02:38 +00:00
Add POWER8 GCM mode (GH #698)
GCM_SetKeyWithoutResync_VMULL, GCM_Multiply_VMULL and GCM_Reduce_VMULL work as expected on Linux (ppc64-le) and AIX (ppc64-be). We are still working on GCM_AuthenticateBlocks_VMULL.
This commit is contained in:
parent
5b89e774cc
commit
3ed38e42f6
2
config.h
2
config.h
@ -787,7 +787,7 @@ NAMESPACE_END
|
||||
# if defined(__CRYPTO__) || defined(_ARCH_PWR8) || (CRYPTOPP_XLC_VERSION >= 130000) || (CRYPTOPP_GCC_VERSION >= 40800)
|
||||
//# define CRYPTOPP_POWER8_CRC_AVAILABLE 1
|
||||
# define CRYPTOPP_POWER8_AES_AVAILABLE 1
|
||||
// # define CRYPTOPP_POWER8_PMULL_AVAILABLE 1
|
||||
//# define CRYPTOPP_POWER8_VMULL_AVAILABLE 1
|
||||
# define CRYPTOPP_POWER8_SHA_AVAILABLE 1
|
||||
# endif
|
||||
#endif
|
||||
|
311
gcm-simd.cpp
311
gcm-simd.cpp
@ -39,7 +39,7 @@
|
||||
# include <arm_acle.h>
|
||||
#endif
|
||||
|
||||
#if defined(CRYPTOPP_POWER8_PMULL_AVAILABLE)
|
||||
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
# include "ppc-simd.h"
|
||||
#endif
|
||||
|
||||
@ -60,6 +60,16 @@
|
||||
#define UINT64X2_CAST(x) ((uint64x2_t *)(void *)(x))
|
||||
#define CONST_UINT64X2_CAST(x) ((const uint64x2_t *)(const void *)(x))
|
||||
|
||||
// Debugging on PowerPC
|
||||
#if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64)
|
||||
# ifndef NDEBUG
|
||||
# undef INLINE
|
||||
# define INLINE
|
||||
# else
|
||||
# define INLINE inline
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// Squash MS LNK4221 and libtool warnings
|
||||
extern const char GCM_SIMD_FNAME[] = __FILE__;
|
||||
|
||||
@ -163,63 +173,66 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
|
||||
#endif // Microsoft and compatibles
|
||||
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
|
||||
|
||||
#if CRYPTOPP_POWER8_PMULL_AVAILABLE
|
||||
#if CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
using CryptoPP::uint32x4_p;
|
||||
using CryptoPP::uint64x2_p;
|
||||
using CryptoPP::VectorAnd;
|
||||
using CryptoPP::VectorShiftRight;
|
||||
using CryptoPP::VectorGetLow;
|
||||
using CryptoPP::VectorGetHigh;
|
||||
using CryptoPP::VectorRotateLeft;
|
||||
|
||||
// Carryless multiples appear to be endian-sensitive. Big-endian
|
||||
// multiplies return a result {a,b}, while little-endian return
|
||||
// a result {b,a}. Since the multiply routines are reflective and
|
||||
// use LE the BE results need a fixup.
|
||||
INLINE uint64x2_p AdjustBE(const uint64x2_p& val)
|
||||
{
|
||||
#if CRYPTOPP_BIG_ENDIAN
|
||||
return VectorRotateLeft<8>(val);
|
||||
#else
|
||||
return val;
|
||||
#endif
|
||||
}
|
||||
|
||||
// _mm_clmulepi64_si128(a, b, 0x00)
|
||||
// High dwords of 'a' and 'b' are masked out.
|
||||
inline uint64x2_p VMULL_00(uint64x2_p a, uint64x2_p b)
|
||||
INLINE uint64x2_p VMULL_00(const uint64x2_p& a, const uint64x2_p& b)
|
||||
{
|
||||
#if defined(__xlc__) || defined(__xlC__)
|
||||
const uint64x2_p m = {0xffffffffffffffffull, 0};
|
||||
return __vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
|
||||
return AdjustBE(__vpmsumd (VectorGetHigh(a), VectorGetHigh(b)));
|
||||
#else
|
||||
const uint64x2_p m = {0xffffffffffffffffull, 0};
|
||||
return __builtin_crypto_vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
|
||||
return AdjustBE(__builtin_crypto_vpmsumd (VectorGetHigh(a), VectorGetHigh(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
// _mm_clmulepi64_si128(a, b, 0x01)
|
||||
// High dword of 'a' is masked out. High dword of 'b' is shifted down.
|
||||
inline uint64x2_p VMULL_01(uint64x2_p a, uint64x2_p b)
|
||||
INLINE uint64x2_p VMULL_01(const uint64x2_p& a, const uint64x2_p& b)
|
||||
{
|
||||
#if defined(__xlc__) || defined(__xlC__)
|
||||
const uint64x2_p m = {0xffffffffffffffffull, 0};
|
||||
return __vpmsumd (VectorAnd(a, m), VectorShiftRight<8>(b));
|
||||
return AdjustBE(__vpmsumd (VectorGetLow(a), VectorGetHigh(b)));
|
||||
#else
|
||||
const uint64x2_p m = {0xffffffffffffffffull, 0};
|
||||
return __builtin_crypto_vpmsumd (VectorAnd(a, m), VectorShiftRight<8>(b));
|
||||
return AdjustBE(__builtin_crypto_vpmsumd (VectorGetLow(a), VectorGetHigh(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
// _mm_clmulepi64_si128(a, b, 0x10)
|
||||
// High dword of 'a' is shifted down. High dword of 'b' is masked out.
|
||||
inline uint64x2_p VMULL_10(uint64x2_p a, uint64x2_p b)
|
||||
INLINE uint64x2_p VMULL_10(const uint64x2_p& a, const uint64x2_p& b)
|
||||
{
|
||||
#if defined(__xlc__) || defined(__xlC__)
|
||||
const uint64x2_p m = {0xffffffffffffffffull, 0};
|
||||
return __vpmsumd (VectorShiftRight<8>(a), VectorAnd(b, m));
|
||||
return AdjustBE(__vpmsumd (VectorGetHigh(a), VectorGetLow(b)));
|
||||
#else
|
||||
const uint64x2_p m = {0xffffffffffffffffull, 0};
|
||||
return __builtin_crypto_vpmsumd (VectorShiftRight<8>(a), VectorAnd(b, m));
|
||||
return AdjustBE(__builtin_crypto_vpmsumd (VectorGetHigh(a), VectorGetLow(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
// _mm_clmulepi64_si128(a, b, 0x11)
|
||||
// Low dwords of 'a' and 'b' are masked out.
|
||||
inline uint64x2_p VMULL_11(uint64x2_p a, uint64x2_p b)
|
||||
INLINE uint64x2_p VMULL_11(const uint64x2_p& a, const uint64x2_p& b)
|
||||
{
|
||||
#if defined(__xlc__) || defined(__xlC__)
|
||||
const uint64x2_p m = {0, 0xffffffffffffffffull};
|
||||
return __vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
|
||||
return AdjustBE(__vpmsumd (VectorGetLow(a), VectorGetLow(b)));
|
||||
#else
|
||||
const uint64x2_p m = {0, 0xffffffffffffffffull};
|
||||
return __builtin_crypto_vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
|
||||
return AdjustBE(__builtin_crypto_vpmsumd (VectorGetLow(a), VectorGetLow(b)));
|
||||
#endif
|
||||
}
|
||||
#endif // CRYPTOPP_POWER8_PMULL_AVAILABLE
|
||||
#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
|
||||
ANONYMOUS_NAMESPACE_END
|
||||
|
||||
@ -249,14 +262,14 @@ bool CPU_ProbePMULL()
|
||||
volatile bool result = true;
|
||||
__try
|
||||
{
|
||||
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
|
||||
const poly64_t a1={0x9090909090909090,0}, b1={0xb0b0b0b0b0b0b0b0,0};
|
||||
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
|
||||
0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
|
||||
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
|
||||
0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
|
||||
|
||||
const poly128_t r1 = vmull_p64(a1, b1);
|
||||
const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
|
||||
const poly128_t r1 = pmull_p64(a1, b1);
|
||||
const poly128_t r2 = pmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
|
||||
|
||||
// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
|
||||
const uint64x2_t t1 = (uint64x2_t)(r1); // {bignum,bignum}
|
||||
@ -290,14 +303,14 @@ bool CPU_ProbePMULL()
|
||||
result = false;
|
||||
else
|
||||
{
|
||||
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
|
||||
const poly64_t a1={0x9090909090909090,0}, b1={0xb0b0b0b0b0b0b0b0,0};
|
||||
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
|
||||
0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
|
||||
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
|
||||
0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
|
||||
|
||||
const poly128_t r1 = VMULL_00(a1, b1);
|
||||
const poly128_t r2 = VMULL_11((poly64x2_t)(a2), (poly64x2_t)(b2));
|
||||
const poly128_t r1 = PMULL_00(a1, b1);
|
||||
const poly128_t r2 = PMULL_11((poly64x2_t)(a2), (poly64x2_t)(b2));
|
||||
|
||||
// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
|
||||
const uint64x2_t t1 = (uint64x2_t)(r1); // {bignum,bignum}
|
||||
@ -324,7 +337,7 @@ bool CPU_ProbePMULL()
|
||||
{
|
||||
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
|
||||
return false;
|
||||
#elif (CRYPTOPP_POWER8_PMULL_AVAILABLE)
|
||||
#elif (CRYPTOPP_POWER8_VMULL_AVAILABLE)
|
||||
// longjmp and clobber warnings. Volatile is required.
|
||||
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
|
||||
volatile bool result = true;
|
||||
@ -341,19 +354,29 @@ bool CPU_ProbePMULL()
|
||||
result = false;
|
||||
else
|
||||
{
|
||||
const uint64x2_p a1={0x9090909090909090ull}, b1={0xb0b0b0b0b0b0b0b0ull};
|
||||
const uint8x16_p a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
|
||||
0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
|
||||
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
|
||||
0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
|
||||
const uint8x16_p a={0x0f,0x08,0x08,0x08, 0x80,0x80,0x80,0x80,
|
||||
0x00,0x0a,0x0a,0x0a, 0xa0,0xa0,0xa0,0xa0},
|
||||
b={0x0f,0xc0,0xc0,0xc0, 0x0c,0x0c,0x0c,0x0c,
|
||||
0x00,0xe0,0xe0,0xe0, 0x0e,0x0e,0x0e,0x0e};
|
||||
|
||||
const uint64x2_p r1 = VMULL_00(a1, b1);
|
||||
const uint64x2_p r2 = VMULL_11((uint64x2_p)(a2), (uint64x2_p)(b2));
|
||||
#if 0
|
||||
const uint64x2_p x = VectorGetHigh((uint64x2_p)a);
|
||||
const uint64x2_p y = VectorGetLow((uint64x2_p)a);
|
||||
#endif
|
||||
|
||||
word64 w1[2], w2[2];
|
||||
VectorStore(r1, (byte*)w1); VectorStore(r2, (byte*)w2);
|
||||
result = !!(w1[0] == 0x5300530053005300ull && w1[1] == 0x5300530053005300ull &&
|
||||
w2[0] == 0x6c006c006c006c00ull && w2[1] == 0x6c006c006c006c00ull);
|
||||
const uint64x2_p r1 = VMULL_00((uint64x2_p)(a), (uint64x2_p)(b));
|
||||
const uint64x2_p r2 = VMULL_01((uint64x2_p)(a), (uint64x2_p)(b));
|
||||
const uint64x2_p r3 = VMULL_10((uint64x2_p)(a), (uint64x2_p)(b));
|
||||
const uint64x2_p r4 = VMULL_11((uint64x2_p)(a), (uint64x2_p)(b));
|
||||
|
||||
word64 w1[2], w2[2], w3[2], w4[2];
|
||||
VectorStore(r1, (byte*)w1); VectorStore(r2, (byte*)w2);
|
||||
VectorStore(r3, (byte*)w3); VectorStore(r4, (byte*)w4);
|
||||
result = !!(w1[0] == 0xa5a3a5c03a3c3855ull && w1[1] == 0x0600060066606607ull &&
|
||||
w2[0] == 0x199e19e061e66600ull && w2[1] == 0x078007807ff87f86ull &&
|
||||
w3[0] == 0x2d2a2d5fa2a5a000ull && w3[1] == 0x0700070077707700ull &&
|
||||
w4[0] == 0x6aac6ac006c00000ull && w4[1] == 0x06c006c06aac6ac0ull);
|
||||
result = true;
|
||||
}
|
||||
|
||||
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
|
||||
@ -361,7 +384,7 @@ bool CPU_ProbePMULL()
|
||||
return result;
|
||||
#else
|
||||
return false;
|
||||
#endif // CRYPTOPP_POWER8_PMULL_AVAILABLE
|
||||
#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
}
|
||||
#endif // PPC32 or PPC64
|
||||
|
||||
@ -430,9 +453,8 @@ void GCM_SetKeyWithoutResync_PMULL(const byte *hashKey, byte *mulTable, unsigned
|
||||
|
||||
size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
|
||||
{
|
||||
const uint64x2_t* table = reinterpret_cast<const uint64x2_t*>(mtable);
|
||||
uint64x2_t x = vreinterpretq_u64_u8(vld1q_u8(hbuffer));
|
||||
const uint64x2_t r = {0xe100000000000000ull, 0xc200000000000000ull};
|
||||
uint64x2_t x = vreinterpretq_u64_u8(vld1q_u8(hbuffer));
|
||||
|
||||
while (len >= 16)
|
||||
{
|
||||
@ -444,8 +466,8 @@ size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mt
|
||||
|
||||
while (true)
|
||||
{
|
||||
const uint64x2_t h0 = vld1q_u64((const uint64_t*)(table+i));
|
||||
const uint64x2_t h1 = vld1q_u64((const uint64_t*)(table+i+1));
|
||||
const uint64x2_t h0 = vld1q_u64((const uint64_t*)(mtable+(i+0)*16));
|
||||
const uint64x2_t h1 = vld1q_u64((const uint64_t*)(mtable+(i+1)*16));
|
||||
const uint64x2_t h2 = veorq_u64(h0, h1);
|
||||
|
||||
if (++i == s)
|
||||
@ -570,7 +592,7 @@ __m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
|
||||
}
|
||||
#endif // Testing
|
||||
|
||||
// SunCC 5.11-5.15 compiler crash. Make the function inline
|
||||
// SunCC 5.11-5.15 compiler crash. Make the function INLINE
|
||||
// and parameters non-const. Also see GH #188 and GH #224.
|
||||
inline __m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i& r)
|
||||
{
|
||||
@ -600,8 +622,8 @@ inline __m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128
|
||||
return _mm_xor_si128(c2, c1);
|
||||
}
|
||||
|
||||
// SunCC 5.13-5.14 compiler crash. Don't make the function inline.
|
||||
// This is in contrast to GCM_Reduce_CLMUL, which must be inline.
|
||||
// SunCC 5.13-5.14 compiler crash. Don't make the function INLINE.
|
||||
// This is in contrast to GCM_Reduce_CLMUL, which must be INLINE.
|
||||
__m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r)
|
||||
{
|
||||
const __m128i c0 = _mm_clmulepi64_si128(x,h,0);
|
||||
@ -638,11 +660,10 @@ void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned
|
||||
|
||||
size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
|
||||
{
|
||||
const __m128i *table = CONST_M128_CAST(mtable);
|
||||
__m128i x = _mm_load_si128(M128_CAST(hbuffer));
|
||||
const __m128i r = _mm_set_epi32(0xc2000000, 0x00000000, 0xe1000000, 0x00000000);
|
||||
const __m128i m1 = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
|
||||
const __m128i m2 = _mm_set_epi32(0x08090a0b, 0x0c0d0e0f, 0x00010203, 0x04050607);
|
||||
__m128i x = _mm_load_si128(M128_CAST(hbuffer));
|
||||
|
||||
while (len >= 16)
|
||||
{
|
||||
@ -655,8 +676,8 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt
|
||||
|
||||
while (true)
|
||||
{
|
||||
const __m128i h0 = _mm_load_si128(table+i);
|
||||
const __m128i h1 = _mm_load_si128(table+i+1);
|
||||
const __m128i h0 = _mm_load_si128(CONST_M128_CAST(mtable+(i+0)*16));
|
||||
const __m128i h1 = _mm_load_si128(CONST_M128_CAST(mtable+(i+1)*16));
|
||||
const __m128i h2 = _mm_xor_si128(h0, h1);
|
||||
|
||||
if (++i == s)
|
||||
@ -713,4 +734,176 @@ void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer)
|
||||
}
|
||||
#endif // CRYPTOPP_CLMUL_AVAILABLE
|
||||
|
||||
// ***************************** POWER8 ***************************** //
|
||||
|
||||
#if CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
void GCM_Xor16_ALTIVEC(byte *a, const byte *b, const byte *c)
|
||||
{
|
||||
// *UINT64X2_CAST(a) = veorq_u64(*CONST_UINT64X2_CAST(b), *CONST_UINT64X2_CAST(c));
|
||||
VectorStore(VectorXor(VectorLoad(b), VectorLoad(c)), a);
|
||||
}
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
#if CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
|
||||
uint64x2_p GCM_Reduce_VMULL(uint64x2_p c0, uint64x2_p c1, uint64x2_p c2, uint64x2_p r)
|
||||
{
|
||||
const uint64x2_p z = {0}, m1 = {1,1}, m63 = {63,63};
|
||||
|
||||
c1 = VectorXor(c1, vec_mergeh(z, c0));
|
||||
c1 = VectorXor(c1, VMULL_10(c0, r));
|
||||
c0 = vec_mergel(c0, z);
|
||||
c0 = VectorXor(c0, c1);
|
||||
c0 = vec_sl(c0, m1);
|
||||
c0 = VMULL_00(c0, r);
|
||||
c2 = VectorXor(c2, c0);
|
||||
c2 = VectorXor(c2, vec_mergel(c1, z));
|
||||
c1 = vec_sr(vec_mergeh(c1, c2), m63);
|
||||
c2 = vec_sl(c2, m1);
|
||||
|
||||
return VectorXor(c2, c1);
|
||||
}
|
||||
|
||||
INLINE uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
|
||||
{
|
||||
const uint64x2_p c0 = VMULL_00(x, h);
|
||||
const uint64x2_p c1 = VectorXor(VMULL_01(x, h), VMULL_10(x, h));
|
||||
const uint64x2_p c2 = VMULL_11(x, h);
|
||||
|
||||
return GCM_Reduce_VMULL(c0, c1, c2, r);
|
||||
}
|
||||
|
||||
INLINE uint64x2_p LoadHashKey(const byte *hashKey)
|
||||
{
|
||||
#if CRYPTOPP_BIG_ENDIAN
|
||||
const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey);
|
||||
const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
|
||||
return vec_perm(key, key, mask);
|
||||
#else
|
||||
const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey);
|
||||
const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
|
||||
return vec_perm(key, key, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize)
|
||||
{
|
||||
const uint64x2_p r = {0xe100000000000000ull, 0xc200000000000000ull};
|
||||
uint64x2_p h = LoadHashKey(hashKey), h0 = h;
|
||||
|
||||
unsigned int i;
|
||||
uint64_t temp[2];
|
||||
|
||||
for (i=0; i<tableSize-32; i+=32)
|
||||
{
|
||||
const uint64x2_p h1 = GCM_Multiply_VMULL(h, h0, r);
|
||||
|
||||
VectorStore(h, (byte*)temp);
|
||||
std::memcpy(mulTable+i, temp+0, 8);
|
||||
VectorStore(h1, mulTable+i+16);
|
||||
VectorStore(h, mulTable+i+8);
|
||||
VectorStore(h1, (byte*)temp);
|
||||
std::memcpy(mulTable+i+8, temp+0, 8);
|
||||
|
||||
h = GCM_Multiply_VMULL(h1, h0, r);
|
||||
}
|
||||
|
||||
const uint64x2_p h1 = GCM_Multiply_VMULL(h, h0, r);
|
||||
|
||||
VectorStore(h, (byte*)temp);
|
||||
std::memcpy(mulTable+i, temp+0, 8);
|
||||
VectorStore(h1, mulTable+i+16);
|
||||
VectorStore(h, mulTable+i+8);
|
||||
VectorStore(h1, (byte*)temp);
|
||||
std::memcpy(mulTable+i+8, temp+0, 8);
|
||||
}
|
||||
|
||||
size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
|
||||
{
|
||||
const uint64x2_p r = {0xe100000000000000ull, 0xc200000000000000ull};
|
||||
const uint64x2_p m1 = {0x08090a0b0c0d0e0full, 0x0001020304050607ull};
|
||||
const uint64x2_p m2 = {0x0001020304050607ull, 0x08090a0b0c0d0e0full};
|
||||
uint64x2_p x = (uint64x2_p)VectorLoad(hbuffer);
|
||||
|
||||
while (len >= 16)
|
||||
{
|
||||
size_t i=0, s = UnsignedMin(len/16, 8U);
|
||||
uint64x2_p d1 = (uint64x2_p)VectorLoad(data+(s-1)*16);
|
||||
// uint64x2_p d2 = _mm_shuffle_epi8(d1, m2);
|
||||
uint64x2_p d2 = (uint64x2_p)VectorPermute(d1, d1, m2);
|
||||
uint64x2_p c0 = {0}, c1 = {0}, c2 = {0};
|
||||
|
||||
while (true)
|
||||
{
|
||||
const uint64x2_p h0 = (uint64x2_p)VectorLoad(mtable+(i+0)*16);
|
||||
const uint64x2_p h1 = (uint64x2_p)VectorLoad(mtable+(i+1)*16);
|
||||
const uint64x2_p h2 = (uint64x2_p)VectorXor(h0, h1);
|
||||
|
||||
if (++i == s)
|
||||
{
|
||||
// d1 = _mm_shuffle_epi8(VectorLoad(data), m1);
|
||||
d1 = (uint64x2_p)VectorLoad(data);
|
||||
d1 = VectorPermute(d1, d1, m1);
|
||||
d1 = VectorXor(d1, x);
|
||||
c0 = VectorXor(c0, VMULL_00(d1, h0));
|
||||
c2 = VectorXor(c2, VMULL_01(d1, h1));
|
||||
// d1 = VectorXor(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
d1 = VectorXor(d1, VectorPermute(d1, d1, m1));
|
||||
c1 = VectorXor(c1, VMULL_00(d1, h2));
|
||||
break;
|
||||
}
|
||||
|
||||
// d1 = _mm_shuffle_epi8(VectorLoad(data+(s-i)*16-8), m2);
|
||||
d1 = (uint64x2_p)VectorLoad(data+(s-i)*16-8);
|
||||
d1 = VectorPermute(d1, d1, m2);
|
||||
c0 = VectorXor(c0, VMULL_01(d2, h0));
|
||||
c2 = VectorXor(c2, VMULL_00(d1, h1));
|
||||
d2 = VectorXor(d2, d1);
|
||||
c1 = VectorXor(c1, VMULL_00(d2, h2));
|
||||
|
||||
if (++i == s)
|
||||
{
|
||||
// d1 = _mm_shuffle_epi8(VectorLoad(data), m1);
|
||||
d1 = (uint64x2_p)VectorLoad(data);
|
||||
d1 = VectorPermute(d1, d1, m1);
|
||||
d1 = VectorXor(d1, x);
|
||||
c0 = VectorXor(c0, VMULL_10(d1, h0));
|
||||
c2 = VectorXor(c2, VMULL_11(d1, h1));
|
||||
// d1 = VectorXor(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
d1 = VectorXor(d1, VectorPermute(d1, d1, m1));
|
||||
c1 = VectorXor(c1, VMULL_10(d1, h2));
|
||||
break;
|
||||
}
|
||||
|
||||
// d2 = _mm_shuffle_epi8(VectorLoad(data+(s-i)*16-8), m1);
|
||||
d2 = (uint64x2_p)VectorLoad(data+(s-i)*16-8);
|
||||
d2 = VectorPermute(d2, d2, m1);
|
||||
c0 = VectorXor(c0, VMULL_10(d1, h0));
|
||||
c2 = VectorXor(c2, VMULL_10(d2, h1));
|
||||
d1 = VectorXor(d1, d2);
|
||||
c1 = VectorXor(c1, VMULL_10(d1, h2));
|
||||
}
|
||||
data += s*16;
|
||||
len -= s*16;
|
||||
|
||||
c1 = VectorXor(VectorXor(c1, c0), c2);
|
||||
x = GCM_Reduce_VMULL(c0, c1, c2, r);
|
||||
}
|
||||
|
||||
VectorStore(x, hbuffer);
|
||||
return len;
|
||||
}
|
||||
|
||||
void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer)
|
||||
{
|
||||
// SSSE3 instruction, but only used with CLMUL
|
||||
uint64x2_p val = (uint64x2_p)VectorLoad(hashBuffer);
|
||||
// const uint64x2_p mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
|
||||
const uint64x2_p mask = {0x08090a0b0c0d0e0full, 0x0001020304050607ull};
|
||||
// val = _mm_shuffle_epi8(val, mask);
|
||||
val = VectorPermute(val, val, mask);
|
||||
VectorStore(val, hashBuffer);
|
||||
}
|
||||
#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
|
||||
NAMESPACE_END
|
||||
|
65
gcm.cpp
65
gcm.cpp
@ -45,10 +45,6 @@ NAMESPACE_BEGIN(CryptoPP)
|
||||
#define M128_CAST(x) ((__m128i *)(void *)(x))
|
||||
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
|
||||
|
||||
#if CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c);
|
||||
#endif
|
||||
|
||||
word16 GCM_Base::s_reductionTable[256];
|
||||
volatile bool GCM_Base::s_reductionTableInitialized = false;
|
||||
|
||||
@ -72,6 +68,14 @@ static inline void Xor16(byte *a, const byte *b, const byte *c)
|
||||
extern void GCM_Xor16_SSE2(byte *a, const byte *b, const byte *c);
|
||||
#endif // SSE2
|
||||
|
||||
#if CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c);
|
||||
#endif
|
||||
|
||||
#if CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
extern void GCM_Xor16_ALTIVEC(byte *a, const byte *b, const byte *c);
|
||||
#endif
|
||||
|
||||
#if CRYPTOPP_CLMUL_AVAILABLE
|
||||
extern void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
|
||||
extern size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
|
||||
@ -86,6 +90,13 @@ const unsigned int s_cltableSizeInBlocks = 8;
|
||||
extern void GCM_ReverseHashBufferIfNeeded_PMULL(byte *hashBuffer);
|
||||
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
|
||||
|
||||
#if CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
extern void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
|
||||
extern size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
|
||||
const unsigned int s_cltableSizeInBlocks = 8;
|
||||
extern void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer);
|
||||
#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
|
||||
void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs ¶ms)
|
||||
{
|
||||
BlockCipher &blockCipher = AccessBlockCipher();
|
||||
@ -120,6 +131,15 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
||||
CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
|
||||
}
|
||||
else
|
||||
#elif CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
if (HasPMULL())
|
||||
{
|
||||
// Avoid "parameter not used" error and suppress Coverity finding
|
||||
(void)params.GetIntValue(Name::TableSize(), tableSize);
|
||||
tableSize = s_cltableSizeInBlocks * blockSize;
|
||||
CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
if (params.GetIntValue(Name::TableSize(), tableSize))
|
||||
@ -151,6 +171,12 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
||||
GCM_SetKeyWithoutResync_PMULL(hashKey, mulTable, tableSize);
|
||||
return;
|
||||
}
|
||||
#elif CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
if (HasPMULL())
|
||||
{
|
||||
GCM_SetKeyWithoutResync_VMULL(hashKey, mulTable, tableSize);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
word64 V0, V1;
|
||||
@ -184,6 +210,12 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
||||
for (k=1; k<j; k++)
|
||||
GCM_Xor16_NEON(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
|
||||
else
|
||||
#elif CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
if (HasAltivec())
|
||||
for (j=2; j<=0x80; j*=2)
|
||||
for (k=1; k<j; k++)
|
||||
GCM_Xor16_ALTIVEC(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
|
||||
else
|
||||
#endif
|
||||
for (j=2; j<=0x80; j*=2)
|
||||
for (k=1; k<j; k++)
|
||||
@ -242,6 +274,15 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
||||
GCM_Xor16_NEON(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
|
||||
}
|
||||
else
|
||||
#elif CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
if (HasAltivec())
|
||||
for (j=2; j<=8; j*=2)
|
||||
for (k=1; k<j; k++)
|
||||
{
|
||||
GCM_Xor16_ALTIVEC(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
|
||||
GCM_Xor16_ALTIVEC(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
for (j=2; j<=8; j*=2)
|
||||
for (k=1; k<j; k++)
|
||||
@ -265,6 +306,11 @@ inline void GCM_Base::ReverseHashBufferIfNeeded()
|
||||
{
|
||||
GCM_ReverseHashBufferIfNeeded_PMULL(HashBuffer());
|
||||
}
|
||||
#elif CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
if (HasPMULL())
|
||||
{
|
||||
GCM_ReverseHashBufferIfNeeded_VMULL(HashBuffer());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -320,6 +366,8 @@ unsigned int GCM_Base::OptimalDataAlignment() const
|
||||
HasSSE2() ? 16 :
|
||||
#elif CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
HasNEON() ? 4 :
|
||||
#elif CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
HasAltivec() ? 16 :
|
||||
#endif
|
||||
GetBlockCipher().OptimalDataAlignment();
|
||||
}
|
||||
@ -328,7 +376,7 @@ unsigned int GCM_Base::OptimalDataAlignment() const
|
||||
# pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
|
||||
#endif
|
||||
|
||||
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
|
||||
#endif // Not CRYPTOPP_GENERATE_X64_MASM
|
||||
|
||||
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
|
||||
extern "C" {
|
||||
@ -351,6 +399,11 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
|
||||
{
|
||||
return GCM_AuthenticateBlocks_PMULL(data, len, MulTable(), HashBuffer());
|
||||
}
|
||||
#elif CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
if (HasPMULL())
|
||||
{
|
||||
return GCM_AuthenticateBlocks_VMULL(data, len, MulTable(), HashBuffer());
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef BlockGetAndPut<word64, NativeByteOrder> Block;
|
||||
@ -796,5 +849,5 @@ void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize)
|
||||
|
||||
NAMESPACE_END
|
||||
|
||||
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
|
||||
#endif // Not CRYPTOPP_GENERATE_X64_MASM
|
||||
#endif
|
||||
|
70
ppc-simd.h
70
ppc-simd.h
@ -35,7 +35,7 @@
|
||||
#if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(__CRYPTO) || defined(__CRYPTO__))
|
||||
# undef CRYPTOPP_POWER8_AVAILABLE
|
||||
# undef CRYPTOPP_POWER8_AES_AVAILABLE
|
||||
# undef CRYPTOPP_POWER8_PMULL_AVAILABLE
|
||||
# undef CRYPTOPP_POWER8_VMULL_AVAILABLE
|
||||
# undef CRYPTOPP_POWER8_SHA_AVAILABLE
|
||||
#endif
|
||||
|
||||
@ -118,6 +118,20 @@ inline T1 VectorAnd(const T1& vec1, const T2& vec2)
|
||||
return (T1)vec_and(vec1, (T1)vec2);
|
||||
}
|
||||
|
||||
/// \brief OR two vectors
|
||||
/// \tparam T1 vector type
|
||||
/// \tparam T2 vector type
|
||||
/// \param vec1 the first vector
|
||||
/// \param vec2 the second vector
|
||||
/// \details VectorOr returns a new vector from vec1 and vec2. The return
|
||||
/// vector is the same type as vec1.
|
||||
/// \since Crypto++ 6.0
|
||||
template <class T1, class T2>
|
||||
inline T1 VectorOr(const T1& vec1, const T2& vec2)
|
||||
{
|
||||
return (T1)vec_or(vec1, (T1)vec2);
|
||||
}
|
||||
|
||||
/// \brief XOR two vectors
|
||||
/// \tparam T1 vector type
|
||||
/// \tparam T2 vector type
|
||||
@ -269,20 +283,62 @@ inline uint64x2_p VectorShiftRight<0, uint64x2_p>(const uint64x2_p& vec)
|
||||
}
|
||||
#endif
|
||||
|
||||
/// \brief Rotate a vector left
|
||||
/// \tparam C shift byte count
|
||||
/// \tparam T vector type
|
||||
/// \param vec the vector
|
||||
/// \details VectorRotateLeft() returns a new vector after rotating the
|
||||
/// concatenation of the source vector with itself by the specified
|
||||
/// number of bytes. The return vector is the same type as vec.
|
||||
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
||||
/// endian sensitive?</A> on Stack Overflow
|
||||
/// \since Crypto++ 6.0
|
||||
template <unsigned int C, class T>
|
||||
inline T VectorRotateLeft(const T& vec)
|
||||
{
|
||||
enum { R = C&0xf };
|
||||
#if CRYPTOPP_BIG_ENDIAN
|
||||
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
|
||||
#else
|
||||
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 16-R);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief Rotate a vector right
|
||||
/// \tparam C shift byte count
|
||||
/// \tparam T vector type
|
||||
/// \param vec the vector
|
||||
/// \details VectorRotateRight() returns a new vector after rotating the
|
||||
/// concatenation of the source vector with itself by the specified
|
||||
/// number of bytes. The return vector is the same type as vec.
|
||||
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
|
||||
/// endian sensitive?</A> on Stack Overflow
|
||||
/// \since Crypto++ 6.0
|
||||
template <unsigned int C, class T>
|
||||
inline T VectorRotateRight(const T& vec)
|
||||
{
|
||||
enum { R = C&0xf };
|
||||
#if CRYPTOPP_BIG_ENDIAN
|
||||
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 16-R);
|
||||
#else
|
||||
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline T VectorGetLow(const T& val)
|
||||
{
|
||||
const T zero = {0};
|
||||
const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
|
||||
return (T)vec_perm(val, zero, mask);
|
||||
const T zero = {0};
|
||||
const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
|
||||
return (T)vec_perm(val, zero, mask);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline T VectorGetHigh(const T& val)
|
||||
{
|
||||
const T zero = {0};
|
||||
const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
|
||||
return (T)vec_perm(val, zero, mask);
|
||||
const T zero = {0};
|
||||
const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
|
||||
return (T)vec_perm(val, zero, mask);
|
||||
}
|
||||
|
||||
/// \brief Compare two vectors
|
||||
|
@ -1186,7 +1186,7 @@ bool TestAltivecOps()
|
||||
|
||||
//********** Extraction **********//
|
||||
bool pass3=true;
|
||||
|
||||
|
||||
uint8x16_p ex1 = {0x1f,0x1e,0x1d,0x1c, 0x1b,0x1a,0x19,0x18,
|
||||
0x17,0x16,0x15,0x14, 0x13,0x12,0x11,0x10};
|
||||
uint8x16_p ex2 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
|
||||
|
Loading…
x
Reference in New Issue
Block a user