diff --git a/blake2.cpp b/blake2.cpp index f66c8175..a807a19d 100644 --- a/blake2.cpp +++ b/blake2.cpp @@ -53,8 +53,8 @@ static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State struct CRYPTOPP_NO_VTABLE BLAKE2_IV {}; @@ -89,8 +89,8 @@ const word64 BLAKE2_IV::iv[8] = { #define BLAKE2B_IV(n) BLAKE2_IV::iv[n] -// IV and Sigma are a better fit as part of BLAKE2_Base, but that -// places the constants out of reach for the SSE2 and SSE4 implementations. +// IV and Sigma are a better fit as part of BLAKE2_Base, but that places +// the constants out of reach for the NEON, SSE2 and SSE4 implementations. template struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma {}; @@ -135,6 +135,9 @@ const byte BLAKE2_Sigma::sigma[12][16] = { { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } }; +// Reverse words for ARM (use arguments to _mm_set_epi32 without reversing them). +#define vld1q_s32_le(x, a,b,c,d) d[1]=c[0],d[2]=b[0],d[3]=a[0]; x = vld1q_s32(d); + // i-th word, not byte template inline W ReadWord(const BLAKE2_ParameterBlock& block, size_t i) @@ -3419,48 +3422,36 @@ static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State } #endif // CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE -#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE -static inline int32x4_t VLD1Q_S32(int a, int b, int c, int d) -{ - CRYPTOPP_ALIGN_DATA(16) const int32_t data[4] = {d,c,b,a}; - return vld1q_s32(data); -} -#endif - #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State& state) { - int32x4_t row1,row2,row3,row4; - int32x4_t buf1,buf2,buf3,buf4; - int32x4_t ff0,ff1; - - const word32 m0 = ((const word32*)(const void*)input)[ 0]; - const word32 m1 = ((const word32*)(const void*)input)[ 1]; - const word32 m2 = ((const word32*)(const void*)input)[ 2]; - const word32 m3 = ((const word32*)(const void*)input)[ 3]; - const word32 m4 = ((const word32*)(const void*)input)[ 4]; - const word32 m5 = ((const word32*)(const void*)input)[ 5]; - const word32 m6 = ((const word32*)(const void*)input)[ 6]; - const word32 m7 = ((const word32*)(const void*)input)[ 7]; - const word32 m8 = ((const word32*)(const void*)input)[ 8]; - const word32 m9 = ((const word32*)(const void*)input)[ 9]; - const word32 m10 = ((const word32*)(const void*)input)[10]; - const word32 m11 = ((const word32*)(const void*)input)[11]; - const word32 m12 = ((const word32*)(const void*)input)[12]; - const word32 m13 = ((const word32*)(const void*)input)[13]; - const word32 m14 = ((const word32*)(const void*)input)[14]; - const word32 m15 = ((const word32*)(const void*)input)[15]; - assert(IsAlignedOn(&state.h[0],GetAlignmentOf())); assert(IsAlignedOn(&state.h[4],GetAlignmentOf())); assert(IsAlignedOn(&state.t[0],GetAlignmentOf())); + CRYPTOPP_ALIGN_DATA(BLAKE2_DALIGN) int32_t m0[4], m1[4], m2[4], m3[4]; + CRYPTOPP_ALIGN_DATA(BLAKE2_DALIGN) int32_t m4[4], m5[4], m6[4], m7[4]; + CRYPTOPP_ALIGN_DATA(BLAKE2_DALIGN) int32_t m8[4], m9[4], m10[4], m11[4]; + CRYPTOPP_ALIGN_DATA(BLAKE2_DALIGN) int32_t m12[4], m13[4], m14[4], m15[4]; + + CRYPTOPP_ALIGN_DATA(BLAKE2_DALIGN) const int32_t vv1[4] = {BLAKE2S_IV(0),BLAKE2S_IV(1),BLAKE2S_IV(2),BLAKE2S_IV(3)}; + CRYPTOPP_ALIGN_DATA(BLAKE2_DALIGN) const int32_t vv2[4] = {BLAKE2S_IV(4),BLAKE2S_IV(5),BLAKE2S_IV(6),BLAKE2S_IV(7)}; + + int32x4_t row1,row2,row3,row4; + int32x4_t buf1,buf2,buf3,buf4; + int32x4_t ff0,ff1; + + GetBlock get(input); + get(m0[0])(m1[0])(m2[0])(m3[0])(m4[0])(m5[0])(m6[0])(m7[0])(m8[0])(m9[0])(m10[0])(m11[0])(m12[0])(m13[0])(m14[0])(m15[0]); + row1 = ff0 = vld1q_s32((const int32_t*)&state.h[0]); row2 = ff1 = vld1q_s32((const int32_t*)&state.h[4]); - row3 = VLD1Q_S32(BLAKE2S_IV(3),BLAKE2S_IV(2),BLAKE2S_IV(1),BLAKE2S_IV(0)); - row4 = veorq_s32(VLD1Q_S32(BLAKE2S_IV(7),BLAKE2S_IV(6),BLAKE2S_IV(5),BLAKE2S_IV(4)), vld1q_s32(((const int32_t*)&state.t[0]))); + row3 = vld1q_s32(vv1); + row4 = veorq_s32(vld1q_s32(vv2), vld1q_s32(((const int32_t*)&state.t[0]))); + + // buf1 = vld1q_s32(m6,m4,m2,m0); + vld1q_s32_le(buf1, m6,m4,m2,m0); - buf1 = VLD1Q_S32(m6,m4,m2,m0); row1 = vaddq_s32(vaddq_s32(row1,buf1),row2); row4 = veorq_s32(row4,row1); row4 = veorq_s32((int32x4_t)vshrq_n_u32((uint32x4_t)row4,16),(int32x4_t)vshlq_n_s32((int32x4_t)row4,16)); @@ -3468,7 +3459,9 @@ static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State; template class BLAKE2_Base;