diff --git a/blake2b-simd.cpp b/blake2b-simd.cpp
index c95b46ac..7bc500a3 100644
--- a/blake2b-simd.cpp
+++ b/blake2b-simd.cpp
@@ -53,7 +53,7 @@ extern const word64 BLAKE2B_IV[8];
 
 #if CRYPTOPP_SSE41_AVAILABLE
 
-#define LOADU(p) _mm_loadu_si128( (const __m128i *)(const void*)(p) )
+#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void*)(p))
 #define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r)
 #define TOF(reg) _mm_castsi128_ps((reg))
 #define TOI(reg) _mm_castps_si128((reg))
@@ -439,48 +439,48 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state)
     __m128i b0, b1;
     __m128i t0, t1;
 
-    const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
-    const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
+    const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
+    const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
 
-    const __m128i m0 = LOADU( input + 00 );
-    const __m128i m1 = LOADU( input + 16 );
-    const __m128i m2 = LOADU( input + 32 );
-    const __m128i m3 = LOADU( input + 48 );
-    const __m128i m4 = LOADU( input + 64 );
-    const __m128i m5 = LOADU( input + 80 );
-    const __m128i m6 = LOADU( input + 96 );
-    const __m128i m7 = LOADU( input + 112 );
+    const __m128i m0 = LOADU(input + 00);
+    const __m128i m1 = LOADU(input + 16);
+    const __m128i m2 = LOADU(input + 32);
+    const __m128i m3 = LOADU(input + 48);
+    const __m128i m4 = LOADU(input + 64);
+    const __m128i m5 = LOADU(input + 80);
+    const __m128i m6 = LOADU(input + 96);
+    const __m128i m7 = LOADU(input + 112);
 
-    row1l = LOADU( &state.h[0] );
-    row1h = LOADU( &state.h[2] );
-    row2l = LOADU( &state.h[4] );
-    row2h = LOADU( &state.h[6] );
-    row3l = LOADU( &BLAKE2B_IV[0] );
-    row3h = LOADU( &BLAKE2B_IV[2] );
-    row4l = _mm_xor_si128( LOADU( &BLAKE2B_IV[4] ), LOADU( &state.tf[0] ) );
-    row4h = _mm_xor_si128( LOADU( &BLAKE2B_IV[6] ), LOADU( &state.tf[2] ) );
+    row1l = LOADU(&state.h[0]);
+    row1h = LOADU(&state.h[2]);
+    row2l = LOADU(&state.h[4]);
+    row2h = LOADU(&state.h[6]);
+    row3l = LOADU(&BLAKE2B_IV[0]);
+    row3h = LOADU(&BLAKE2B_IV[2]);
+    row4l = _mm_xor_si128(LOADU(&BLAKE2B_IV[4]), LOADU(&state.tf[0]));
+    row4h = _mm_xor_si128(LOADU(&BLAKE2B_IV[6]), LOADU(&state.tf[2]));
 
-    BLAKE2B_ROUND( 0 );
-    BLAKE2B_ROUND( 1 );
-    BLAKE2B_ROUND( 2 );
-    BLAKE2B_ROUND( 3 );
-    BLAKE2B_ROUND( 4 );
-    BLAKE2B_ROUND( 5 );
-    BLAKE2B_ROUND( 6 );
-    BLAKE2B_ROUND( 7 );
-    BLAKE2B_ROUND( 8 );
-    BLAKE2B_ROUND( 9 );
-    BLAKE2B_ROUND( 10 );
-    BLAKE2B_ROUND( 11 );
+    BLAKE2B_ROUND(0);
+    BLAKE2B_ROUND(1);
+    BLAKE2B_ROUND(2);
+    BLAKE2B_ROUND(3);
+    BLAKE2B_ROUND(4);
+    BLAKE2B_ROUND(5);
+    BLAKE2B_ROUND(6);
+    BLAKE2B_ROUND(7);
+    BLAKE2B_ROUND(8);
+    BLAKE2B_ROUND(9);
+    BLAKE2B_ROUND(10);
+    BLAKE2B_ROUND(11);
 
-    row1l = _mm_xor_si128( row3l, row1l );
-    row1h = _mm_xor_si128( row3h, row1h );
-    STOREU( &state.h[0], _mm_xor_si128( LOADU( &state.h[0] ), row1l ) );
-    STOREU( &state.h[2], _mm_xor_si128( LOADU( &state.h[2] ), row1h ) );
-    row2l = _mm_xor_si128( row4l, row2l );
-    row2h = _mm_xor_si128( row4h, row2h );
-    STOREU( &state.h[4], _mm_xor_si128( LOADU( &state.h[4] ), row2l ) );
-    STOREU( &state.h[6], _mm_xor_si128( LOADU( &state.h[6] ), row2h ) );
+    row1l = _mm_xor_si128(row3l, row1l);
+    row1h = _mm_xor_si128(row3h, row1h);
+    STOREU(&state.h[0], _mm_xor_si128(LOADU(&state.h[0]), row1l));
+    STOREU(&state.h[2], _mm_xor_si128(LOADU(&state.h[2]), row1h));
+    row2l = _mm_xor_si128(row4l, row2l);
+    row2h = _mm_xor_si128(row4h, row2h);
+    STOREU(&state.h[4], _mm_xor_si128(LOADU(&state.h[4]), row2l));
+    STOREU(&state.h[6], _mm_xor_si128(LOADU(&state.h[6]), row2h));
 }
 
 #endif // CRYPTOPP_SSE41_AVAILABLE
@@ -633,11 +633,11 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)
 
     #define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))
 
-    #define vrorq_n_u64_24(x) vcombine_u64(\
+    #define vrorq_n_u64_24(x) vcombine_u64( \
         vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
         vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))
 
-    #define vrorq_n_u64_16(x) vcombine_u64(\
+    #define vrorq_n_u64_16(x) vcombine_u64( \
         vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
         vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))
 
diff --git a/blake2s-simd.cpp b/blake2s-simd.cpp
index e2c000ad..37baf5a1 100644
--- a/blake2s-simd.cpp
+++ b/blake2s-simd.cpp
@@ -53,7 +53,7 @@ extern const word64 BLAKE2B_IV[8];
 
 #if CRYPTOPP_SSE41_AVAILABLE
 
-#define LOADU(p) _mm_loadu_si128( (const __m128i *)(const void*)(p) )
+#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void*)(p))
 #define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r)
 #define TOF(reg) _mm_castsi128_ps((reg))
 #define TOI(reg) _mm_castps_si128((reg))
@@ -278,35 +278,35 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2s_State& state)
 # define MM_ROTI_EPI32(r, c) ( \
       (8==-(c)) ? _mm_shuffle_epi8(r,r8) \
     : (16==-(c)) ? _mm_shuffle_epi8(r,r16) \
-    : _mm_xor_si128(_mm_srli_epi32( (r), -(c) ), \
-      _mm_slli_epi32( (r), 32-(-(c)) )) )
+    : _mm_xor_si128(_mm_srli_epi32((r), -(c)), \
+      _mm_slli_epi32((r), 32-(-(c)))))
 #endif
 
 #define BLAKE2S_G1(row1,row2,row3,row4,buf) \
-    row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \
-    row4 = _mm_xor_si128( row4, row1 ); \
+    row1 = _mm_add_epi32(_mm_add_epi32(row1, buf), row2); \
+    row4 = _mm_xor_si128(row4, row1); \
     row4 = MM_ROTI_EPI32(row4, -16); \
-    row3 = _mm_add_epi32( row3, row4 ); \
-    row2 = _mm_xor_si128( row2, row3 ); \
+    row3 = _mm_add_epi32(row3, row4); \
+    row2 = _mm_xor_si128(row2, row3); \
     row2 = MM_ROTI_EPI32(row2, -12);
 
 #define BLAKE2S_G2(row1,row2,row3,row4,buf) \
-    row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \
-    row4 = _mm_xor_si128( row4, row1 ); \
+    row1 = _mm_add_epi32(_mm_add_epi32(row1, buf), row2); \
+    row4 = _mm_xor_si128(row4, row1); \
     row4 = MM_ROTI_EPI32(row4, -8); \
-    row3 = _mm_add_epi32( row3, row4 ); \
-    row2 = _mm_xor_si128( row2, row3 ); \
+    row3 = _mm_add_epi32(row3, row4); \
+    row2 = _mm_xor_si128(row2, row3); \
    row2 = MM_ROTI_EPI32(row2, -7);
 
 #define DIAGONALIZE(row1,row2,row3,row4) \
-    row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \
-    row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
-    row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) );
+    row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); \
+    row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); \
+    row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
 
 #define UNDIAGONALIZE(row1,row2,row3,row4) \
-    row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \
-    row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
-    row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) );
+    row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); \
+    row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); \
+    row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
 
 #define BLAKE2S_ROUND(r) \
     BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
@@ -324,32 +324,32 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2s_State& state)
     __m128i buf1, buf2, buf3, buf4;
     __m128i t0, t1, t2, ff0, ff1;
 
-    const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
-    const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
+    const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
+    const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
 
-    const __m128i m0 = LOADU( input + 00 );
-    const __m128i m1 = LOADU( input + 16 );
-    const __m128i m2 = LOADU( input + 32 );
-    const __m128i m3 = LOADU( input + 48 );
+    const __m128i m0 = LOADU(input + 00);
+    const __m128i m1 = LOADU(input + 16);
+    const __m128i m2 = LOADU(input + 32);
+    const __m128i m3 = LOADU(input + 48);
 
-    row1 = ff0 = LOADU( &state.h[0] );
-    row2 = ff1 = LOADU( &state.h[4] );
-    row3 = LOADU( &BLAKE2S_IV[0] );
-    row4 = _mm_xor_si128( LOADU( &BLAKE2S_IV[4] ), LOADU( &state.tf[0] ) );
+    row1 = ff0 = LOADU(&state.h[0]);
+    row2 = ff1 = LOADU(&state.h[4]);
+    row3 = LOADU(&BLAKE2S_IV[0]);
+    row4 = _mm_xor_si128(LOADU(&BLAKE2S_IV[4]), LOADU(&state.tf[0]));
 
-    BLAKE2S_ROUND( 0 );
-    BLAKE2S_ROUND( 1 );
-    BLAKE2S_ROUND( 2 );
-    BLAKE2S_ROUND( 3 );
-    BLAKE2S_ROUND( 4 );
-    BLAKE2S_ROUND( 5 );
-    BLAKE2S_ROUND( 6 );
-    BLAKE2S_ROUND( 7 );
-    BLAKE2S_ROUND( 8 );
-    BLAKE2S_ROUND( 9 );
+    BLAKE2S_ROUND(0);
+    BLAKE2S_ROUND(1);
+    BLAKE2S_ROUND(2);
+    BLAKE2S_ROUND(3);
+    BLAKE2S_ROUND(4);
+    BLAKE2S_ROUND(5);
+    BLAKE2S_ROUND(6);
+    BLAKE2S_ROUND(7);
+    BLAKE2S_ROUND(8);
+    BLAKE2S_ROUND(9);
 
-    STOREU( &state.h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
-    STOREU( &state.h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
+    STOREU(&state.h[0], _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
+    STOREU(&state.h[4], _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
 }
 
 #endif // CRYPTOPP_SSE41_AVAILABLE
@@ -643,10 +643,10 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)
         BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \
     } while(0)
 
-    const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8((input + 00)));
-    const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8((input + 16)));
-    const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8((input + 32)));
-    const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8((input + 48)));
+    const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8(input + 00));
+    const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8(input + 16));
+    const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8(input + 32));
+    const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8(input + 48));
 
     uint32x4_t row1, row2, row3, row4;
 
@@ -947,19 +947,19 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
 #define vec_ror_7(x) vec_rl(x, ROR7_MASK)
 
 #define BLAKE2S_G1(row1,row2,row3,row4,buf) \
-    row1 = vec_add( vec_add( row1, buf), row2 ); \
-    row4 = vec_xor( row4, row1 ); \
+    row1 = vec_add(vec_add(row1, buf), row2); \
+    row4 = vec_xor(row4, row1); \
     row4 = vec_ror_16(row4); \
-    row3 = vec_add( row3, row4 ); \
-    row2 = vec_xor( row2, row3 ); \
+    row3 = vec_add(row3, row4); \
+    row2 = vec_xor(row2, row3); \
    row2 = vec_ror_12(row2);
 
 #define BLAKE2S_G2(row1,row2,row3,row4,buf) \
-    row1 = vec_add( vec_add( row1, buf), row2 ); \
-    row4 = vec_xor( row4, row1 ); \
+    row1 = vec_add(vec_add(row1, buf), row2); \
+    row4 = vec_xor(row4, row1); \
     row4 = vec_ror_8(row4); \
-    row3 = vec_add( row3, row4 ); \
-    row2 = vec_xor( row2, row3 ); \
+    row3 = vec_add(row3, row4); \
+    row2 = vec_xor(row2, row3); \
    row2 = vec_ror_7(row2);
 
     const uint8x16_p D2103_MASK = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
@@ -967,14 +967,14 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
     const uint8x16_p D0321_MASK = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
 
 #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
-    row4 = vec_perm( row4, row4, D2103_MASK ); \
-    row3 = vec_perm( row3, row3, D1032_MASK ); \
-    row2 = vec_perm( row2, row2, D0321_MASK );
+    row4 = vec_perm(row4, row4, D2103_MASK); \
+    row3 = vec_perm(row3, row3, D1032_MASK); \
+    row2 = vec_perm(row2, row2, D0321_MASK);
 
 #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
-    row4 = vec_perm( row4, row4, D0321_MASK ); \
-    row3 = vec_perm( row3, row3, D1032_MASK ); \
-    row2 = vec_perm( row2, row2, D2103_MASK );
+    row4 = vec_perm(row4, row4, D0321_MASK); \
+    row3 = vec_perm(row3, row3, D1032_MASK); \
+    row2 = vec_perm(row2, row2, D2103_MASK);
 
 #define BLAKE2S_ROUND(r) \
     BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
@@ -997,24 +997,24 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
     const uint32x4_p m8 = VectorLoad32LE(input + 32);
     const uint32x4_p m12 = VectorLoad32LE(input + 48);
 
-    row1 = ff0 = VectorLoad32LE( &state.h[0] );
-    row2 = ff1 = VectorLoad32LE( &state.h[4] );
-    row3 = VectorLoad32( &BLAKE2S_IV[0] );
-    row4 = vec_xor( VectorLoad32( &BLAKE2S_IV[4] ), VectorLoad32( &state.tf[0] ) );
+    row1 = ff0 = VectorLoad32LE(&state.h[0]);
+    row2 = ff1 = VectorLoad32LE(&state.h[4]);
+    row3 = VectorLoad32(&BLAKE2S_IV[0]);
+    row4 = vec_xor(VectorLoad32(&BLAKE2S_IV[4]), VectorLoad32(&state.tf[0]));
 
-    BLAKE2S_ROUND( 0 );
-    BLAKE2S_ROUND( 1 );
-    BLAKE2S_ROUND( 2 );
-    BLAKE2S_ROUND( 3 );
-    BLAKE2S_ROUND( 4 );
-    BLAKE2S_ROUND( 5 );
-    BLAKE2S_ROUND( 6 );
-    BLAKE2S_ROUND( 7 );
-    BLAKE2S_ROUND( 8 );
-    BLAKE2S_ROUND( 9 );
+    BLAKE2S_ROUND(0);
+    BLAKE2S_ROUND(1);
+    BLAKE2S_ROUND(2);
+    BLAKE2S_ROUND(3);
+    BLAKE2S_ROUND(4);
+    BLAKE2S_ROUND(5);
+    BLAKE2S_ROUND(6);
+    BLAKE2S_ROUND(7);
+    BLAKE2S_ROUND(8);
+    BLAKE2S_ROUND(9);
 
-    VectorStore32LE( &state.h[0], vec_xor( ff0, vec_xor( row1, row3 ) ) );
-    VectorStore32LE( &state.h[4], vec_xor( ff1, vec_xor( row2, row4 ) ) );
+    VectorStore32LE(&state.h[0], vec_xor(ff0, vec_xor(row1, row3)));
+    VectorStore32LE(&state.h[4], vec_xor(ff1, vec_xor(row2, row4)));
 }
 
 #endif // CRYPTOPP_POWER7_AVAILABLE