Whitespace check-in

This commit is contained in:
Jeffrey Walton 2018-11-03 16:42:35 -04:00
parent 6aa6393bf3
commit aee045912a
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 114 additions and 114 deletions

View File

@ -53,7 +53,7 @@ extern const word64 BLAKE2B_IV[8];
#if CRYPTOPP_SSE41_AVAILABLE
#define LOADU(p) _mm_loadu_si128( (const __m128i *)(const void*)(p) )
#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void*)(p))
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r)
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))
@ -439,48 +439,48 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state)
__m128i b0, b1;
__m128i t0, t1;
const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
const __m128i m0 = LOADU( input + 00 );
const __m128i m1 = LOADU( input + 16 );
const __m128i m2 = LOADU( input + 32 );
const __m128i m3 = LOADU( input + 48 );
const __m128i m4 = LOADU( input + 64 );
const __m128i m5 = LOADU( input + 80 );
const __m128i m6 = LOADU( input + 96 );
const __m128i m7 = LOADU( input + 112 );
const __m128i m0 = LOADU(input + 00);
const __m128i m1 = LOADU(input + 16);
const __m128i m2 = LOADU(input + 32);
const __m128i m3 = LOADU(input + 48);
const __m128i m4 = LOADU(input + 64);
const __m128i m5 = LOADU(input + 80);
const __m128i m6 = LOADU(input + 96);
const __m128i m7 = LOADU(input + 112);
row1l = LOADU( &state.h[0] );
row1h = LOADU( &state.h[2] );
row2l = LOADU( &state.h[4] );
row2h = LOADU( &state.h[6] );
row3l = LOADU( &BLAKE2B_IV[0] );
row3h = LOADU( &BLAKE2B_IV[2] );
row4l = _mm_xor_si128( LOADU( &BLAKE2B_IV[4] ), LOADU( &state.tf[0] ) );
row4h = _mm_xor_si128( LOADU( &BLAKE2B_IV[6] ), LOADU( &state.tf[2] ) );
row1l = LOADU(&state.h[0]);
row1h = LOADU(&state.h[2]);
row2l = LOADU(&state.h[4]);
row2h = LOADU(&state.h[6]);
row3l = LOADU(&BLAKE2B_IV[0]);
row3h = LOADU(&BLAKE2B_IV[2]);
row4l = _mm_xor_si128(LOADU(&BLAKE2B_IV[4]), LOADU(&state.tf[0]));
row4h = _mm_xor_si128(LOADU(&BLAKE2B_IV[6]), LOADU(&state.tf[2]));
BLAKE2B_ROUND( 0 );
BLAKE2B_ROUND( 1 );
BLAKE2B_ROUND( 2 );
BLAKE2B_ROUND( 3 );
BLAKE2B_ROUND( 4 );
BLAKE2B_ROUND( 5 );
BLAKE2B_ROUND( 6 );
BLAKE2B_ROUND( 7 );
BLAKE2B_ROUND( 8 );
BLAKE2B_ROUND( 9 );
BLAKE2B_ROUND( 10 );
BLAKE2B_ROUND( 11 );
BLAKE2B_ROUND(0);
BLAKE2B_ROUND(1);
BLAKE2B_ROUND(2);
BLAKE2B_ROUND(3);
BLAKE2B_ROUND(4);
BLAKE2B_ROUND(5);
BLAKE2B_ROUND(6);
BLAKE2B_ROUND(7);
BLAKE2B_ROUND(8);
BLAKE2B_ROUND(9);
BLAKE2B_ROUND(10);
BLAKE2B_ROUND(11);
row1l = _mm_xor_si128( row3l, row1l );
row1h = _mm_xor_si128( row3h, row1h );
STOREU( &state.h[0], _mm_xor_si128( LOADU( &state.h[0] ), row1l ) );
STOREU( &state.h[2], _mm_xor_si128( LOADU( &state.h[2] ), row1h ) );
row2l = _mm_xor_si128( row4l, row2l );
row2h = _mm_xor_si128( row4h, row2h );
STOREU( &state.h[4], _mm_xor_si128( LOADU( &state.h[4] ), row2l ) );
STOREU( &state.h[6], _mm_xor_si128( LOADU( &state.h[6] ), row2h ) );
row1l = _mm_xor_si128(row3l, row1l);
row1h = _mm_xor_si128(row3h, row1h);
STOREU(&state.h[0], _mm_xor_si128(LOADU(&state.h[0]), row1l));
STOREU(&state.h[2], _mm_xor_si128(LOADU(&state.h[2]), row1h));
row2l = _mm_xor_si128(row4l, row2l);
row2h = _mm_xor_si128(row4h, row2h);
STOREU(&state.h[4], _mm_xor_si128(LOADU(&state.h[4]), row2l));
STOREU(&state.h[6], _mm_xor_si128(LOADU(&state.h[6]), row2h));
}
#endif // CRYPTOPP_SSE41_AVAILABLE
@ -633,11 +633,11 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)
#define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))
#define vrorq_n_u64_24(x) vcombine_u64(\
#define vrorq_n_u64_24(x) vcombine_u64( \
vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))
#define vrorq_n_u64_16(x) vcombine_u64(\
#define vrorq_n_u64_16(x) vcombine_u64( \
vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))

View File

@ -53,7 +53,7 @@ extern const word64 BLAKE2B_IV[8];
#if CRYPTOPP_SSE41_AVAILABLE
#define LOADU(p) _mm_loadu_si128( (const __m128i *)(const void*)(p) )
#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void*)(p))
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r)
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))
@ -278,35 +278,35 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2s_State& state)
# define MM_ROTI_EPI32(r, c) ( \
(8==-(c)) ? _mm_shuffle_epi8(r,r8) \
: (16==-(c)) ? _mm_shuffle_epi8(r,r16) \
: _mm_xor_si128(_mm_srli_epi32( (r), -(c) ), \
_mm_slli_epi32( (r), 32-(-(c)) )) )
: _mm_xor_si128(_mm_srli_epi32((r), -(c)), \
_mm_slli_epi32((r), 32-(-(c)))))
#endif
#define BLAKE2S_G1(row1,row2,row3,row4,buf) \
row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \
row4 = _mm_xor_si128( row4, row1 ); \
row1 = _mm_add_epi32(_mm_add_epi32(row1, buf), row2); \
row4 = _mm_xor_si128(row4, row1); \
row4 = MM_ROTI_EPI32(row4, -16); \
row3 = _mm_add_epi32( row3, row4 ); \
row2 = _mm_xor_si128( row2, row3 ); \
row3 = _mm_add_epi32(row3, row4); \
row2 = _mm_xor_si128(row2, row3); \
row2 = MM_ROTI_EPI32(row2, -12);
#define BLAKE2S_G2(row1,row2,row3,row4,buf) \
row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \
row4 = _mm_xor_si128( row4, row1 ); \
row1 = _mm_add_epi32(_mm_add_epi32(row1, buf), row2); \
row4 = _mm_xor_si128(row4, row1); \
row4 = MM_ROTI_EPI32(row4, -8); \
row3 = _mm_add_epi32( row3, row4 ); \
row2 = _mm_xor_si128( row2, row3 ); \
row3 = _mm_add_epi32(row3, row4); \
row2 = _mm_xor_si128(row2, row3); \
row2 = MM_ROTI_EPI32(row2, -7);
#define DIAGONALIZE(row1,row2,row3,row4) \
row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \
row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) );
row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); \
row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); \
row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
#define UNDIAGONALIZE(row1,row2,row3,row4) \
row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \
row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) );
row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); \
row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); \
row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
#define BLAKE2S_ROUND(r) \
BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
@ -324,32 +324,32 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2s_State& state)
__m128i buf1, buf2, buf3, buf4;
__m128i t0, t1, t2, ff0, ff1;
const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i m0 = LOADU( input + 00 );
const __m128i m1 = LOADU( input + 16 );
const __m128i m2 = LOADU( input + 32 );
const __m128i m3 = LOADU( input + 48 );
const __m128i m0 = LOADU(input + 00);
const __m128i m1 = LOADU(input + 16);
const __m128i m2 = LOADU(input + 32);
const __m128i m3 = LOADU(input + 48);
row1 = ff0 = LOADU( &state.h[0] );
row2 = ff1 = LOADU( &state.h[4] );
row3 = LOADU( &BLAKE2S_IV[0] );
row4 = _mm_xor_si128( LOADU( &BLAKE2S_IV[4] ), LOADU( &state.tf[0] ) );
row1 = ff0 = LOADU(&state.h[0]);
row2 = ff1 = LOADU(&state.h[4]);
row3 = LOADU(&BLAKE2S_IV[0]);
row4 = _mm_xor_si128(LOADU(&BLAKE2S_IV[4]), LOADU(&state.tf[0]));
BLAKE2S_ROUND( 0 );
BLAKE2S_ROUND( 1 );
BLAKE2S_ROUND( 2 );
BLAKE2S_ROUND( 3 );
BLAKE2S_ROUND( 4 );
BLAKE2S_ROUND( 5 );
BLAKE2S_ROUND( 6 );
BLAKE2S_ROUND( 7 );
BLAKE2S_ROUND( 8 );
BLAKE2S_ROUND( 9 );
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
STOREU( &state.h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
STOREU( &state.h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
STOREU(&state.h[0], _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
STOREU(&state.h[4], _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
}
#endif // CRYPTOPP_SSE41_AVAILABLE
@ -643,10 +643,10 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)
BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \
} while(0)
const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8((input + 00)));
const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8((input + 16)));
const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8((input + 32)));
const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8((input + 48)));
const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8(input + 00));
const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8(input + 16));
const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8(input + 32));
const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8(input + 48));
uint32x4_t row1, row2, row3, row4;
@ -947,19 +947,19 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
#define vec_ror_7(x) vec_rl(x, ROR7_MASK)
#define BLAKE2S_G1(row1,row2,row3,row4,buf) \
row1 = vec_add( vec_add( row1, buf), row2 ); \
row4 = vec_xor( row4, row1 ); \
row1 = vec_add(vec_add(row1, buf), row2); \
row4 = vec_xor(row4, row1); \
row4 = vec_ror_16(row4); \
row3 = vec_add( row3, row4 ); \
row2 = vec_xor( row2, row3 ); \
row3 = vec_add(row3, row4); \
row2 = vec_xor(row2, row3); \
row2 = vec_ror_12(row2);
#define BLAKE2S_G2(row1,row2,row3,row4,buf) \
row1 = vec_add( vec_add( row1, buf), row2 ); \
row4 = vec_xor( row4, row1 ); \
row1 = vec_add(vec_add(row1, buf), row2); \
row4 = vec_xor(row4, row1); \
row4 = vec_ror_8(row4); \
row3 = vec_add( row3, row4 ); \
row2 = vec_xor( row2, row3 ); \
row3 = vec_add(row3, row4); \
row2 = vec_xor(row2, row3); \
row2 = vec_ror_7(row2);
const uint8x16_p D2103_MASK = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
@ -967,14 +967,14 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
const uint8x16_p D0321_MASK = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
#define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
row4 = vec_perm( row4, row4, D2103_MASK ); \
row3 = vec_perm( row3, row3, D1032_MASK ); \
row2 = vec_perm( row2, row2, D0321_MASK );
row4 = vec_perm(row4, row4, D2103_MASK); \
row3 = vec_perm(row3, row3, D1032_MASK); \
row2 = vec_perm(row2, row2, D0321_MASK);
#define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
row4 = vec_perm( row4, row4, D0321_MASK ); \
row3 = vec_perm( row3, row3, D1032_MASK ); \
row2 = vec_perm( row2, row2, D2103_MASK );
row4 = vec_perm(row4, row4, D0321_MASK); \
row3 = vec_perm(row3, row3, D1032_MASK); \
row2 = vec_perm(row2, row2, D2103_MASK);
#define BLAKE2S_ROUND(r) \
BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
@ -997,24 +997,24 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
const uint32x4_p m8 = VectorLoad32LE(input + 32);
const uint32x4_p m12 = VectorLoad32LE(input + 48);
row1 = ff0 = VectorLoad32LE( &state.h[0] );
row2 = ff1 = VectorLoad32LE( &state.h[4] );
row3 = VectorLoad32( &BLAKE2S_IV[0] );
row4 = vec_xor( VectorLoad32( &BLAKE2S_IV[4] ), VectorLoad32( &state.tf[0] ) );
row1 = ff0 = VectorLoad32LE(&state.h[0]);
row2 = ff1 = VectorLoad32LE(&state.h[4]);
row3 = VectorLoad32(&BLAKE2S_IV[0]);
row4 = vec_xor(VectorLoad32(&BLAKE2S_IV[4]), VectorLoad32(&state.tf[0]));
BLAKE2S_ROUND( 0 );
BLAKE2S_ROUND( 1 );
BLAKE2S_ROUND( 2 );
BLAKE2S_ROUND( 3 );
BLAKE2S_ROUND( 4 );
BLAKE2S_ROUND( 5 );
BLAKE2S_ROUND( 6 );
BLAKE2S_ROUND( 7 );
BLAKE2S_ROUND( 8 );
BLAKE2S_ROUND( 9 );
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
VectorStore32LE( &state.h[0], vec_xor( ff0, vec_xor( row1, row3 ) ) );
VectorStore32LE( &state.h[4], vec_xor( ff1, vec_xor( row2, row4 ) ) );
VectorStore32LE(&state.h[0], vec_xor(ff0, vec_xor(row1, row3)));
VectorStore32LE(&state.h[4], vec_xor(ff1, vec_xor(row2, row4)));
}
#endif // CRYPTOPP_POWER7_AVAILABLE