From eb91b05f6047e5d609973edc3ea1571550bff99d Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Fri, 16 Apr 2021 06:57:12 -0400
Subject: [PATCH] Use SSE2 code paths on Windows machines (GH #1025)

---
 lsh256.cpp | 26 +++++++++++++-------------
 lsh512.cpp | 26 +++++++++++++-------------
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/lsh256.cpp b/lsh256.cpp
index 76cd1524..a3d7ff21 100644
--- a/lsh256.cpp
+++ b/lsh256.cpp
@@ -10,7 +10,7 @@
 #include "lsh.h"
 #include "misc.h"
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 # include <emmintrin.h>
 # define M128_CAST(x) ((__m128i *)(void *)(x))
 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
@@ -193,7 +193,7 @@ inline void load_msg_blk(LSH256_Internal* i_state, const lsh_u32* msgblk)
 	CRYPTOPP_ASSERT(i_state != NULLPTR);
 	CRYPTOPP_ASSERT(msgblk != NULLPTR);
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(i_state->submsg_e_l+0),
 		_mm_loadu_si128(CONST_M128_CAST(msgblk+0)));
 	_mm_storeu_si128(M128_CAST(i_state->submsg_e_l+4),
@@ -255,7 +255,7 @@ inline void msg_exp_even(LSH256_Internal* i_state)
 	lsh_u32* submsg_o_l = i_state->submsg_o_l;
 	lsh_u32* submsg_o_r = i_state->submsg_o_r;
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi32(
 		_mm_shuffle_epi32(_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(3,2,1,0)),
 		_mm_shuffle_epi32(_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(1,0,2,3))));
@@ -305,7 +305,7 @@ inline void msg_exp_odd(LSH256_Internal* i_state)
 	lsh_u32* submsg_o_l = i_state->submsg_o_l;
 	lsh_u32* submsg_o_r = i_state->submsg_o_r;
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi32(
 		_mm_shuffle_epi32(_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(3,2,1,0)),
 		_mm_shuffle_epi32(_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(1,0,2,3))));
@@ -362,7 +362,7 @@ inline void msg_add_even(lsh_u32* cv_l, lsh_u32* cv_r, LSH256_Internal* i_state)
 	lsh_u32* submsg_e_l = i_state->submsg_e_l;
 	lsh_u32* submsg_e_r = i_state->submsg_e_r;
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
 		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
 		_mm_loadu_si128(CONST_M128_CAST(submsg_e_l))));
@@ -396,7 +396,7 @@ inline void msg_add_odd(lsh_u32* cv_l, lsh_u32* cv_r, LSH256_Internal* i_state)
 	lsh_u32* submsg_o_l = i_state->submsg_o_l;
 	lsh_u32* submsg_o_r = i_state->submsg_o_r;
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
 		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
 		_mm_loadu_si128(CONST_M128_CAST(submsg_o_l))));
@@ -426,7 +426,7 @@ inline void add_blk(lsh_u32* cv_l, const lsh_u32* cv_r)
 	CRYPTOPP_ASSERT(cv_l != NULLPTR);
 	CRYPTOPP_ASSERT(cv_r != NULLPTR);
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi32(
 		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
 		_mm_loadu_si128(CONST_M128_CAST(cv_r))));
@@ -455,7 +455,7 @@ inline void rotate_blk(lsh_u32 cv[8])
 		_mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R));
 	_mm_storeu_si128(M128_CAST(cv+4),
 		_mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R));
-#elif defined(__SSE2__)
+#elif defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv), _mm_or_si128(
 		_mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R),
 		_mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), 32-R)));
@@ -479,7 +479,7 @@ inline void xor_with_const(lsh_u32* cv_l, const lsh_u32* const_v)
 	CRYPTOPP_ASSERT(cv_l != NULLPTR);
 	CRYPTOPP_ASSERT(const_v != NULLPTR);
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
 		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
 		_mm_loadu_si128(CONST_M128_CAST(const_v))));
@@ -515,7 +515,7 @@ inline void word_perm(lsh_u32* cv_l, lsh_u32* cv_r)
 	CRYPTOPP_ASSERT(cv_l != NULLPTR);
 	CRYPTOPP_ASSERT(cv_r != NULLPTR);
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv_l+0), _mm_shuffle_epi32(
 		_mm_loadu_si128(CONST_M128_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2)));
 	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_shuffle_epi32(
@@ -622,7 +622,7 @@ inline void compress(LSH256_Context* ctx, const lsh_u32 pdMsgBlk[MSG_BLK_WORD_LE
 
 inline void load_iv(word32* cv_l, word32* cv_r, const word32* iv)
 {
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	// The IV's are aligned so we can use _mm_load_si128.
 	_mm_storeu_si128(M128_CAST(cv_l+ 0), _mm_load_si128(CONST_M128_CAST(iv+ 0)));
 	_mm_storeu_si128(M128_CAST(cv_l+ 4), _mm_load_si128(CONST_M128_CAST(iv+ 4)));
@@ -670,7 +670,7 @@ inline void fin(LSH256_Context* ctx)
 {
 	CRYPTOPP_ASSERT(ctx != NULLPTR);
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128(
 		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)),
 		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0))));
@@ -854,7 +854,7 @@ NAMESPACE_BEGIN(CryptoPP)
 
 std::string LSH256_Base::AlgorithmProvider() const
 {
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	return "SSE2";
 #else
 	return "C++";
diff --git a/lsh512.cpp b/lsh512.cpp
index 09177fef..d27f488c 100644
--- a/lsh512.cpp
+++ b/lsh512.cpp
@@ -10,7 +10,7 @@
 #include "lsh.h"
 #include "misc.h"
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 # include <emmintrin.h>
 # define M128_CAST(x) ((__m128i *)(void *)(x))
 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
@@ -243,7 +243,7 @@ MAYBE_CONSTEXPR lsh_u64 g_StepConstants[16 * NUM_STEPS] = {
 
 inline void load_msg_blk(LSH512_Internal* i_state, const lsh_u64 * msgblk)
 {
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(i_state->submsg_e_l+0),
 		_mm_loadu_si128(CONST_M128_CAST(msgblk+0)));
 	_mm_storeu_si128(M128_CAST(i_state->submsg_e_l+2),
@@ -324,7 +324,7 @@ inline void msg_exp_even(LSH512_Internal* i_state)
 	lsh_u64* submsg_o_l = i_state->submsg_o_l;
 	lsh_u64* submsg_o_r = i_state->submsg_o_r;
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	__m128i temp;
 	_mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_shuffle_epi32(
 		_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)), _MM_SHUFFLE(1,0,3,2)));
@@ -405,7 +405,7 @@ inline void msg_exp_odd(LSH512_Internal* i_state)
 	lsh_u64* submsg_o_l = i_state->submsg_o_l;
 	lsh_u64* submsg_o_r = i_state->submsg_o_r;
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	__m128i temp;
 	_mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_shuffle_epi32(
 		_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)), _MM_SHUFFLE(1,0,3,2)));
@@ -495,7 +495,7 @@ inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_Internal* i_st
 	lsh_u64* submsg_e_l = i_state->submsg_e_l;
 	lsh_u64* submsg_e_r = i_state->submsg_e_r;
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
 		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
 		_mm_loadu_si128(CONST_M128_CAST(submsg_e_l))));
@@ -541,7 +541,7 @@ inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_Internal* i_sta
 	lsh_u64* submsg_o_l = i_state->submsg_o_l;
 	lsh_u64* submsg_o_r = i_state->submsg_o_r;
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
 		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
 		_mm_loadu_si128(CONST_M128_CAST(submsg_o_l))));
@@ -580,7 +580,7 @@ inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_Internal* i_sta
 
 inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
 {
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi64(
 		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
 		_mm_loadu_si128(CONST_M128_CAST(cv_r))));
@@ -617,7 +617,7 @@ inline void rotate_blk(lsh_u64 cv[8])
 		_mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R));
 	_mm_storeu_si128(M128_CAST(cv+6),
 		_mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R));
-#elif defined(__SSE2__)
+#elif defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv), _mm_or_si128(
 		_mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R),
 		_mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), 64-R)));
@@ -644,7 +644,7 @@ inline void rotate_blk(lsh_u64 cv[8])
 
 inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64* const_v)
 {
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
 		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
 		_mm_loadu_si128(CONST_M128_CAST(const_v))));
@@ -682,7 +682,7 @@ inline void rotate_msg_gamma(lsh_u64 cv_r[8])
 
 inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
 {
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	__m128i temp[2];
 	temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
 	_mm_storeu_si128(M128_CAST(cv_l+0), _mm_unpacklo_epi64(
@@ -809,7 +809,7 @@ inline void compress(LSH512_Context* ctx, const lsh_u64 pdMsgBlk[MSG_BLK_WORD_LE
 
 inline void load_iv(word64* cv_l, word64* cv_r, const word64* iv)
 {
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	// The IV's are aligned so we can use _mm_load_si128.
 	_mm_storeu_si128(M128_CAST(cv_l+0), _mm_load_si128(CONST_M128_CAST(iv+0)));
 	_mm_storeu_si128(M128_CAST(cv_l+2), _mm_load_si128(CONST_M128_CAST(iv+2)));
@@ -877,7 +877,7 @@ inline void fin(LSH512_Context* ctx)
 {
 	CRYPTOPP_ASSERT(ctx != NULLPTR);
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	_mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128(
 		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)),
 		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0))));
@@ -1069,7 +1069,7 @@ NAMESPACE_BEGIN(CryptoPP)
 
 std::string LSH512_Base::AlgorithmProvider() const
 {
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_M_X64)
 	return "SSE2";
 #else
 	return "C++";
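
Why the extra test works: GCC and Clang define __SSE2__ whenever SSE2 code
generation is enabled, but MSVC never defines that macro. MSVC defines _M_X64
when targeting x64 Windows, and SSE2 is part of the x64 baseline instruction
set, so _M_X64 implies the SSE2 intrinsics are available with no extra
compiler flags. Below is a minimal standalone sketch of the same guard; the
xor_block helper is hypothetical (not a Crypto++ function), patterned after
the msg_add_* routines in the patch:

    #if defined(__SSE2__) || defined(_M_X64)
    # include <emmintrin.h>   // SSE2 intrinsics (GCC/Clang and MSVC)
    #endif

    #include <cstdint>
    #include <cstdio>

    // XOR two 4x32-bit blocks; illustrative helper only.
    inline void xor_block(uint32_t* r, const uint32_t* a, const uint32_t* b)
    {
    #if defined(__SSE2__) || defined(_M_X64)
        // SSE2 path: one 128-bit XOR, unaligned loads/stores as in the patch.
        _mm_storeu_si128((__m128i*)(void*)r, _mm_xor_si128(
            _mm_loadu_si128((const __m128i*)(const void*)a),
            _mm_loadu_si128((const __m128i*)(const void*)b)));
    #else
        // Portable C++ path, in the style of the library's fallback code.
        for (int i = 0; i < 4; ++i)
            r[i] = a[i] ^ b[i];
    #endif
    }

    int main()
    {
        uint32_t a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1}, r[4];
        xor_block(r, a, b);
        std::printf("%u %u %u %u\n", r[0], r[1], r[2], r[3]);  // prints: 5 1 1 5
        return 0;
    }

Built with MSVC for x64, the SSE2 branch is selected automatically; with GCC
or Clang it is selected whenever the target enables __SSE2__ (for example,
-msse2, or any x86-64 target, where SSE2 is the default).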