From 42097e279837ad6f084b7910cea5306c503be988 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Wed, 21 Apr 2021 03:24:15 -0400
Subject: [PATCH] Align LSH IV's for AVX

---
 lsh256.cpp | 14 ++++++++------
 lsh512.cpp | 23 +++++++++++++----------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/lsh256.cpp b/lsh256.cpp
index 324cfbb7..07814100 100644
--- a/lsh256.cpp
+++ b/lsh256.cpp
@@ -120,7 +120,9 @@ struct LSH256_Internal
 };
 
 #if defined(CRYPTOPP_LSH256_AVX_AVAILABLE)
-// Clear upper bits on entry and exit
+// Zero the upper 128 bits of all YMM registers
+// on entry and exit. It avoids AVX state
+// transition penalties when saving state.
 struct AVX_Cleanup
 {
 	AVX_Cleanup() {
@@ -210,13 +212,13 @@ lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) {
 # define MAYBE_CONSTEXPR const
 #endif
 
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u32 g_IV224[CV_WORD_LEN] = {
 	0x068608D3, 0x62D8F7A7, 0xD76652AB, 0x4C600A43, 0xBDC40AA8, 0x1ECA0B68, 0xDA1A89BE, 0x3147D354,
 	0x707EB4F9, 0xF65B3862, 0x6B0B2ABE, 0x56B8EC0A, 0xCF237286, 0xEE0D1727, 0x33636595, 0x8BB8D05F,
 };
 
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u32 g_IV256[CV_WORD_LEN] = {
 	0x46a10f1f, 0xfddce486, 0xb41443a8, 0x198e6b9d, 0x3304388d, 0xb0f5a3c7, 0xb36061c4, 0x7adbd553,
 	0x105d5378, 0x2f74de54, 0x5c2f2d95, 0xf2553fbe, 0x8051357a, 0x138668c8, 0x47aa4484, 0xe01afb41
@@ -825,14 +827,14 @@ inline void compress(LSH256_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_B
 
 inline void load_iv(word32* cv_l, word32* cv_r, const word32* iv)
 {
+	// The IV's are 32-byte aligned so we can use aligned loads.
 #if defined(CRYPTOPP_LSH256_AVX_AVAILABLE)
 	_mm256_storeu_si256(M256_CAST(cv_l+0),
-		_mm256_loadu_si256(CONST_M256_CAST(iv+0)));
+		_mm256_load_si256(CONST_M256_CAST(iv+0)));
 	_mm256_storeu_si256(M256_CAST(cv_r+0),
-		_mm256_loadu_si256(CONST_M256_CAST(iv+8)));
+		_mm256_load_si256(CONST_M256_CAST(iv+8)));
 
 #elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE)
-	// The IV's are 16-byte aligned so we can use _mm_load_si128.
 	_mm_storeu_si128(M128_CAST(cv_l+ 0),
 		_mm_load_si128(CONST_M128_CAST(iv+ 0)));
 	_mm_storeu_si128(M128_CAST(cv_l+ 4),
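Why the alignment bump matters: once the g_IV* tables carry CRYPTOPP_ALIGN_DATA(32), the AVX path can switch from _mm256_loadu_si256 to the aligned _mm256_load_si256 without risking a fault on a misaligned address. The standalone sketch below illustrates the same pattern; it is not Crypto++ code. iv_table and copy_iv are hypothetical stand-ins for g_IV256 and load_iv, and alignas(32) stands in for CRYPTOPP_ALIGN_DATA(32). Build with AVX enabled (e.g. -mavx or /arch:AVX).

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// 32-byte alignment plays the role of CRYPTOPP_ALIGN_DATA(32) in the patch.
alignas(32) static const std::uint32_t iv_table[16] = {
    0x46a10f1f, 0xfddce486, 0xb41443a8, 0x198e6b9d,
    0x3304388d, 0xb0f5a3c7, 0xb36061c4, 0x7adbd553,
    0x105d5378, 0x2f74de54, 0x5c2f2d95, 0xf2553fbe,
    0x8051357a, 0x138668c8, 0x47aa4484, 0xe01afb41
};

// Copy the aligned IV into chaining variables that may not be aligned.
static void copy_iv(std::uint32_t* cv_l, std::uint32_t* cv_r, const std::uint32_t* iv)
{
    // Aligned loads are safe because iv_table is alignas(32);
    // the destinations keep unaligned stores, as in the patch.
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(cv_l),
        _mm256_load_si256(reinterpret_cast<const __m256i*>(iv + 0)));
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(cv_r),
        _mm256_load_si256(reinterpret_cast<const __m256i*>(iv + 8)));
}

int main()
{
    std::uint32_t cv_l[8], cv_r[8];
    copy_iv(cv_l, cv_r, iv_table);
    std::printf("%08x %08x\n", cv_l[0], cv_r[7]);
    return 0;
}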
diff --git a/lsh512.cpp b/lsh512.cpp
index f6340314..b26c5718 100644
--- a/lsh512.cpp
+++ b/lsh512.cpp
@@ -122,7 +122,9 @@ struct LSH512_Internal
 };
 
 #if defined(CRYPTOPP_LSH512_AVX_AVAILABLE)
-// Clear upper bits on entry and exit
+// Zero the upper 128 bits of all YMM registers
+// on entry and exit. It avoids AVX state
+// transition penalties when saving state.
 struct AVX_Cleanup
 {
 	AVX_Cleanup() {
@@ -204,7 +206,7 @@ lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) {
 # define MAYBE_CONSTEXPR const
 #endif
 
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u64 g_IV224[CV_WORD_LEN] = {
 	W64LIT(0x0C401E9FE8813A55), W64LIT(0x4A5F446268FD3D35), W64LIT(0xFF13E452334F612A), W64LIT(0xF8227661037E354A),
 	W64LIT(0xA5F223723C9CA29D), W64LIT(0x95D965A11AED3979), W64LIT(0x01E23835B9AB02CC), W64LIT(0x52D49CBAD5B30616),
@@ -212,7 +214,7 @@ MAYBE_CONSTEXPR lsh_u64 g_IV224[CV_WORD_LEN] = {
 	W64LIT(0x31E2B67D25BE3813), W64LIT(0xD522C4DEED8E4D83), W64LIT(0xA79F5509B43FBAFE), W64LIT(0xE00D2CD88B4B6C6A),
 };
 
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u64 g_IV256[CV_WORD_LEN] = {
 	W64LIT(0x6DC57C33DF989423), W64LIT(0xD8EA7F6E8342C199), W64LIT(0x76DF8356F8603AC4), W64LIT(0x40F1B44DE838223A),
 	W64LIT(0x39FFE7CFC31484CD), W64LIT(0x39C4326CC5281548), W64LIT(0x8A2FF85A346045D8), W64LIT(0xFF202AA46DBDD61E),
@@ -220,7 +222,7 @@ MAYBE_CONSTEXPR lsh_u64 g_IV256[CV_WORD_LEN] = {
 	W64LIT(0xB596875BF8FF6DBA), W64LIT(0xFCCA39B089EF4615), W64LIT(0xECFF4017D020B4B6), W64LIT(0x7E77384C772ED802),
 };
 
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u64 g_IV384[CV_WORD_LEN] = {
 	W64LIT(0x53156A66292808F6), W64LIT(0xB2C4F362B204C2BC), W64LIT(0xB84B7213BFA05C4E), W64LIT(0x976CEB7C1B299F73),
 	W64LIT(0xDF0CC63C0570AE97), W64LIT(0xDA4441BAA486CE3F), W64LIT(0x6559F5D9B5F2ACC2), W64LIT(0x22DACF19B4B52A16),
@@ -228,7 +230,7 @@ MAYBE_CONSTEXPR lsh_u64 g_IV384[CV_WORD_LEN] = {
 	W64LIT(0xBB08043FB34E3E30), W64LIT(0xA0DEC48D54618EAD), W64LIT(0x150317267464BC57), W64LIT(0x32D1501FDE63DC93)
 };
 
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u64 g_IV512[CV_WORD_LEN] = {
 	W64LIT(0xadd50f3c7f07094e), W64LIT(0xe3f3cee8f9418a4f), W64LIT(0xb527ecde5b3d0ae9), W64LIT(0x2ef6dec68076f501),
 	W64LIT(0x8cb994cae5aca216), W64LIT(0xfbb9eae4bba48cc7), W64LIT(0x650a526174725fea), W64LIT(0x1f9a61a73f8d8085),
@@ -1077,18 +1079,19 @@ inline void compress(LSH512_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_B
 
 inline void load_iv(word64* cv_l, word64* cv_r, const word64* iv)
 {
+	// The IV's are 32-byte aligned so we can use aligned loads.
+
 #if defined(CRYPTOPP_LSH512_AVX_AVAILABLE)
 	_mm256_storeu_si256(M256_CAST(cv_l+0),
-		_mm256_loadu_si256(CONST_M256_CAST(iv+0)));
+		_mm256_load_si256(CONST_M256_CAST(iv+0)));
 	_mm256_storeu_si256(M256_CAST(cv_l+4),
-		_mm256_loadu_si256(CONST_M256_CAST(iv+4)));
+		_mm256_load_si256(CONST_M256_CAST(iv+4)));
 	_mm256_storeu_si256(M256_CAST(cv_r+0),
-		_mm256_loadu_si256(CONST_M256_CAST(iv+8)));
+		_mm256_load_si256(CONST_M256_CAST(iv+8)));
 	_mm256_storeu_si256(M256_CAST(cv_r+4),
-		_mm256_loadu_si256(CONST_M256_CAST(iv+12)));
+		_mm256_load_si256(CONST_M256_CAST(iv+12)));
 
 #elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE)
-	// The IV's are 16-byte aligned so we can use _mm_load_si128.
 	_mm_storeu_si128(M128_CAST(cv_l+0),
 		_mm_load_si128(CONST_M128_CAST(iv+0)));
 	_mm_storeu_si128(M128_CAST(cv_l+2),
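For reference, the revised comment on AVX_Cleanup describes the usual vzeroupper RAII idiom. The minimal sketch below shows that idiom under assumptions: the struct body is truncated in the hunks above, so YmmUpperGuard is a hypothetical shape, not the Crypto++ definition. _mm256_zeroupper() emits vzeroupper, which clears the upper 128 bits of the YMM registers so that later SSE code does not pay AVX-SSE transition penalties.

#include <immintrin.h>

// Hypothetical RAII guard in the spirit of the patch's AVX_Cleanup:
// zero the upper halves of the YMM registers on entry and exit.
struct YmmUpperGuard
{
    YmmUpperGuard()  { _mm256_zeroupper(); }
    ~YmmUpperGuard() { _mm256_zeroupper(); }
};

static void avx_block()
{
    YmmUpperGuard guard;                 // vzeroupper now...
    __m256i x = _mm256_setzero_si256();  // ...256-bit work would go here...
    (void)x;
}                                        // ...and vzeroupper again at scope exit

int main()
{
    avx_block();
    return 0;
}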