Align LSH IV's for AVX

Jeffrey Walton 2021-04-21 03:24:15 -04:00
parent 75e0b4979b
commit 42097e2798
2 changed files with 21 additions and 16 deletions

@@ -120,7 +120,9 @@ struct LSH256_Internal
 };
 #if defined(CRYPTOPP_LSH256_AVX_AVAILABLE)
-// Clear upper bits on entry and exit
+// Zero the upper 128 bits of all YMM registers
+// on entry and exit. It avoids AVX state
+// transition penalties when saving state.
 struct AVX_Cleanup
 {
 AVX_Cleanup() {
@@ -210,13 +212,13 @@ lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) {
 # define MAYBE_CONSTEXPR const
 #endif
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u32 g_IV224[CV_WORD_LEN] = {
 0x068608D3, 0x62D8F7A7, 0xD76652AB, 0x4C600A43, 0xBDC40AA8, 0x1ECA0B68, 0xDA1A89BE, 0x3147D354,
 0x707EB4F9, 0xF65B3862, 0x6B0B2ABE, 0x56B8EC0A, 0xCF237286, 0xEE0D1727, 0x33636595, 0x8BB8D05F,
 };
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u32 g_IV256[CV_WORD_LEN] = {
 0x46a10f1f, 0xfddce486, 0xb41443a8, 0x198e6b9d, 0x3304388d, 0xb0f5a3c7, 0xb36061c4, 0x7adbd553,
 0x105d5378, 0x2f74de54, 0x5c2f2d95, 0xf2553fbe, 0x8051357a, 0x138668c8, 0x47aa4484, 0xe01afb41
@@ -825,14 +827,14 @@ inline void compress(LSH256_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_B
 inline void load_iv(word32* cv_l, word32* cv_r, const word32* iv)
 {
+// The IV's are 32-byte aligned so we can use aligned loads.
 #if defined(CRYPTOPP_LSH256_AVX_AVAILABLE)
 _mm256_storeu_si256(M256_CAST(cv_l+0),
-_mm256_loadu_si256(CONST_M256_CAST(iv+0)));
+_mm256_load_si256(CONST_M256_CAST(iv+0)));
 _mm256_storeu_si256(M256_CAST(cv_r+0),
-_mm256_loadu_si256(CONST_M256_CAST(iv+8)));
+_mm256_load_si256(CONST_M256_CAST(iv+8)));
 #elif defined(CRYPTOPP_LSH256_SSE2_AVAILABLE)
 // The IV's are 16-byte aligned so we can use _mm_load_si128.
 _mm_storeu_si128(M128_CAST(cv_l+ 0),
 _mm_load_si128(CONST_M128_CAST(iv+ 0)));
 _mm_storeu_si128(M128_CAST(cv_l+ 4),
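
The AVX_Cleanup constructor and destructor bodies fall outside the hunk above, but the rewritten comment describes the usual VZEROUPPER idiom: clear the upper halves of the YMM registers before and after the 256-bit code so that neighbouring SSE code does not pay an AVX-SSE state transition penalty. A minimal sketch of that idiom, assuming the guard simply wraps _mm256_zeroupper() (AVX_Cleanup_Sketch and avx_block_sketch are illustrative names, not Crypto++ code):

#include <immintrin.h>

// Illustrative RAII guard; assumes the cleanup is a VZEROUPPER on entry and exit.
struct AVX_Cleanup_Sketch
{
    // Zero the upper 128 bits of all YMM registers on entry...
    AVX_Cleanup_Sketch() { _mm256_zeroupper(); }
    // ...and again on exit, before any SSE-only code runs.
    ~AVX_Cleanup_Sketch() { _mm256_zeroupper(); }
};

// Hypothetical usage: the guard brackets a block of 256-bit AVX work.
void avx_block_sketch(unsigned int* cv, const unsigned int* iv)
{
    AVX_Cleanup_Sketch cleanup;
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(cv),
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(iv)));
}

The LSH-512 file below receives the same comment and the same 16-byte to 32-byte alignment changes.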


@@ -122,7 +122,9 @@ struct LSH512_Internal
 };
 #if defined(CRYPTOPP_LSH512_AVX_AVAILABLE)
-// Clear upper bits on entry and exit
+// Zero the upper 128 bits of all YMM registers
+// on entry and exit. It avoids AVX state
+// transition penalties when saving state.
 struct AVX_Cleanup
 {
 AVX_Cleanup() {
@@ -204,7 +206,7 @@ lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) {
 # define MAYBE_CONSTEXPR const
 #endif
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u64 g_IV224[CV_WORD_LEN] = {
 W64LIT(0x0C401E9FE8813A55), W64LIT(0x4A5F446268FD3D35), W64LIT(0xFF13E452334F612A), W64LIT(0xF8227661037E354A),
 W64LIT(0xA5F223723C9CA29D), W64LIT(0x95D965A11AED3979), W64LIT(0x01E23835B9AB02CC), W64LIT(0x52D49CBAD5B30616),
@@ -212,7 +214,7 @@ MAYBE_CONSTEXPR lsh_u64 g_IV224[CV_WORD_LEN] = {
 W64LIT(0x31E2B67D25BE3813), W64LIT(0xD522C4DEED8E4D83), W64LIT(0xA79F5509B43FBAFE), W64LIT(0xE00D2CD88B4B6C6A),
 };
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u64 g_IV256[CV_WORD_LEN] = {
 W64LIT(0x6DC57C33DF989423), W64LIT(0xD8EA7F6E8342C199), W64LIT(0x76DF8356F8603AC4), W64LIT(0x40F1B44DE838223A),
 W64LIT(0x39FFE7CFC31484CD), W64LIT(0x39C4326CC5281548), W64LIT(0x8A2FF85A346045D8), W64LIT(0xFF202AA46DBDD61E),
@@ -220,7 +222,7 @@ MAYBE_CONSTEXPR lsh_u64 g_IV256[CV_WORD_LEN] = {
 W64LIT(0xB596875BF8FF6DBA), W64LIT(0xFCCA39B089EF4615), W64LIT(0xECFF4017D020B4B6), W64LIT(0x7E77384C772ED802),
 };
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u64 g_IV384[CV_WORD_LEN] = {
 W64LIT(0x53156A66292808F6), W64LIT(0xB2C4F362B204C2BC), W64LIT(0xB84B7213BFA05C4E), W64LIT(0x976CEB7C1B299F73),
 W64LIT(0xDF0CC63C0570AE97), W64LIT(0xDA4441BAA486CE3F), W64LIT(0x6559F5D9B5F2ACC2), W64LIT(0x22DACF19B4B52A16),
@@ -228,7 +230,7 @@ MAYBE_CONSTEXPR lsh_u64 g_IV384[CV_WORD_LEN] = {
 W64LIT(0xBB08043FB34E3E30), W64LIT(0xA0DEC48D54618EAD), W64LIT(0x150317267464BC57), W64LIT(0x32D1501FDE63DC93)
 };
-CRYPTOPP_ALIGN_DATA(16)
+CRYPTOPP_ALIGN_DATA(32)
 MAYBE_CONSTEXPR lsh_u64 g_IV512[CV_WORD_LEN] = {
 W64LIT(0xadd50f3c7f07094e), W64LIT(0xe3f3cee8f9418a4f), W64LIT(0xb527ecde5b3d0ae9), W64LIT(0x2ef6dec68076f501),
 W64LIT(0x8cb994cae5aca216), W64LIT(0xfbb9eae4bba48cc7), W64LIT(0x650a526174725fea), W64LIT(0x1f9a61a73f8d8085),
@@ -1077,18 +1079,19 @@ inline void compress(LSH512_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_B
 inline void load_iv(word64* cv_l, word64* cv_r, const word64* iv)
 {
+// The IV's are 32-byte aligned so we can use aligned loads.
 #if defined(CRYPTOPP_LSH512_AVX_AVAILABLE)
 _mm256_storeu_si256(M256_CAST(cv_l+0),
-_mm256_loadu_si256(CONST_M256_CAST(iv+0)));
+_mm256_load_si256(CONST_M256_CAST(iv+0)));
 _mm256_storeu_si256(M256_CAST(cv_l+4),
-_mm256_loadu_si256(CONST_M256_CAST(iv+4)));
+_mm256_load_si256(CONST_M256_CAST(iv+4)));
 _mm256_storeu_si256(M256_CAST(cv_r+0),
-_mm256_loadu_si256(CONST_M256_CAST(iv+8)));
+_mm256_load_si256(CONST_M256_CAST(iv+8)));
 _mm256_storeu_si256(M256_CAST(cv_r+4),
-_mm256_loadu_si256(CONST_M256_CAST(iv+12)));
+_mm256_load_si256(CONST_M256_CAST(iv+12)));
 #elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE)
 // The IV's are 16-byte aligned so we can use _mm_load_si128.
 _mm_storeu_si128(M128_CAST(cv_l+0),
 _mm_load_si128(CONST_M128_CAST(iv+0)));
 _mm_storeu_si128(M128_CAST(cv_l+2),
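
Raising CRYPTOPP_ALIGN_DATA from 16 to 32 bytes is what allows load_iv to switch from _mm256_loadu_si256 to _mm256_load_si256: the aligned form requires a 32-byte aligned address and may fault otherwise, while the stores stay unaligned because the chaining-variable buffers carry no alignment guarantee. A standalone sketch of the pattern, using alignas(32) in place of the library's CRYPTOPP_ALIGN_DATA macro and the LSH-256 g_IV256 words shown above (iv256_sketch and load_iv_sketch are illustrative names):

#include <immintrin.h>

// 32-byte alignment stands in for CRYPTOPP_ALIGN_DATA(32); without it the
// aligned loads below would be undefined behaviour (typically a fault).
alignas(32) static const unsigned int iv256_sketch[16] = {
    0x46a10f1f, 0xfddce486, 0xb41443a8, 0x198e6b9d,
    0x3304388d, 0xb0f5a3c7, 0xb36061c4, 0x7adbd553,
    0x105d5378, 0x2f74de54, 0x5c2f2d95, 0xf2553fbe,
    0x8051357a, 0x138668c8, 0x47aa4484, 0xe01afb41
};

void load_iv_sketch(unsigned int cv_l[8], unsigned int cv_r[8])
{
    // Aligned loads are legal because the table is 32-byte aligned.
    const __m256i lo = _mm256_load_si256(
        reinterpret_cast<const __m256i*>(iv256_sketch + 0));
    const __m256i hi = _mm256_load_si256(
        reinterpret_cast<const __m256i*>(iv256_sketch + 8));

    // The destination chaining variables have no alignment guarantee,
    // so the stores remain unaligned, as in the patch.
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(cv_l), lo);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(cv_r), hi);
}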