diff --git a/sha.cpp b/sha.cpp
index 3499699f..ce4be602 100644
--- a/sha.cpp
+++ b/sha.cpp
@@ -1,7 +1,8 @@
 // sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
 // Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton
-// implemented Intel SHA extensions. All are in the public domain.
+// implemented Intel SHA extensions based on Intel articles and code by
+// Sean Gulley. All code is in the public domain.
 
 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
 
@@ -29,6 +30,10 @@
 
 NAMESPACE_BEGIN(CryptoPP)
 
+// Function pointer types for the specific SHA1/SHA256 Transform and HashBlocks implementations
+typedef void (*pfnSHATransform)(word32 *state, const word32 *data);
+typedef void (*pfnSHAHashBlocks)(word32 *state, const word32 *data, size_t length);
+
 ////////////////////////////////
 // start of Steve Reid's code //
 ////////////////////////////////
@@ -91,7 +96,8 @@ static void SHA1_CXX_Transform(word32 *state, const word32 *data)
 //////////////////////////////
 
 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
-static void SHA1_SHAEXT_Transform(word32 *state, const word32 *data)
+// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
+static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
 {
 	__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
 	__m128i MASK, MSG0, MSG1, MSG2, MSG3;
@@ -272,17 +278,15 @@ static void SHA1_SHAEXT_Transform(word32 *state, const word32 *data)
 	// Save state
 	ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
 	_mm_storeu_si128((__m128i*) state, ABCD);
-	*(state+4) = _mm_extract_epi32(E0, 3);
+	state[4] = _mm_extract_epi32(E0, 3);
 }
 #endif
 
-typedef void (*pfnSHA1Transform)(word32 *state, const word32 *data);
-
-pfnSHA1Transform InitializeSHA1Transform()
+pfnSHATransform InitializeSHA1Transform()
 {
 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
 	if (HasSHA())
-		return &SHA1_SHAEXT_Transform;
+		return &SHA1_SSE_SHA_Transform;
 	else
 #endif
 
@@ -300,7 +304,7 @@ void SHA1::InitState(HashWordType *state)
 
 void SHA1::Transform(word32 *state, const word32 *data)
 {
-	static const pfnSHA1Transform s_pfn = InitializeSHA1Transform();
+	static const pfnSHATransform s_pfn = InitializeSHA1Transform();
 	s_pfn(state, data);
 }
 
@@ -683,18 +687,25 @@ void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len)
 }
 #endif
 
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+static void SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length);
+#endif
+
 #if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
 
+pfnSHAHashBlocks InitializeSHA256HashBlocks();
 size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
 {
-	X86_SHA256_HashBlocks(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
-	return length % BLOCKSIZE;
+	static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
+	s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
+	return length % BLOCKSIZE;
 }
 
 size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
 {
-	X86_SHA256_HashBlocks(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
-	return length % BLOCKSIZE;
+	static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
+	s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
+	return length % BLOCKSIZE;
 }
 #endif
 
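Worth calling out for reviewers: the patch routes SHA1::Transform (and, in the hunks below, SHA256::Transform and HashMultipleBlocks) through a function pointer initialized exactly once, on first use, from a runtime CPU feature probe. A minimal, self-contained sketch of that pattern follows; it is not Crypto++ code. Transform_CXX, Transform_SHANI, CpuHasSHA and InitializeTransform are hypothetical stand-ins for the names above, and the CPUID probe (leaf 7, subleaf 0: EBX bit 29 advertises the Intel SHA extensions) assumes GCC/Clang's <cpuid.h> rather than the library's HasSHA().

// Minimal sketch of the one-time dispatch pattern used by this patch.
// All names are hypothetical stand-ins, not Crypto++ API.
#include <cpuid.h>   // GCC/Clang
#include <cstdint>

typedef void (*pfnTransform)(uint32_t *state, const uint32_t *data);

static void Transform_CXX(uint32_t *state, const uint32_t *data)
{
    // The portable C++ rounds would go here.
    (void)state; (void)data;
}

static void Transform_SHANI(uint32_t *state, const uint32_t *data)
{
    // The SHA-NI intrinsic rounds would go here.
    (void)state; (void)data;
}

static bool CpuHasSHA()
{
    // CPUID.(EAX=07H, ECX=0):EBX.SHA [bit 29] reports the Intel SHA extensions.
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return false;
    return (ebx & (1u << 29)) != 0;
}

static pfnTransform InitializeTransform()
{
    return CpuHasSHA() ? &Transform_SHANI : &Transform_CXX;
}

void Transform(uint32_t *state, const uint32_t *data)
{
    // The function-local static runs InitializeTransform() once; every later
    // call pays only an indirect call, not a CPUID probe per block.
    static const pfnTransform s_pfn = InitializeTransform();
    s_pfn(state, data);
}

Under C++11 the local-static initialization is guaranteed thread-safe; on older compilers a duplicate initialization is possible but benign, since every initializer returns the same pointer for a given CPU.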
@@ -724,7 +735,7 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
 
 // Smaller but slower
 #if defined(__OPTIMIZE_SIZE__)
-void SHA256::Transform(word32 *state, const word32 *data)
+void SHA256_CXX_Transform(word32 *state, const word32 *data)
 {
 	word32 T[20];
 	word32 W[32];
@@ -796,13 +807,13 @@ void SHA256::Transform(word32 *state, const word32 *data)
 	state[7] += t[7];
 }
 #else
-void SHA256::Transform(word32 *state, const word32 *data)
+void SHA256_CXX_Transform(word32 *state, const word32 *data)
 {
 	word32 W[16];
 #if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
 	// this byte reverse is a waste of time, but this function is only called by MDC
-	ByteReverse(W, data, BLOCKSIZE);
-	X86_SHA256_HashBlocks(state, W, BLOCKSIZE - !HasSSE2());
+	ByteReverse(W, data, SHA256::BLOCKSIZE);
+	X86_SHA256_HashBlocks(state, W, SHA256::BLOCKSIZE - !HasSSE2());
 #else
 	word32 T[8];
 	/* Copy context->state[] to working vars */
@@ -834,6 +845,238 @@ void SHA256::Transform(word32 *state, const word32 *data)
 #undef s1
 #undef R
 
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+static void SHA256_SSE_SHA_Transform(word32 *state, const word32 *data)
+{
+	return SHA256_SSE_SHA_HashBlocks(state, data, SHA256::BLOCKSIZE);
+}
+
+// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
+static void SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
+{
+	CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
+	CRYPTOPP_ASSERT(length % SHA256::BLOCKSIZE == 0);
+
+	__m128i STATE0, STATE1;
+	__m128i MSG, TMP, MASK;
+	__m128i TMSG0, TMSG1, TMSG2, TMSG3;
+	__m128i ABEF_SAVE, CDGH_SAVE;
+
+	// Load initial hash values
+	TMP = _mm_loadu_si128((__m128i*) &state[0]);
+	STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
+	MASK = _mm_set_epi64x(W64LIT(0x0c0d0e0f08090a0b), W64LIT(0x0405060700010203));
+
+	TMP = _mm_shuffle_epi32(TMP, 0xB1);          // CDAB
+	STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    // EFGH
+	STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);    // ABEF
+	STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
+
+	while (length)
+	{
+		// Save hash values for addition after rounds
+		ABEF_SAVE = STATE0;
+		CDGH_SAVE = STATE1;
+
+		// Rounds 0-3
+		MSG = _mm_loadu_si128((__m128i*) (data+0));
+		TMSG0 = _mm_shuffle_epi8(MSG, MASK);
+		MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+		// Rounds 4-7
+		TMSG1 = _mm_loadu_si128((__m128i*) (data+4));
+		TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
+		MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
+
+		// Rounds 8-11
+		TMSG2 = _mm_loadu_si128((__m128i*) (data+8));
+		TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
+		MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
+
+		// Rounds 12-15
+		TMSG3 = _mm_loadu_si128((__m128i*) (data+12));
+		TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
+		MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
+		TMSG0 = _mm_add_epi32(TMSG0, TMP);
+		TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
+
+		// Rounds 16-19
+		MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
+		TMSG1 = _mm_add_epi32(TMSG1, TMP);
+		TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
+
+		// Rounds 20-23
+		MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
+		TMSG2 = _mm_add_epi32(TMSG2, TMP);
+		TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
+
+		// Rounds 24-27
+		MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
+		TMSG3 = _mm_add_epi32(TMSG3, TMP);
+		TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
+
+		// Rounds 28-31
+		MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
+		TMSG0 = _mm_add_epi32(TMSG0, TMP);
+		TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
+
+		// Rounds 32-35
+		MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
+		TMSG1 = _mm_add_epi32(TMSG1, TMP);
+		TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
+
+		// Rounds 36-39
+		MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
+		TMSG2 = _mm_add_epi32(TMSG2, TMP);
+		TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
+
+		// Rounds 40-43
+		MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
+		TMSG3 = _mm_add_epi32(TMSG3, TMP);
+		TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
+
+		// Rounds 44-47
+		MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
+		TMSG0 = _mm_add_epi32(TMSG0, TMP);
+		TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
+
+		// Rounds 48-51
+		MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
+		TMSG1 = _mm_add_epi32(TMSG1, TMP);
+		TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+		TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
+
+		// Rounds 52-55
+		MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
+		TMSG2 = _mm_add_epi32(TMSG2, TMP);
+		TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+		// Rounds 56-59
+		MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
+		TMSG3 = _mm_add_epi32(TMSG3, TMP);
+		TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+		// Rounds 60-63
+		MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
+		STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+		MSG = _mm_shuffle_epi32(MSG, 0x0E);
+		STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+		// Add the saved hash values back into the current state
+		STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
+		STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
+
+		data += 16;
+		length -= SHA256::BLOCKSIZE;
+	}
+
+	// Write hash values back in the correct order
+	TMP = _mm_shuffle_epi32(STATE0, 0x1B);       // FEBA
+	STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    // DCHG
+	STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
+	STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // HGFE
+
+	_mm_storeu_si128((__m128i*) &state[0], STATE0);
+	_mm_storeu_si128((__m128i*) &state[4], STATE1);
+}
+#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+
+pfnSHATransform InitializeSHA256Transform()
+{
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+	if (HasSHA())
+		return &SHA256_SSE_SHA_Transform;
+	else
+#endif
+
+	return &SHA256_CXX_Transform;
+}
+
+pfnSHAHashBlocks InitializeSHA256HashBlocks()
+{
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+	if (HasSHA())
+		return &SHA256_SSE_SHA_HashBlocks;
+	else
+#endif
+
+	return &X86_SHA256_HashBlocks;
+}
+
+void SHA256::Transform(word32 *state, const word32 *data)
+{
+	static const pfnSHATransform s_pfn = InitializeSHA256Transform();
+	s_pfn(state, data);
+}
+
 // *************************************************************
 
 void SHA384::InitState(HashWordType *state)
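A note on the round structure above: each _mm_sha256rnds2_epu32 performs two SHA-256 rounds using the two dwords in the low half of the constant-added message register, so the _mm_shuffle_epi32(MSG, 0x0E) between the two calls moves the high half down for the next two rounds, while sha256msg1/sha256msg2 (plus one alignr and add) expand the message schedule four words at a time. None of this is visible to callers; whichever of the SHA-NI, assembly, or C++ paths the dispatcher selects must produce identical digests. A quick smoke test, assuming a library built from this patch and the standard Crypto++ headers (the build command is an assumption, adjust paths and flags to your setup), is to hash "abc" and compare against the FIPS 180-2 test vector:

// Hashes "abc" through whichever Transform the dispatcher selected and
// prints the digest; the expected output is the FIPS 180-2 test vector
// BA7816BF8F01CFEA414140DE5DAE2223B00361A396177A9CB410FF61F20015AD.
// Build sketch (assumed): g++ test.cpp -lcryptopp
#include "sha.h"
#include "hex.h"
#include "filters.h"
#include <iostream>
#include <string>

int main()
{
    using namespace CryptoPP;

    byte digest[SHA256::DIGESTSIZE];
    SHA256().CalculateDigest(digest, reinterpret_cast<const byte*>("abc"), 3);

    std::string encoded;
    HexEncoder encoder(new StringSink(encoded));
    encoder.Put(digest, sizeof(digest));
    encoder.MessageEnd();

    std::cout << encoded << std::endl;
    return 0;
}

Running the same binary on machines with and without the SHA extensions exercises both sides of the dispatch, which is the point of selecting the implementation at runtime rather than at compile time.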