Add Intel SHA1 extension support (Issue 139)

This commit is contained in:
Jeffrey Walton 2016-12-01 15:05:41 -05:00
parent 315b4b0b3e
commit cce56d3f79
No known key found for this signature in database
GPG Key ID: B36AB348921B1838

275
sha.cpp
View File

@ -1,7 +1,8 @@
// sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton
// implemented Intel SHA extensions. All are in the public domain.
// implemented Intel SHA extensions based on Intel articles and code by
// Sean Gulley. All code is in the public domain.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
@ -29,6 +30,10 @@
NAMESPACE_BEGIN(CryptoPP)
// Function pointer for specific SHA1 or SHA256 Transform function
typedef void (*pfnSHATransform)(word32 *state, const word32 *data);
typedef void (*pfnSHAHashBlocks)(word32 *state, const word32 *data, size_t length);
////////////////////////////////
// start of Steve Reid's code //
////////////////////////////////
@ -91,7 +96,8 @@ static void SHA1_CXX_Transform(word32 *state, const word32 *data)
//////////////////////////////
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
static void SHA1_SHAEXT_Transform(word32 *state, const word32 *data)
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
{
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
__m128i MASK, MSG0, MSG1, MSG2, MSG3;
@ -272,17 +278,15 @@ static void SHA1_SHAEXT_Transform(word32 *state, const word32 *data)
// Save state
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
_mm_storeu_si128((__m128i*) state, ABCD);
*(state+4) = _mm_extract_epi32(E0, 3);
state[4] = _mm_extract_epi32(E0, 3);
}
#endif
typedef void (*pfnSHA1Transform)(word32 *state, const word32 *data);
pfnSHA1Transform InitializeSHA1Transform()
pfnSHATransform InitializeSHA1Transform()
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
return &SHA1_SHAEXT_Transform;
return &SHA1_SSE_SHA_Transform;
else
#endif
@ -300,7 +304,7 @@ void SHA1::InitState(HashWordType *state)
void SHA1::Transform(word32 *state, const word32 *data)
{
static const pfnSHA1Transform s_pfn = InitializeSHA1Transform();
static const pfnSHATransform s_pfn = InitializeSHA1Transform();
s_pfn(state, data);
}
@ -683,18 +687,25 @@ void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data,
}
#endif
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
static void SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t blocks);
#endif
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
pfnSHAHashBlocks InitializeSHA256HashBlocks();
size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
{
X86_SHA256_HashBlocks(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
return length % BLOCKSIZE;
static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
return length % BLOCKSIZE;
}
size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
{
X86_SHA256_HashBlocks(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
return length % BLOCKSIZE;
static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
return length % BLOCKSIZE;
}
#endif
@ -724,7 +735,7 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
// Smaller but slower
#if defined(__OPTIMIZE_SIZE__)
void SHA256::Transform(word32 *state, const word32 *data)
void SHA256_CXX_Transform(word32 *state, const word32 *data)
{
word32 T[20];
word32 W[32];
@ -796,13 +807,13 @@ void SHA256::Transform(word32 *state, const word32 *data)
state[7] += t[7];
}
#else
void SHA256::Transform(word32 *state, const word32 *data)
void SHA256_CXX_Transform(word32 *state, const word32 *data)
{
word32 W[16];
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
// this byte reverse is a waste of time, but this function is only called by MDC
ByteReverse(W, data, BLOCKSIZE);
X86_SHA256_HashBlocks(state, W, BLOCKSIZE - !HasSSE2());
ByteReverse(W, data, SHA256::BLOCKSIZE);
X86_SHA256_HashBlocks(state, W, SHA256::BLOCKSIZE - !HasSSE2());
#else
word32 T[8];
/* Copy context->state[] to working vars */
@ -834,6 +845,238 @@ void SHA256::Transform(word32 *state, const word32 *data)
#undef s1
#undef R
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
static void SHA256_SSE_SHA_Transform(word32 *state, const word32 *data)
{
return SHA256_SSE_SHA_HashBlocks(state, data, SHA256::BLOCKSIZE);
}
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
static void SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
{
CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
CRYPTOPP_ASSERT(length % SHA256::BLOCKSIZE == 0);
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial hash values
TMP = _mm_loadu_si128((__m128i*) &state[0]);
STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
MASK = _mm_set_epi64x(W64LIT(0x0c0d0e0f08090a0b), W64LIT(0x0405060700010203));
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
while (length)
{
// Save hash values for addition after rounds
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
MSG = _mm_loadu_si128((__m128i*) data+0);
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_loadu_si128((__m128i*) (data+4));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_loadu_si128((__m128i*) (data+8));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_loadu_si128((__m128i*) (data+12));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add current hash values with previously saved
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
data += 16;
length -= SHA256::BLOCKSIZE;
}
// Write hash values back in the correct order
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
_mm_storeu_si128((__m128i*) &state[0], STATE0);
_mm_storeu_si128((__m128i*) &state[4], STATE1);
}
#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
pfnSHATransform InitializeSHA256Transform()
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
return &SHA256_SSE_SHA_Transform;
else
#endif
return &SHA256_CXX_Transform;
}
pfnSHAHashBlocks InitializeSHA256HashBlocks()
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
return &SHA256_SSE_SHA_HashBlocks;
else
#endif
return &X86_SHA256_HashBlocks;
}
void SHA256::Transform(word32 *state, const word32 *data)
{
static const pfnSHATransform s_pfn = InitializeSHA256Transform();
s_pfn(state, data);
}
// *************************************************************
void SHA384::InitState(HashWordType *state)