// sha_simd.cpp - written and placed in the public domain by
//                Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
//    This source file uses intrinsics to gain access to SHA-NI and
//    ARMv8a SHA instructions. A separate source file is needed
//    because additional CXXFLAGS are required to enable the
//    appropriate instruction sets in some build configurations.
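
// Hedged illustration of the "additional CXXFLAGS" mentioned above (examples
// only, not taken from this file): GCC and Clang typically need something
// like -msse4.2 -msha for the SHA-NI paths, -march=armv8-a+crypto for the
// ARMv8 paths, and -mcpu=power8 for the POWER8 paths. The exact options come
// from the library's build files and the compiler in use.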

#include "pch.h"
#include "config.h"
#include "sha.h"
#include "misc.h"

#if defined(CRYPTOPP_DISABLE_SHA_ASM)
# undef CRYPTOPP_X86_ASM_AVAILABLE
# undef CRYPTOPP_X32_ASM_AVAILABLE
# undef CRYPTOPP_X64_ASM_AVAILABLE
# undef CRYPTOPP_SSE2_ASM_AVAILABLE
#endif

#if (CRYPTOPP_SHANI_AVAILABLE)
# include <nmmintrin.h>
# include <immintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if CRYPTOPP_POWER8_SHA_AVAILABLE
# include "ppc_simd.h"
#endif

#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
#endif

#ifndef EXCEPTION_EXECUTE_HANDLER
# define EXCEPTION_EXECUTE_HANDLER 1
#endif

// Clang __m128i casts
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
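
// The intermediate void* cast above sidesteps the alignment and
// strict-aliasing warnings Clang emits when word32*/byte* pointers are
// cast directly to __m128i*.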

// Squash MS LNK4221 and libtool warnings
extern const char SHA_SIMD_FNAME[] = __FILE__;

NAMESPACE_BEGIN(CryptoPP)

// ***************** SHA key tables ********************

extern const word32 SHA256_K[64];
extern const word64 SHA512_K[80];

// ***************** SIGILL probes ********************

#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
extern "C" {
    typedef void (*SigHandler)(int);

    static jmp_buf s_jmpSIGILL;
    static void SigIllHandler(int)
    {
        longjmp(s_jmpSIGILL, 1);
    }
}
#endif  // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY

#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
bool CPU_ProbeSHA1()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool result = true;
    __try
    {
        uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};

        uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
        uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
        uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
        uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
        uint32x4_t r5 = vsha1su1q_u32 (data1, data2);

        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return result;
# else
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile bool result = true;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};

        uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
        uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
        uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
        uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
        uint32x4_t r5 = vsha1su1q_u32 (data1, data2);

        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
    }

    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_SHA1_AVAILABLE
}
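
// A hedged sketch, not part of the original file: how a SIGILL/SEH probe
// such as CPU_ProbeSHA1() is typically consumed. The result is computed
// once, cached, and then used to select the accelerated code path. The
// names g_sha1Probed, g_hasSHA1 and HasARMSHA1() are illustrative only.
#if 0
static bool g_sha1Probed = false, g_hasSHA1 = false;

inline bool HasARMSHA1()
{
    if (!g_sha1Probed)
    {
        g_hasSHA1 = CPU_ProbeSHA1();  // run the probe once
        g_sha1Probed = true;
    }
    return g_hasSHA1;
}
#endif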

bool CPU_ProbeSHA2()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool result = true;
    __try
    {
        uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};

        uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
        uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
        uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
        uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);

        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return result;
# else
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile bool result = true;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};

        uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
        uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
        uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
        uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);

        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
    }

    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_SHA2_AVAILABLE
}
#endif  // ARM32 or ARM64

#if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64)
bool CPU_ProbeSHA256()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_POWER8_AVAILABLE)
# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = false;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        byte r[16], z[16] = {0};
        uint8x16_p x = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

        x = VecSHA256<0,0>(x);
        x = VecSHA256<0,1>(x);
        x = VecSHA256<1,0>(x);
        x = VecSHA256<1,1>(x);
        VecStore(x, r);

        result = (0 == std::memcmp(r, z, 16));
    }

    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_POWER8_AVAILABLE
}

bool CPU_ProbeSHA512()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_POWER8_AVAILABLE)
# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile int result = false;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        byte r[16], z[16] = {0};
        uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0});

        x = VecSHA512<0,0>(x);
        x = VecSHA512<0,1>(x);
        x = VecSHA512<1,0>(x);
        x = VecSHA512<1,1>(x);
        VecStore(x, r);

        result = (0 == std::memcmp(r, z, 16));
    }

    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_POWER8_AVAILABLE
}
#endif  // PPC32 or PPC64

// ***************** Intel x86 SHA ********************

/////////////////////////////////////
// start of Walton and Gulley code //
/////////////////////////////////////

#if CRYPTOPP_SHANI_AVAILABLE
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state);
    CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);

    __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
    __m128i MASK, MSG0, MSG1, MSG2, MSG3;

    // Load initial values
    ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
    E0 = _mm_set_epi32(state[4], 0, 0, 0);
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);

    // IA-32 SHA is little endian, SHA::Transform is big endian,
    // and SHA::HashMultipleBlocks can be either. ByteOrder
    // allows us to avoid extra endian reversals. It saves 1.0 cpb.
    MASK = order == BIG_ENDIAN_ORDER ?  // Data arrangement
        _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
        _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;

    while (length >= SHA1::BLOCKSIZE)
    {
        // Save current hash
        ABCD_SAVE = ABCD;
        E0_SAVE = E0;

        // Rounds 0-3
        MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
        MSG0 = _mm_shuffle_epi8(MSG0, MASK);
        E0 = _mm_add_epi32(E0, MSG0);
        E1 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);

        // Rounds 4-7
        MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
        MSG1 = _mm_shuffle_epi8(MSG1, MASK);
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);

        // Rounds 8-11
        MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
        MSG2 = _mm_shuffle_epi8(MSG2, MASK);
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 12-15
        MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
        MSG3 = _mm_shuffle_epi8(MSG3, MASK);
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 16-19
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 20-23
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 24-27
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 28-31
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 32-35
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 36-39
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 40-43
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 44-47
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 48-51
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 52-55
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 56-59
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 60-63
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 64-67
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 68-71
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 72-75
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);

        // Rounds 76-79
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);

        // Add values back to state
        E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
        ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);

        data += SHA1::BLOCKSIZE/sizeof(word32);
        length -= SHA1::BLOCKSIZE;
    }

    // Save state
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
    _mm_storeu_si128(M128_CAST(state), ABCD);
    state[4] = _mm_extract_epi32(E0, 3);
}

// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state);
    CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);

    __m128i STATE0, STATE1;
    __m128i MSG, TMP, MASK;
    __m128i TMSG0, TMSG1, TMSG2, TMSG3;
    __m128i ABEF_SAVE, CDGH_SAVE;

    // Load initial values
    TMP = _mm_loadu_si128(M128_CAST(&state[0]));
    STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));

    // IA-32 SHA is little endian, SHA::Transform is big endian,
    // and SHA::HashMultipleBlocks can be either. ByteOrder
    // allows us to avoid extra endian reversals. It saves 1.0 cpb.
    MASK = order == BIG_ENDIAN_ORDER ?  // Data arrangement
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
        _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;

    TMP = _mm_shuffle_epi32(TMP, 0xB1);          // CDAB
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    // EFGH
    STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);    // ABEF
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH

    while (length >= SHA256::BLOCKSIZE)
    {
        // Save current hash
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        // Rounds 0-3
        MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
        TMSG0 = _mm_shuffle_epi8(MSG, MASK);
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Rounds 4-7
        TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
        TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

        // Rounds 8-11
        TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
        TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

        // Rounds 12-15
        TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
        TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

        // Rounds 16-19
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

        // Rounds 20-23
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

        // Rounds 24-27
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

        // Rounds 28-31
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

        // Rounds 32-35
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

        // Rounds 36-39
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

        // Rounds 40-43
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

        // Rounds 44-47
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

        // Rounds 48-51
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

        // Rounds 52-55
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Rounds 56-59
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Rounds 60-63
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Add values back to state
        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);

        data += SHA256::BLOCKSIZE/sizeof(word32);
        length -= SHA256::BLOCKSIZE;
    }

    TMP = _mm_shuffle_epi32(STATE0, 0x1B);       // FEBA
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    // DCHG
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // ABEF

    // Save state
    _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
    _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
}
#endif  // CRYPTOPP_SHANI_AVAILABLE

///////////////////////////////////
// end of Walton and Gulley code //
///////////////////////////////////

// ***************** ARMV8 SHA ********************

/////////////////////////////////////////////////////////////
// start of Walton, Schneiders, O'Rourke and Hovsmith code //
/////////////////////////////////////////////////////////////

#if CRYPTOPP_ARM_SHA1_AVAILABLE
void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state);
    CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);

    uint32x4_t C0, C1, C2, C3;
    uint32x4_t ABCD, ABCD_SAVED;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP1;
    uint32_t E0, E0_SAVED, E1;

    // Load initial values
    C0 = vdupq_n_u32(0x5A827999);
    C1 = vdupq_n_u32(0x6ED9EBA1);
    C2 = vdupq_n_u32(0x8F1BBCDC);
    C3 = vdupq_n_u32(0xCA62C1D6);

    ABCD = vld1q_u32(&state[0]);
    E0 = state[4];

    while (length >= SHA1::BLOCKSIZE)
    {
        // Save current hash
        ABCD_SAVED = ABCD;
        E0_SAVED = E0;

        MSG0 = vld1q_u32(data + 0);
        MSG1 = vld1q_u32(data + 4);
        MSG2 = vld1q_u32(data + 8);
        MSG3 = vld1q_u32(data + 12);

        if (order == BIG_ENDIAN_ORDER)  // Data arrangement
        {
            MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
            MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
            MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
            MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
        }

        TMP0 = vaddq_u32(MSG0, C0);
        TMP1 = vaddq_u32(MSG1, C0);

        // Rounds 0-3
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C0);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 4-7
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C0);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 8-11
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C0);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 12-15
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C1);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 16-19
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C1);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 20-23
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C1);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 24-27
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C1);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 28-31
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C1);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 32-35
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C2);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 36-39
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C2);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 40-43
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C2);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 44-47
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C2);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 48-51
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C2);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 52-55
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C3);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 56-59
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C3);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 60-63
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C3);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 64-67
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C3);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 68-71
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C3);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);

        // Rounds 72-75
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);

        // Rounds 76-79
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);

        E0 += E0_SAVED;
        ABCD = vaddq_u32(ABCD_SAVED, ABCD);

        data += SHA1::BLOCKSIZE/sizeof(word32);
        length -= SHA1::BLOCKSIZE;
    }

    // Save state
    vst1q_u32(&state[0], ABCD);
    state[4] = E0;
}
#endif  // CRYPTOPP_ARM_SHA1_AVAILABLE

#if CRYPTOPP_ARM_SHA2_AVAILABLE
void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state);
    CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);

    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP1, TMP2;

    // Load initial values
    STATE0 = vld1q_u32(&state[0]);
    STATE1 = vld1q_u32(&state[4]);

    while (length >= SHA256::BLOCKSIZE)
    {
        // Save current hash
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        // Load message
        MSG0 = vld1q_u32(data + 0);
        MSG1 = vld1q_u32(data + 4);
        MSG2 = vld1q_u32(data + 8);
        MSG3 = vld1q_u32(data + 12);

        if (order == BIG_ENDIAN_ORDER)  // Data arrangement
        {
            MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
            MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
            MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
            MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
        }

        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));

        // Rounds 0-3
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 4-7
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 8-11
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 12-15
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 16-19
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 20-23
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 24-27
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 28-31
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 32-35
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 36-39
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 40-43
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 44-47
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 48-51
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 52-55
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

        // Rounds 56-59
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 60-63
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

        // Add back to state
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);

        data += SHA256::BLOCKSIZE/sizeof(word32);
        length -= SHA256::BLOCKSIZE;
    }

    // Save state
    vst1q_u32(&state[0], STATE0);
    vst1q_u32(&state[4], STATE1);
}
#endif  // CRYPTOPP_ARM_SHA2_AVAILABLE

///////////////////////////////////////////////////////////
// end of Walton, Schneiders, O'Rourke and Hovsmith code //
///////////////////////////////////////////////////////////

// ***************** Power8 SHA ********************

//////////////////////////////////////////////////
// start Gustavo, Serra, Scalet and Walton code //
//////////////////////////////////////////////////

#if CRYPTOPP_POWER8_SHA_AVAILABLE

// Indexes into the S[] array
enum {A=0, B=1, C, D, E, F, G, H};

inline
uint32x4_p VecLoad32(const word32* data, int offset)
{
#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
    const uint32x4_p val = VecLoad(offset, data);
    return (uint32x4_p)VecPermute(val, val, mask);
#else
    return VecLoad(offset, data);
#endif
}

template<class T> inline
void VecStore32(const T data, word32 dest[4])
{
    VecStore(data, dest);
}

inline
uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
{
    // The trick below is due to Andy Polyakov and Jack Lloyd
    return vec_sel(z,y,x);
}
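
// Note on VectorCh above: Ch(x,y,z) = (x & y) ^ (~x & z), i.e. x selects
// bit-wise between y and z, which maps directly onto vec_sel(z, y, x).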

inline
uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
{
    // The trick below is due to Andy Polyakov and Jack Lloyd
    return vec_sel(y, z, VecXor(x, y));
}
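
// Note on VectorMaj above: Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z). Where
// x and y agree the result is that common bit, otherwise it equals z, which
// is exactly vec_sel(y, z, x ^ y).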

inline
uint32x4_p Vector_sigma0(const uint32x4_p val)
{
    return VecSHA256<0,0>(val);
}

inline
uint32x4_p Vector_sigma1(const uint32x4_p val)
{
    return VecSHA256<0,0xf>(val);
}

inline
uint32x4_p VectorSigma0(const uint32x4_p val)
{
    return VecSHA256<1,0>(val);
}

inline
uint32x4_p VectorSigma1(const uint32x4_p val)
{
    return VecSHA256<1,0xf>(val);
}

inline
uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
                      const uint32x4_p c, const uint32x4_p d)
{
    const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
    const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
    return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
}

template <unsigned int R> inline
void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
{
    uint32x4_p T1, T2;

    W[R] = M;
    T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
    T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}
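
// For reference, SHA256_ROUND1 and SHA256_ROUND2 carry out the FIPS 180-4
// SHA-256 compression step using Power8 vector operations:
//   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
//   T2 = Sigma0(a) + Maj(a,b,c)
//   h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
// ROUND1 handles t = 0..15, where W[t] is the message block; ROUND2 handles
// t >= 16, where W[t] is expanded from earlier W values via sigma0/sigma1.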
|
|
|
|
|
2018-11-19 09:50:13 +00:00
|
|
|
template <unsigned int R> inline
|
|
|
|
void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
|
2018-03-10 21:19:11 +00:00
|
|
|
{
|
|
|
|
// Indexes into the W[] array
|
|
|
|
enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
|
|
|
|
|
2018-11-19 09:50:13 +00:00
|
|
|
const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
|
|
|
|
const uint32x4_p s1 = Vector_sigma1(W[IDX14]);
|
2018-03-10 21:19:11 +00:00
|
|
|
|
2018-11-19 09:50:13 +00:00
|
|
|
uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
|
2018-03-10 21:19:11 +00:00
|
|
|
T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
|
2018-11-19 09:50:13 +00:00
|
|
|
uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
|
2018-03-10 21:19:11 +00:00
|
|
|
|
|
|
|
S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
|
|
|
|
S[E] = S[D] + T1;
|
|
|
|
S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
|
|
|
|
S[A] = T1 + T2;
|
|
|
|
}
|
|
|
|
|

void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
    CRYPTOPP_UNUSED(order);

    const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
    const uint32_t* m = reinterpret_cast<const uint32_t*>(data);

    uint32x4_p abcd = VecLoad(state+0);
    uint32x4_p efgh = VecLoad(state+4);
    uint32x4_p W[16], S[8], vm, vk;

    size_t blocks = length / SHA256::BLOCKSIZE;
    while (blocks--)
    {
        unsigned int offset=0;

        S[A] = abcd; S[E] = efgh;
        S[B] = VecShiftLeftOctet<4>(S[A]);
        S[F] = VecShiftLeftOctet<4>(S[E]);
        S[C] = VecShiftLeftOctet<4>(S[B]);
        S[G] = VecShiftLeftOctet<4>(S[F]);
        S[D] = VecShiftLeftOctet<4>(S[C]);
        S[H] = VecShiftLeftOctet<4>(S[G]);

        // Rounds 0-16
        vk = VecLoad(offset, k);
        vm = VecLoad32(m, offset);
        SHA256_ROUND1<0>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<1>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<2>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<3>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad32(m, offset);
        SHA256_ROUND1<4>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<5>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<6>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<7>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad32(m, offset);
        SHA256_ROUND1<8>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<9>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<10>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<11>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad32(m, offset);
        SHA256_ROUND1<12>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<13>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<14>(W,S, vk,vm);

        vk = VecShiftLeftOctet<4>(vk);
        vm = VecShiftLeftOctet<4>(vm);
        SHA256_ROUND1<15>(W,S, vk,vm);

        m += 16; // 32-bit words, not bytes

        // Rounds 16-64
        for (unsigned int i=16; i<64; i+=16)
        {
            vk = VecLoad(offset, k);
            SHA256_ROUND2<0>(W,S, vk);
            SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
            SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
            SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA256_ROUND2<4>(W,S, vk);
            SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
            SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
            SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA256_ROUND2<8>(W,S, vk);
            SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
            SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
            SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA256_ROUND2<12>(W,S, vk);
            SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
            SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
            SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
            offset+=16;
        }

        abcd += VectorPack(S[A],S[B],S[C],S[D]);
        efgh += VectorPack(S[E],S[F],S[G],S[H]);
    }

    VecStore32(abcd, state+0);
    VecStore32(efgh, state+4);
}
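
// A minimal usage sketch (illustrative only; assumes a POWER8 build and a
// caller that feeds whole 64-byte blocks of big-endian message data):
//
//   word32 digest[8] = { /* SHA-256 initial hash values */ };
//   const byte* blocks = ...;   // n * SHA256::BLOCKSIZE bytes
//   SHA256_HashMultipleBlocks_POWER8(digest,
//       reinterpret_cast<const word32*>(blocks),
//       n * SHA256::BLOCKSIZE, BIG_ENDIAN_ORDER);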

inline
void VecStore64(const uint64x2_p val, word64* data)
{
    VecStore(val, data);
}
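
// Note (editor's reading): the state words passed to VecStore64 are kept in
// native word64 order, so no byte reversal is needed on store. Only the
// message loads (VecLoad64 below) must undo the big-endian byte order of the
// input block on little-endian machines.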

inline
uint64x2_p VecLoad64(const word64* data, int offset)
{
#if (CRYPTOPP_LITTLE_ENDIAN)
    // Reverse the bytes within each 64-bit lane so the big-endian
    // message words read correctly on little-endian machines.
    const uint8x16_p mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    return VecPermute(VecLoad(offset, data), mask);
#else
    return VecLoad(offset, data);
#endif
}

inline
uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
{
    // The trick below is due to Andy Polyakov and Jack Lloyd
    return vec_sel(z,y,x);
}

inline
uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
{
    // The trick below is due to Andy Polyakov and Jack Lloyd
    return vec_sel(y, z, VecXor(x, y));
}

inline
uint64x2_p Vector_sigma0(const uint64x2_p val)
{
    return VecSHA512<0,0>(val);
}

inline
uint64x2_p Vector_sigma1(const uint64x2_p val)
{
    return VecSHA512<0,0xf>(val);
}

inline
uint64x2_p VectorSigma0(const uint64x2_p val)
{
    return VecSHA512<1,0>(val);
}

inline
uint64x2_p VectorSigma1(const uint64x2_p val)
{
    return VecSHA512<1,0xf>(val);
}

inline
uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
{
    const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
    return VecPermute(x,y,m);
}
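
// Scalar sketch of the select-based identities used above (illustrative only):
//   Ch(x,y,z)  = (x & y) ^ (~x & z)          -> vec_sel(z, y, x)
//   Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z) -> vec_sel(y, z, x ^ y)
// vec_sel(a, b, m) takes bits from b where m is 1 and from a where m is 0,
// so x selects between y and z for Ch, and x^y marks the disagreeing bits
// for Maj (when x == y the majority is y; otherwise it is z).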

template <unsigned int R> inline
void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
{
    uint64x2_p T1, T2;

    W[R] = M;
    T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
    T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}

template <unsigned int R> inline
void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
{
    // Indexes into the W[] array
    enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};

    const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
    const uint64x2_p s1 = Vector_sigma1(W[IDX14]);

    uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
    T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
    uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}
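
// Note on the SHA-512 layout (editor's reading): uint64x2_p holds two 64-bit
// lanes, so the packed digest is kept as four pairs ab/cd/ef/gh rather than
// two quads. As in the SHA-256 code, only the first lane of each S[A..H]
// working vector is significant during the rounds.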

void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
    CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
    CRYPTOPP_UNUSED(order);

    const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
    const uint64_t* m = reinterpret_cast<const uint64_t*>(data);

    uint64x2_p ab = VecLoad(state+0);
    uint64x2_p cd = VecLoad(state+2);
    uint64x2_p ef = VecLoad(state+4);
    uint64x2_p gh = VecLoad(state+6);
    uint64x2_p W[16], S[8], vm, vk;

    size_t blocks = length / SHA512::BLOCKSIZE;
    while (blocks--)
    {
        unsigned int offset=0;

        S[A] = ab; S[C] = cd;
        S[E] = ef; S[G] = gh;
        S[B] = VecShiftLeftOctet<8>(S[A]);
        S[D] = VecShiftLeftOctet<8>(S[C]);
        S[F] = VecShiftLeftOctet<8>(S[E]);
        S[H] = VecShiftLeftOctet<8>(S[G]);

        // Rounds 0-16
        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<0>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<1>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<2>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<3>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<4>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<5>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<6>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<7>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<8>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<9>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<10>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<11>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<12>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<13>(W,S, vk,vm);

        vk = VecLoad(offset, k);
        vm = VecLoad64(m, offset);
        SHA512_ROUND1<14>(W,S, vk,vm);
        offset+=16;

        vk = VecShiftLeftOctet<8>(vk);
        vm = VecShiftLeftOctet<8>(vm);
        SHA512_ROUND1<15>(W,S, vk,vm);

        m += 16; // 64-bit words, not bytes

        // Rounds 16-80
        for (unsigned int i=16; i<80; i+=16)
        {
            vk = VecLoad(offset, k);
            SHA512_ROUND2<0>(W,S, vk);
            SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<2>(W,S, vk);
            SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<4>(W,S, vk);
            SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<6>(W,S, vk);
            SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<8>(W,S, vk);
            SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<10>(W,S, vk);
            SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<12>(W,S, vk);
            SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;

            vk = VecLoad(offset, k);
            SHA512_ROUND2<14>(W,S, vk);
            SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
            offset+=16;
        }

        ab += VectorPack(S[A],S[B]);
        cd += VectorPack(S[C],S[D]);
        ef += VectorPack(S[E],S[F]);
        gh += VectorPack(S[G],S[H]);
    }

    VecStore64(ab, state+0);
    VecStore64(cd, state+2);
    VecStore64(ef, state+4);
    VecStore64(gh, state+6);
}

#endif  // CRYPTOPP_POWER8_SHA_AVAILABLE

////////////////////////////////////////////////
// end Gustavo, Serra, Scalet and Walton code //
////////////////////////////////////////////////

NAMESPACE_END