Add ARM SHA512 asm implementation from Cryptogams (GH #841, PR #843)

Cryptogams is Andy Polyakov's project used to create high speed crypto algorithms and share them with other developers. Cryptogams has a dual license. The first is the OpenSSL license because Andy contributes to OpenSSL. The second is a BSD license for those who want a more permissive license.

Andy's implementation runs about 45% faster than C/C++ code. Testing on a 1.8 GHz Cortex-A17 shows Cryptogams at 45 cpb, and C++ at 79 cpb.

The integration instructions are documented at [Cryptogams SHA](https://wiki.openssl.org/index.php/Cryptogams_SHA) on the OpenSSL wiki.
This commit is contained in:
Jeffrey Walton 2019-05-19 16:29:45 -04:00 committed by GitHub
parent 4c9ca6b723
commit d38e5a954d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 2013 additions and 43 deletions

View File

@ -318,6 +318,8 @@ sha1_armv4.h
sha1_armv4.S
sha256_armv4.h
sha256_armv4.S
sha512_armv4.h
sha512_armv4.S
sha3.cpp
sha3.h
shacal2.cpp

View File

@ -1059,7 +1059,7 @@ endif
ifeq ($(IS_ARM32),1)
CRYPTOGAMS_ARCH_FLAG = -march=armv7-a
CRYPTOGAMS_ARCH_FLAG += -Wa,--noexecstack
SRCS += aes_armv4.S sha1_armv4.S sha256_armv4.S
SRCS += aes_armv4.S sha1_armv4.S sha256_armv4.S sha512_armv4.S
endif
# List cryptlib.cpp first, then cpu.cpp, then integer.cpp to tame C++ static initialization problems.
@ -1513,6 +1513,10 @@ sha1_armv4.o : sha1_armv4.S
sha256_armv4.o : sha256_armv4.S
$(CC) $(strip $(CXXFLAGS) $(CRYPTOGAMS_ARCH_FLAG) -c) $<
# Cryptogams ARM asm implementation.
sha512_armv4.o : sha512_armv4.S
$(CC) $(strip $(CXXFLAGS) $(CRYPTOGAMS_ARCH_FLAG) -c) $<
sha3_simd.o : sha3_simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SHA3_FLAG) -c) $<

View File

@ -594,7 +594,7 @@ ifeq ($(IS_ARM32),1)
ifneq ($(IS_IOS),1)
CRYPTOGAMS_ARCH_FLAG = -march=armv7-a
CRYPTOGAMS_ARCH_FLAG += -Wa,--noexecstack
SRCS += aes_armv4.S sha1_armv4.S sha256_armv4.S
SRCS += aes_armv4.S sha1_armv4.S sha256_armv4.S sha512_armv4.S
endif
endif
@ -873,6 +873,10 @@ sha1_armv4.o : sha1_armv4.S
sha256_armv4.o : sha256_armv4.S
$(CC) $(strip $(CXXFLAGS) $(CRYPTOGAMS_ARCH_FLAG) -c) $<
# Cryptogams ARM asm implementation.
sha512_armv4.o : sha512_armv4.S
$(CC) $(strip $(CXXFLAGS) $(CRYPTOGAMS_ARCH_FLAG) -c) $<
# SSE4.2/SHA-NI or ARMv8a available
shacal2_simd.o : shacal2_simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<

View File

@ -366,6 +366,7 @@
# define CRYPTOGAMS_ARM_AES 1
# define CRYPTOGAMS_ARM_SHA1 1
# define CRYPTOGAMS_ARM_SHA256 1
# define CRYPTOGAMS_ARM_SHA512 1
# endif
#endif

139
sha.cpp
View File

@ -1,29 +1,38 @@
// sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton
// implemented Intel SHA extensions based on Intel articles and code by
// Sean Gulley. Jeffrey Walton implemented ARM SHA based on ARM code and
// code from Johannes Schneiders, Skip Hovsmith and Barry O'Rourke.
// All code is in the public domain.
// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey
// Walton implemented Intel SHA extensions based on Intel articles and code
// by Sean Gulley. Jeffrey Walton implemented ARM SHA based on ARM code and
// code from Johannes Schneiders, Skip Hovsmith and Barry O'Rourke. All
// code is in the public domain.
// In August 2017 JW reworked the internals to align all the implementations.
// Formerly all hashes were software based, IterHashBase handled endian conversions,
// and IterHashBase dispatched a single to block SHA{N}::Transform. SHA{N}::Transform
// then performed the single block hashing. It was repeated for multiple blocks.
// In August 2017 JW reworked the internals to align all the
// implementations. Formerly all hashes were software based, IterHashBase
// handled endian conversions, and IterHashBase dispatched a single
// block to SHA{N}::Transform. SHA{N}::Transform then performed the single
// block hashing. It was repeated for multiple blocks.
//
// The rework added SHA{N}::HashMultipleBlocks (class) and SHA{N}_HashMultipleBlocks
// (free standing). There are also hardware accelerated variations. Callers enter
// SHA{N}::HashMultipleBlocks (class), and the function calls SHA{N}_HashMultipleBlocks
// (free standing) or SHA{N}_HashBlock (free standing) as a fallback.
// The rework added SHA{N}::HashMultipleBlocks (class) and
// SHA{N}_HashMultipleBlocks (free standing). There are also hardware
// accelerated variations. Callers enter SHA{N}::HashMultipleBlocks (class)
// and the function calls SHA{N}_HashMultipleBlocks (free standing) or
// SHA{N}_HashBlock (free standing) as a fallback.
//
// An added wrinkle is hardware is little endian, C++ is big endian, and callers use
// big endian, so SHA{N}_HashMultipleBlock accepts a ByteOrder for the incoming data
// arrangement. Hardware based SHA{N}_HashMultipleBlock can often perform the endian
// swap much easier by setting an EPI mask. Endian swap incurs no penalty on Intel SHA,
// and 4-instruction penalty on ARM SHA. Under C++ the full software based swap penalty
// is incurred due to use of ReverseBytes().
// An added wrinkle is hardware is little endian, C++ is big endian, and
// callers use big endian, so SHA{N}_HashMultipleBlock accepts a ByteOrder
// for the incoming data arrangement. Hardware based SHA{N}_HashMultipleBlock
// can often perform the endian swap much easier by setting an EPI mask.
// Endian swap incurs no penalty on Intel SHA, and 4-instruction penalty on
// ARM SHA. Under C++ the full software based swap penalty is incurred due
// to use of ReverseBytes().
//
// The rework also removed the hacked-in pointers to implementations.
// In May 2019 JW added Cryptogams ARMv7 and NEON implementations for SHA1,
// SHA256 and SHA512. The Cryptogams code closed a performance gap on modern
// 32-bit ARM devices. Cryptogams is Andy Polyakov's project used to create
// high speed crypto algorithms and share them with other developers. Andy's
// code runs 30% to 50% faster than C/C++ code. The Cryptogams code can be
// disabled in config_asm.h. An example of integrating Andy's code is at
// https://wiki.openssl.org/index.php/Cryptogams_SHA.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
@ -56,7 +65,7 @@ extern void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, siz
extern void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order);
#endif
#if (CRYPTOGAMS_ARM_SHA1)
#if CRYPTOGAMS_ARM_SHA1
extern "C" unsigned int CRYPTOGAMS_armcaps;
extern "C" int sha1_block_data_order(word32* state, const word32 *data, size_t blocks);
#endif
@ -69,7 +78,7 @@ extern void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, siz
extern void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order);
#endif
#if (CRYPTOGAMS_ARM_SHA256)
#if CRYPTOGAMS_ARM_SHA256
extern "C" unsigned int CRYPTOGAMS_armcaps;
extern "C" int sha256_block_data_order(word32* state, const word32 *data, size_t blocks);
#endif
@ -83,6 +92,11 @@ extern void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data,
extern void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order);
#endif
#if CRYPTOGAMS_ARM_SHA512
// Symbols used by the Cryptogams ARM SHA-512 path; both are declared with C
// linkage. CRYPTOGAMS_armcaps is the capability word written by
// CryptogamsArmCaps() below.
extern "C" unsigned int CRYPTOGAMS_armcaps;
// Block function called from SHA512::Transform with a block count of 1.
// NOTE(review): sha512_armv4.h declares this same symbol as returning void
// and taking void pointers; here it is declared as returning int. The callers
// visible in this file ignore the result - confirm which signature is intended.
extern "C" int sha512_block_data_order(word64* state, const word64 *data, size_t blocks);
#endif
// We add extern to export table to sha_simd.cpp, but it
// cleared http://github.com/weidai11/cryptopp/issues/502
extern const word32 SHA256_K[64];
@ -153,6 +167,23 @@ const word32 SHA256_K[64] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
ANONYMOUS_NAMESPACE_BEGIN
#if CRYPTOGAMS_ARM_SHA1 || CRYPTOGAMS_ARM_SHA256 || CRYPTOGAMS_ARM_SHA512
inline bool CryptogamsArmCaps()
{
    // Cryptogams reads its CPU capabilities (ARMv7, NEON, ...) from the
    // global CRYPTOGAMS_armcaps. The storage is allocated in the module,
    // but this library still has to fill it in. Cryptogams defines NEON
    // as bit 0 (1<<0); see ARMV7_NEON.
    unsigned int caps = 0;
    if (CryptoPP::HasNEON())
        caps = (1<<0);

    // Write through a volatile-qualified pointer to the global.
    volatile unsigned int* const flags =
        const_cast<volatile unsigned int*>(&CRYPTOGAMS_armcaps);
    *flags = caps;

    // Always true. The return value exists so callers can use this
    // function as the initializer of a function-local static.
    return true;
}
#endif
ANONYMOUS_NAMESPACE_END
////////////////////////////////
// start of Steve Reid's code //
////////////////////////////////
@ -276,11 +307,19 @@ void SHA1::Transform(word32 *state, const word32 *data)
return;
}
#endif
#if CRYPTOGAMS_ARM_SHA1 && 0
// TODO: convert LE to BE and use Cryptogams code
#if CRYPTOGAMS_ARM_SHA1
    // Single-block path through the Cryptogams ARM implementation.
    if (HasARMv7())
    {
        // The static initializer runs CryptogamsArmCaps() once, which
        // writes the NEON capability bit into CRYPTOGAMS_armcaps.
        static const bool unused = CryptogamsArmCaps();
        CRYPTOPP_UNUSED(unused);

# if defined(CRYPTOPP_LITTLE_ENDIAN)
        // On little-endian builds the block is byte-swapped into a scratch
        // buffer, and it is that swapped copy which must be hashed. Passing
        // the unswapped 'data' would leave dataBuf unused and feed the block
        // function the wrong byte order. This matches the SHA-512 Transform
        // path, which passes its swapped buffer.
        word32 dataBuf[16];
        ByteReverse(dataBuf, data, SHA1::BLOCKSIZE);
        sha1_block_data_order(state, dataBuf, 1);
# else
        sha1_block_data_order(state, data, 1);
# endif
        return;
    }
#endif
@ -310,11 +349,7 @@ size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length)
#if CRYPTOGAMS_ARM_SHA1
if (HasARMv7())
{
// The Cryptogams code uses a global variable named CRYPTOGAMS_armcaps
// for capabilities like ARMv7 and NEON. Storage is allocated in the
// module. We still need to set CRYPTOGAMS_armcaps accordingly.
// The Cryptogams code defines NEON as 1<<0; see ARMV7_NEON.
static const unsigned int unused = CRYPTOGAMS_armcaps = HasNEON() ? (1<<0) : 0;
static const bool unused = CryptogamsArmCaps();
CRYPTOPP_UNUSED(unused);
sha1_block_data_order(m_state, input, length / SHA1::BLOCKSIZE);
@ -823,11 +858,19 @@ void SHA256::Transform(word32 *state, const word32 *data)
return;
}
#endif
#if CRYPTOGAMS_ARM_SHA256 && 0
// TODO: convert LE to BE and use Cryptogams code
#if CRYPTOGAMS_ARM_SHA256
    // Single-block path through the Cryptogams ARM implementation.
    if (HasARMv7())
    {
        // The static initializer runs CryptogamsArmCaps() once, which
        // writes the NEON capability bit into CRYPTOGAMS_armcaps.
        static const bool unused = CryptogamsArmCaps();
        CRYPTOPP_UNUSED(unused);

# if defined(CRYPTOPP_LITTLE_ENDIAN)
        // On little-endian builds the block is byte-swapped into a scratch
        // buffer, and it is that swapped copy which must be hashed. Passing
        // the unswapped 'data' would leave dataBuf unused and feed the block
        // function the wrong byte order. This matches the SHA-512 Transform
        // path, which passes its swapped buffer.
        word32 dataBuf[16];
        ByteReverse(dataBuf, data, SHA256::BLOCKSIZE);
        sha256_block_data_order(state, dataBuf, 1);
# else
        sha256_block_data_order(state, data, 1);
# endif
        return;
    }
#endif
@ -872,11 +915,7 @@ size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
#if CRYPTOGAMS_ARM_SHA256
if (HasARMv7())
{
// The Cryptogams code uses a global variable named CRYPTOGAMS_armcaps
// for capabilities like ARMv7 and NEON. Storage is allocated in the
// module. We still need to set CRYPTOGAMS_armcaps accordingly.
// The Cryptogams code defines NEON as 1<<0; see ARMV7_NEON.
static const unsigned int unused = CRYPTOGAMS_armcaps = HasNEON() ? (1<<0) : 0;
static const bool unused = CryptogamsArmCaps();
CRYPTOPP_UNUSED(unused);
sha256_block_data_order(m_state, input, length / SHA256::BLOCKSIZE);
@ -942,11 +981,7 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
#if CRYPTOGAMS_ARM_SHA256
if (HasARMv7())
{
// The Cryptogams code uses a global variable named CRYPTOGAMS_armcaps
// for capabilities like ARMv7 and NEON. Storage is allocated in the
// module. We still need to set CRYPTOGAMS_armcaps accordingly.
// The Cryptogams code defines NEON as 1<<0; see ARMV7_NEON.
static const unsigned int unused = CRYPTOGAMS_armcaps = HasNEON() ? (1<<0) : 0;
static const bool unused = CryptogamsArmCaps();
CRYPTOPP_UNUSED(unused);
sha256_block_data_order(m_state, input, length / SHA256::BLOCKSIZE);
@ -997,6 +1032,12 @@ std::string SHA512_AlgorithmProvider()
if (HasSSE2())
return "SSE2";
#endif
#if CRYPTOGAMS_ARM_SHA512
// Cryptogams ARM build: NEON is tested first, so "ARMv7" is reported only
// when HasNEON() is false and HasARMv7() is true.
if (HasNEON())
return "NEON";
if (HasARMv7())
return "ARMv7";
#endif
#if (CRYPTOPP_POWER8_SHA_AVAILABLE)
if (HasSHA512())
return "Power8";
@ -1303,6 +1344,22 @@ void SHA512::Transform(word64 *state, const word64 *data)
return;
}
#endif
#if CRYPTOGAMS_ARM_SHA512
    // Single-block path through the Cryptogams ARM implementation.
    if (HasARMv7())
    {
        // Runs CryptogamsArmCaps() once via static initialization so that
        // CRYPTOGAMS_armcaps is populated before the block function is used.
        static const bool capsReady = CryptogamsArmCaps();
        CRYPTOPP_UNUSED(capsReady);

# if defined(CRYPTOPP_LITTLE_ENDIAN)
        // Little-endian build: hash a byte-swapped copy of the block.
        word64 swapped[16];
        ByteReverse(swapped, data, SHA512::BLOCKSIZE);
        const word64* const block = swapped;
# else
        // Otherwise the caller's words are passed through unchanged.
        const word64* const block = data;
# endif

        sha512_block_data_order(state, block, 1);
        return;
    }
#endif
#if CRYPTOPP_POWER8_SHA_AVAILABLE
if (HasSHA512())
{

1881
sha512_armv4.S Normal file

File diff suppressed because it is too large Load Diff

21
sha512_armv4.h Normal file
View File

@ -0,0 +1,21 @@
/* Header file for use with Cryptogams' ARMv4 SHA512. */
/* Also see http://www.openssl.org/~appro/cryptogams/ */
/* https://wiki.openssl.org/index.php/Cryptogams_SHA. */
#ifndef CRYPTOGAMS_SHA512_ARMV4_H
#define CRYPTOGAMS_SHA512_ARMV4_H

/* size_t is used in the prototype below; include its definition so the */
/* header compiles regardless of what the including file pulled in.     */
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/* SHA-512 block function; implemented in sha512_armv4.S.              */
/*   state  - hash state, passed by sha.cpp as a word64 array          */
/*   data   - input message blocks                                     */
/*   blocks - number of blocks to process                              */
/* NOTE(review): sha.cpp declares this symbol with word64 pointers and */
/* an int return type; confirm which declaration is authoritative.     */
void sha512_block_data_order(void *state, const void *data, size_t blocks);

/* Cryptogams arm caps: NEON capability bit stored in CRYPTOGAMS_armcaps. */
#define ARMV7_NEON (1<<0)

#ifdef __cplusplus
}
#endif
#endif /* CRYPTOGAMS_SHA512_ARMV4_H */