Split source files to support Base Implementation + SIMD implementation (GH #461)

Jeffrey Walton 2017-08-17 12:33:43 -04:00 committed by GitHub
parent 74d21068a1
commit e2c377effd
38 changed files with 6626 additions and 6742 deletions

View File

@@ -3,7 +3,7 @@
# I have to admit it's a bit complex and I don't fully understand it.
version: 1.0.{build}
clone_depth: 1
clone_depth: 3
skip_tags: true
configuration:

View File

@@ -10,7 +10,9 @@ algparam.cpp
algparam.h
arc4.cpp
arc4.h
ariatab.cpp
aria.cpp
aria-simd.cpp
aria.h
argnames.h
asn.cpp
@@ -29,6 +31,7 @@ bench1.cpp
bench2.cpp
bfinit.cpp
blake2.cpp
blake2-simd.cpp
blake2.h
blowfish.cpp
blowfish.h
@@ -53,6 +56,7 @@ config.h
cpu.cpp
cpu.h
crc.cpp
crc-simd.cpp
crc.h
cryptdll.vcxproj
cryptdll.vcxproj.filters
@@ -177,6 +181,7 @@ mqv.cpp
mqv.h
nbtheory.cpp
nbtheory.h
neon.cpp
network.cpp
network.h
nr.h
@@ -225,6 +230,7 @@ regtest2.cpp
regtest3.cpp
resource.h
rijndael.cpp
rijndael-simd.cpp
rijndael.h
ripemd.cpp
ripemd.h
@@ -248,10 +254,12 @@ serpent.cpp
serpent.h
serpentp.h
sha.cpp
sha-simd.cpp
sha.h
sha3.cpp
sha3.h
shacal2.cpp
shacal2-simd.cpp
shacal2.h
shark.cpp
shark.h

View File

@@ -2,6 +2,12 @@
##### System Attributes and Programs #####
###########################################################
# If needed
TMPDIR ?= /tmp
# Used for ARMv7 and NEON.
FP_ABI ?= hard
# Command and arguments
AR ?= ar
ARFLAGS ?= -cr # ar needs the dash on OpenBSD
RANLIB ?= ranlib
@@ -19,8 +25,10 @@ UNAME := $(shell uname)
IS_X86 := $(shell uname -m | $(EGREP) -v "x86_64" | $(EGREP) -i -c "i.86|x86|i86")
IS_X64 := $(shell uname -m | $(EGREP) -i -c "(_64|d64)")
IS_PPC := $(shell uname -m | $(EGREP) -i -c "ppc|power")
IS_ARM32 := $(shell uname -m | $(EGREP) -i -c "arm")
IS_ARM32 := $(shell uname -m | $(EGREP) -v "arm64" | $(EGREP) -i -c "arm")
IS_ARM64 := $(shell uname -m | $(EGREP) -i -c "aarch64")
IS_ARMV8 ?= $(shell uname -m | $(EGREP) -i -c 'aarch32|aarch64')
IS_NEON ?= $(shell uname -m | $(EGREP) -i -c 'armv7|armv8|aarch32|aarch64')
IS_SPARC := $(shell uname -m | $(EGREP) -i -c "sparc")
IS_SPARC64 := $(shell uname -m | $(EGREP) -i -c "sparc64")
@@ -191,6 +199,21 @@ endif # -DCRYPTOPP_DISABLE_SSSE3
endif # -DCRYPTOPP_DISABLE_ASM
endif # CXXFLAGS
SSSE3_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -mssse3 -dM -E - 2>/dev/null | grep -i -c -q __SSSE3__ && echo "-mssse3")
ARIA_FLAG = $(SSSE3_FLAG)
ifeq ($(findstring -DCRYPTOPP_DISABLE_SSE4,$(CXXFLAGS)),)
SSE42_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.2 -dM -E - 2>/dev/null | grep -i -c -q __SSE4_2__ && echo "-msse4.2")
ifeq ($(findstring -DCRYPTOPP_DISABLE_AESNI,$(CXXFLAGS)),)
GCM_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -mssse3 -mpclmul -dM -E - 2>/dev/null | grep -i -c -q __PCLMUL__ && echo "-mssse3 -mpclmul")
AES_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.1 -maes -dM -E - 2>/dev/null | grep -i -c -q __AES__ && echo "-msse4.1 -maes")
ifeq ($(findstring -DCRYPTOPP_DISABLE_SHA,$(CXXFLAGS)),)
SHA_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.2 -msha -dM -E - 2>/dev/null | grep -i -c -q __SHA__ && echo "-msse4.2 -msha")
BLAKE2_FLAG = $(SSE42_FLAG)
CRC_FLAG = $(SSE42_FLAG)
endif
endif
endif
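Each probe above compiles an empty translation unit with the candidate flag and greps the preprocessor's predefined macros (-dM -E); the ARM probes further down use the same trick. A throwaway C++ program makes the mechanism visible (illustrative only; the macro names are exactly what the probes grep for):

#include <cstdio>

int main()
{
    // Each macro is predefined by GCC/Clang only when the matching
    // -m option or -march extension is in effect.
#ifdef __SSSE3__
    std::printf("__SSSE3__ defined (-mssse3 accepted)\n");
#endif
#ifdef __SSE4_2__
    std::printf("__SSE4_2__ defined (-msse4.2 accepted)\n");
#endif
#ifdef __PCLMUL__
    std::printf("__PCLMUL__ defined (-mpclmul accepted)\n");
#endif
#ifdef __AES__
    std::printf("__AES__ defined (-maes accepted)\n");
#endif
#ifdef __SHA__
    std::printf("__SHA__ defined (-msha accepted)\n");
#endif
#ifdef __ARM_NEON
    std::printf("__ARM_NEON defined (NEON available)\n");
#endif
#ifdef __ARM_FEATURE_CRC32
    std::printf("__ARM_FEATURE_CRC32 defined (+crc accepted)\n");
#endif
#ifdef __ARM_FEATURE_CRYPTO
    std::printf("__ARM_FEATURE_CRYPTO defined (+crypto accepted)\n");
#endif
    return 0;
}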
# BEGIN_NATIVE_ARCH
# Guard use of -march=native (or -m{32|64} on some platforms)
# Don't add anything if -march=XXX or -mtune=XXX is specified
@@ -280,6 +303,26 @@ CXXFLAGS += -pipe
endif
endif
ifeq ($(IS_NEON),1)
NEON_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon -dM -E - 2>/dev/null | grep -i -c -q __ARM_NEON && echo "-march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon")
GCM_FLAG = $(NEON_FLAG)
ARIA_FLAG = $(NEON_FLAG)
BLAKE2_FLAG = $(NEON_FLAG)
endif
ifeq ($(IS_ARMV8),1)
ARMV8A_NEON_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a -dM -E - 2>/dev/null | grep -i -c -q __ARM_NEON && echo "-march=armv8-a")
ARMV8A_CRC_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | grep -i -c -q __ARM_FEATURE_CRC32 && echo "-march=armv8-a+crc")
ARMV8A_CRYPTO_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crypto -dM -E - 2>/dev/null | grep -i -c -q __ARM_FEATURE_CRYPTO && echo "-march=armv8-a+crypto")
CRC_FLAG = $(ARMV8A_CRC_FLAG)
AES_FLAG = $(ARMV8A_CRYPTO_FLAG)
GCM_FLAG = $(ARMV8A_CRYPTO_FLAG)
SHA_FLAG = $(ARMV8A_CRYPTO_FLAG)
ARIA_FLAG = $(ARMV8A_NEON_FLAG)
BLAKE2_FLAG = $(ARMV8A_NEON_FLAG)
NEON_FLAG = $(ARMV8A_NEON_FLAG)
endif
endif # IS_X86
###########################################################
@@ -287,7 +330,7 @@ endif # IS_X86
###########################################################
# For SunOS, create a Mapfile that allows our object files
# to cantain additional bits (like SSE4 and AES on old Xeon)
# to contain additional bits (like SSE4 and AES on old Xeon)
# http://www.oracle.com/technetwork/server-storage/solaris/hwcap-modification-139536.html
ifeq ($(IS_SUN)$(SUN_COMPILER),11)
ifneq ($(IS_X86)$(IS_X32)$(IS_X64),000)
@@ -527,12 +570,13 @@ endif
endif # Nasm
# List test.cpp first to tame C++ static initialization problems.
TESTSRCS := adhoc.cpp test.cpp bench1.cpp bench2.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp fipsalgt.cpp dlltest.cpp
TESTSRCS := adhoc.cpp test.cpp bench1.cpp bench2.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp dlltest.cpp fipsalgt.cpp
TESTOBJS := $(TESTSRCS:.cpp=.o)
LIBOBJS := $(filter-out $(TESTOBJS),$(OBJS))
# List cryptlib.cpp first, then cpu.cpp, then integer.cpp to tame C++ static initialization problems.
DLLSRCS := cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp network.cpp oaep.cpp ospstore.cpp osrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha.cpp sha3.cpp shacal2.cpp shark.cpp sharkbox.cpp skipjack.cpp socketft.cpp sosemanuk.cpp square.cpp squaretb.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp
DLLSRCS := cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon.cpp network.cpp oaep.cpp ospstore.cpp osrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp skipjack.cpp socketft.cpp sosemanuk.cpp square.cpp squaretb.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp
DLLOBJS := $(DLLSRCS:.cpp=.export.o)
# Import lib testing
@@ -555,8 +599,8 @@ static: libcryptopp.a
shared dynamic: libcryptopp.so$(SOLIB_VERSION_SUFFIX)
endif
.PHONY: deps
deps GNUmakefile.deps:
.PHONY: dep deps depend
dep deps depend GNUmakefile.deps:
$(CXX) $(strip $(CXXFLAGS)) -MM *.cpp > GNUmakefile.deps
# CXXFLAGS are tuned earlier.
@@ -752,13 +796,8 @@ ifeq ($(wildcard Filelist.txt),Filelist.txt)
DIST_FILES := $(shell cat Filelist.txt)
endif
.PHONY: appveyor
appveyor:
sed 's|Toolset>v100|Toolset>$$(DefaultPlatformToolset)|g' cryptlib.vcxproj > TestScripts/cryptlib.vcxproj
sed 's|Toolset>v100|Toolset>$$(DefaultPlatformToolset)|g' cryptest.vcxproj > TestScripts/cryptest.vcxproj
.PHONY: trim
trim: appveyor
trim:
ifneq ($(IS_DARWIN),0)
sed -i '' -e's/[[:space:]]*$$//' *.sh .*.yml *.h *.cpp *.asm *.s *.sln *.vcxproj *.filters GNUmakefile GNUmakefile-cross
sed -i '' -e's/[[:space:]]*$$//' TestData/*.dat TestVectors/*.txt TestScripts/*.*
@@ -823,58 +862,90 @@ endif # Dependencies
# Run rdrand-nasm.sh to create the object files
ifeq ($(USE_NASM),1)
rdrand.o: rdrand.h rdrand.cpp rdrand.s
$(CXX) $(strip $(CXXFLAGS)) -DNASM_RDRAND_ASM_AVAILABLE=1 -DNASM_RDSEED_ASM_AVAILABLE=1 -c rdrand.cpp
$(CXX) $(strip $(CXXFLAGS) -DNASM_RDRAND_ASM_AVAILABLE=1 -DNASM_RDSEED_ASM_AVAILABLE=1 -c rdrand.cpp)
rdrand-%.o:
./rdrand-nasm.sh
endif
# SSSE3 or NEON available
aria-simd.o : aria-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(ARIA_FLAG) -c) $<
# NEON available
neon.o : neon.cpp
$(CXX) $(strip $(CXXFLAGS) $(NEON_FLAG) -c) $<
# SSE4.2 or ARMv8a available
blake2-simd.o : blake2-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(BLAKE2_FLAG) -c) $<
# SSE4.2 or ARMv8a available
crc-simd.o : crc-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(CRC_FLAG) -c) $<
# PCLMUL or ARMv7a/ARMv8a available
gcm-simd.o : gcm-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<
# AESNI or ARMv7a/ARMv8a available
rijndael-simd.o : rijndael-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(AES_FLAG) -c) $<
# SSE4.2/SHA-NI or ARMv8a available
sha-simd.o : sha-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<
# SSE4.2/SHA-NI or ARMv8a available
shacal2-simd.o : shacal2-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<
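The per-file flags above only let the intrinsics compile; whether the instructions may execute is decided at run time by the feature queries in cpu.h. A minimal sketch of the dispatch pattern the split enables (the extern matches the declaration this commit adds to aria.cpp; the wrapper name and elided fallback are illustrative):

#include "config.h"
#include "cpu.h"   // HasSSSE3() and friends

NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_SSSE3_AVAILABLE)
// Defined in aria-simd.cpp, which is compiled with $(ARIA_FLAG) so the
// SSSE3 intrinsics are legal there even in a baseline build.
extern void ARIA_ProcessAndXorBlock_Xor_SSSE3(const byte* xorBlock,
    byte* outBlock, const byte *rk, word32 *t);
#endif

void XorBlockDispatch(const byte* xorBlock, byte* outBlock,
    const byte *rk, word32 *t)
{
#if (CRYPTOPP_SSSE3_AVAILABLE)
    // Run-time CPU check: the SSSE3 path is taken only on capable hardware.
    if (HasSSSE3())
        return ARIA_ProcessAndXorBlock_Xor_SSSE3(xorBlock, outBlock, rk, t);
#endif
    // ... portable C++ fallback ...
}

NAMESPACE_END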
# Don't build Threefish with UBsan on Travis CI. Timeouts cause the build to fail.
# Also see https://stackoverflow.com/q/12983137/608639.
ifeq ($(findstring true,$(CI)),true)
threefish.o : threefish.cpp
$(CXX) $(strip $(subst -fsanitize=undefined,,$(CXXFLAGS))) -c $<
$(CXX) $(strip $(subst -fsanitize=undefined,,$(CXXFLAGS)) -c) $<
endif
# Don't build Rijndael with UBsan. Too much noise due to unaligned data accesses.
ifneq ($(findstring -fsanitize=undefined,$(CXXFLAGS)),)
rijndael.o : rijndael.cpp
$(CXX) $(strip $(subst -fsanitize=undefined,,$(CXXFLAGS))) -c $<
$(CXX) $(strip $(subst -fsanitize=undefined,,$(CXXFLAGS)) -c) $<
endif
# Don't build VMAC and friends with Asan. Too many false positives.
ifneq ($(findstring -fsanitize=address,$(CXXFLAGS)),)
vmac.o : vmac.cpp
$(CXX) $(strip $(subst -fsanitize=address,,$(CXXFLAGS))) -c $<
$(CXX) $(strip $(subst -fsanitize=address,,$(CXXFLAGS)) -c) $<
endif
# Only use CRYPTOPP_DATA_DIR if it's not set in CXXFLAGS
ifeq ($(findstring -DCRYPTOPP_DATA_DIR, $(strip $(CXXFLAGS))),)
ifneq ($(strip $(CRYPTOPP_DATA_DIR)),)
validat%.o : validat%.cpp
$(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c $<
$(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c) $<
bench%.o : bench%.cpp
$(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c $<
$(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c) $<
datatest.o : datatest.cpp
$(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c $<
$(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c) $<
test.o : test.cpp
$(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c $<
$(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c) $<
endif
endif
%.dllonly.o : %.cpp
$(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_DLL_ONLY -c $< -o $@
$(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_DLL_ONLY -c) $< -o $@
%.import.o : %.cpp
$(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_IMPORTS -c $< -o $@
$(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_IMPORTS -c) $< -o $@
%.export.o : %.cpp
$(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_EXPORTS -c $< -o $@
$(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_EXPORTS -c) $< -o $@
%.bc : %.cpp
$(CXX) $(strip $(CXXFLAGS)) -c $<
$(CXX) $(strip $(CXXFLAGS) -c) $<
%.o : %.cpp
$(CXX) $(strip $(CXXFLAGS)) -c $<
$(CXX) $(strip $(CXXFLAGS) -c) $<
.PHONY: so_warning
so_warning:

TestScripts/change-version.sh Normal file → Executable file
View File

View File

@@ -1171,37 +1171,75 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
echo
OBJFILE=sha.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
X86_SSE2=$(echo -n "$X86_CPU_FLAGS" | "$GREP" -i -c sse2)
X86_SHA256_HASH_BLOCKS=$(echo -n "$DISASS_TEXT" | "$EGREP" -c 'X86_SHA256_HashBlocks')
X86_SHA256_HASH_BLOCKS=$(echo -n "$DISASS_TEXT" | "$EGREP" -c 'SHA256_HashMultipleBlocks_SSE2')
if [[ ("$X86_SHA256_HASH_BLOCKS" -ne "0") ]]; then
COUNT=$(echo -n "$DISASS_TEXT" | "$EGREP" -i -c '(rol.*0x|ror.*0x)')
if [[ ("$COUNT" -le "600") ]]; then
if [[ ("$COUNT" -le "250") ]]; then
FAILED=1
echo "ERROR: failed to generate rotate immediate instruction (X86_SHA256_HashBlocks)" | tee -a "$TEST_RESULTS"
echo "ERROR: failed to generate rotate immediate instruction (SHA256_HashMultipleBlocks_SSE2)" | tee -a "$TEST_RESULTS"
fi
else
COUNT=$(echo -n "$DISASS_TEXT" | "$EGREP" -i -c '(rol.*0x|ror.*0x)')
if [[ ("$COUNT" -le "1000") ]]; then
if [[ ("$COUNT" -le "500") ]]; then
FAILED=1
echo "ERROR: failed to generate rotate immediate instruction" | tee -a "$TEST_RESULTS"
fi
fi
if [[ ("$X86_SSE2" -ne "0" && "$X86_SHA256_HASH_BLOCKS" -eq "0") ]]; then
echo "ERROR: failed to use X86_SHA256_HashBlocks" | tee -a "$TEST_RESULTS"
echo "ERROR: failed to use SHA256_HashMultipleBlocks_SSE2" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0" && "$X86_SHA256_HASH_BLOCKS" -ne "0") ]]; then
echo "Verified rotate immediate machine instructions (X86_SHA256_HashBlocks)" | tee -a "$TEST_RESULTS"
echo "Verified rotate immediate machine instructions (SHA256_HashMultipleBlocks_SSE2)" | tee -a "$TEST_RESULTS"
elif [[ ("$FAILED" -eq "0") ]]; then
echo "Verified rotate immediate machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
############################################
# Test CRC-32C code generation
"$CXX" -DCRYPTOPP_ADHOC_MAIN -msse4.2 adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
X86_CRC32=1
fi
if [[ ("$X86_CRC32" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: X86 CRC32 code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=crc-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32l)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate crc32l instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified crc32b and crc32l machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
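The crc32b and crc32l mnemonics grepped above are what the SSE4.2 CRC intrinsics compile down to. A small illustrative routine (not the library's implementation), built with -msse4.2:

#include <nmmintrin.h>   // SSE4.2 CRC32 intrinsics
#include <cstddef>
#include <cstdint>

uint32_t crc32c_update(uint32_t crc, const uint8_t* data, size_t len)
{
    size_t i = 0;
    for (; i + 4 <= len; i += 4)   // compiles to crc32l
        crc = _mm_crc32_u32(crc, *reinterpret_cast<const uint32_t*>(data + i));
    for (; i < len; ++i)           // compiles to crc32b
        crc = _mm_crc32_u8(crc, data[i]);
    return crc;
}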
############################################
# Test AES-NI code generation
@@ -1216,8 +1254,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
echo "Testing: X86 AES-NI code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=rijndael.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
OBJFILE=rijndael-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
@@ -1278,8 +1316,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
echo "Testing: X86 carryless multiply code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=gcm.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
OBJFILE=gcm-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
@@ -1321,7 +1359,7 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
echo
OBJFILE=rdrand.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
@@ -1348,44 +1386,6 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
fi
fi
############################################
# X86 CRC32 code generation
"$CXX" -DCRYPTOPP_ADHOC_MAIN -msse4.2 adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
X86_CRC32=1
fi
if [[ ("$X86_CRC32" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: X86 CRC32 code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=crc.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32l)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate crc32l instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified crc32l and crc32b machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
############################################
# X86 SHA code generation
@@ -1400,8 +1400,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
echo "Testing: X86 SHA code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=sha.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
OBJFILE=sha-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
@@ -1469,7 +1469,7 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]
echo "Testing: ARM NEON code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=aria.o; rm -f "$OBJFILE" 2>/dev/null
OBJFILE=aria-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
@@ -1516,51 +1516,21 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]
fi
fi
############################################
# ARM carryless multiply code generation
ARM_PMULL=$(echo -n "$ARM_CPU_FLAGS" | "$GREP" -i -c pmull)
if [[ ("$ARM_PMULL" -ne "0" || "$HAVE_ARM_CRYPTO" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: ARM carryless multiply code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=gcm.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -v pmull2 | "$GREP" -i -c pmull)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate pmull instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c pmull2)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate pmull2 instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified pmull and pmull2 machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
############################################
# ARM CRC32 code generation
ARM_CRC32=$(echo -n "$ARM_CPU_FLAGS" | "$GREP" -i -c crc32)
if [[ ("$ARM_CRC32" -ne "0") ]]; then
"$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crc adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
ARM_CRC32=1
fi
if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_CRC32" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: ARM CRC32 code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=crc.o; rm -f "$OBJFILE" 2>/dev/null
OBJFILE=crc-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
@@ -1595,6 +1565,179 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]
echo "Verified crc32cb, crc32cw, crc32b and crc32w machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
############################################
# ARM carryless multiply code generation
"$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
ARM_PMULL=1
fi
if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_PMULL" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: ARM carryless multiply code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=gcm-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -v pmull2 | "$GREP" -i -c pmull)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate pmull instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c pmull2)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate pmull2 instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified pmull and pmull2 machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
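The pmull and pmull2 greps correspond to the ACLE carryless-multiply intrinsics that GCM uses for GHASH. An illustrative fragment (not the library's code), built with -march=armv8-a+crypto:

#include <arm_neon.h>

// 64x64 -> 128-bit carryless products: the compiler emits PMULL for the
// low lanes and PMULL2 for the high lanes.
poly128_t clmul_low(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64(vgetq_lane_p64(a, 0), vgetq_lane_p64(b, 0));
}

poly128_t clmul_high(poly64x2_t a, poly64x2_t b)
{
    return vmull_high_p64(a, b);
}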
############################################
# ARM AES code generation
"$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
ARM_AES=1
fi
if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_AES" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: ARM AES generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=rijndael-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aese)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate aese instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesmc)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate aesmc instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesd)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate aesd instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesimc)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate aesimc instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified aese, aesd, aesmc, aesimc machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
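The four mnemonics checked above come from the ARMv8 AES intrinsics; one encryption round pairs AESE with AESMC (illustrative; built with -march=armv8-a+crypto):

#include <arm_neon.h>

// AESE performs AddRoundKey+SubBytes+ShiftRows and AESMC the MixColumns;
// decryption uses AESD and AESIMC the same way.
uint8x16_t aes_enc_round(uint8x16_t state, uint8x16_t roundkey)
{
    return vaesmcq_u8(vaeseq_u8(state, roundkey));
}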
############################################
# ARM SHA code generation
"$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
ARM_SHA=1
fi
if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_SHA" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: ARM SHA generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=sha-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1c)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1c instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1m)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1m instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1p)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1p instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1h)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1h instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1su0)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1su0 instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1su1)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1su1 instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -v sha256h2 | "$GREP" -i -c sha256h)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha256h instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256h2)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha256h2 instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256su0)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha256su0 instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256su1)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha256su1 instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified sha1c, sha1m, sha1p, sha1su0, sha1su1, sha256h, sha256h2, sha256su0, sha256su1 machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
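The sha1* and sha256* mnemonics map one-to-one onto the ACLE SHA intrinsics. One SHA-256 compression step over four message words looks like this (illustrative; not the library's message scheduler), built with -march=armv8-a+crypto:

#include <arm_neon.h>

// SHA256H updates the ABCD half of the state and SHA256H2 the EFGH half;
// 'wk' holds four message words already added to the round constants.
void sha256_step(uint32x4_t& abcd, uint32x4_t& efgh, uint32x4_t wk)
{
    const uint32x4_t abcd_prev = abcd;
    abcd = vsha256hq_u32(abcd, efgh, wk);        // emits sha256h
    efgh = vsha256h2q_u32(efgh, abcd_prev, wk);  // emits sha256h2
}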
fi
############################################

aria-simd.cpp Normal file
View File

@@ -0,0 +1,143 @@
// aria-simd.cpp - written and placed in the public domain by
// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
// This source file uses intrinsics to gain access to ARMv7a and
// ARMv8a NEON instructions. A separate source file is needed
// because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
#include "pch.h"
#include "config.h"
#include "misc.h"
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "arm_neon.h"
#endif
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "tmmintrin.h"
#endif
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(ARIATab)
extern const word32 S1[256];
extern const word32 S2[256];
extern const word32 X1[256];
extern const word32 X2[256];
extern const word32 KRK[3][4];
NAMESPACE_END
NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
using CryptoPP::ARIATab::S1;
using CryptoPP::ARIATab::S2;
using CryptoPP::ARIATab::X1;
using CryptoPP::ARIATab::X2;
using CryptoPP::ARIATab::KRK;
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
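// ARIA_GSRK_NEON computes the N-bit right rotation of Y XORed into X
// (the scalar equivalent, ARIA_GSRK, remains in aria.cpp): Q1/Q2 select
// the whole-word part of the rotation via vextq_u32 and R supplies the
// remaining bit shift, e.g. N=67 gives Q1=2, Q2=1, R=3.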
template <unsigned int N>
inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16])
{
static const unsigned int Q1 = (4-(N/32)) % 4;
static const unsigned int Q2 = (3-(N/32)) % 4;
static const unsigned int R = N % 32;
vst1q_u32(reinterpret_cast<uint32_t*>(RK),
veorq_u32(X, veorq_u32(
vshrq_n_u32(vextq_u32(Y, Y, Q1), R),
vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R))));
}
void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen)
{
const uint32x4_t w0 = vld1q_u32((const uint32_t*)(ws+ 0));
const uint32x4_t w1 = vld1q_u32((const uint32_t*)(ws+ 8));
const uint32x4_t w2 = vld1q_u32((const uint32_t*)(ws+12));
const uint32x4_t w3 = vld1q_u32((const uint32_t*)(ws+16));
ARIA_GSRK_NEON<19>(w0, w1, rk + 0);
ARIA_GSRK_NEON<19>(w1, w2, rk + 16);
ARIA_GSRK_NEON<19>(w2, w3, rk + 32);
ARIA_GSRK_NEON<19>(w3, w0, rk + 48);
ARIA_GSRK_NEON<31>(w0, w1, rk + 64);
ARIA_GSRK_NEON<31>(w1, w2, rk + 80);
ARIA_GSRK_NEON<31>(w2, w3, rk + 96);
ARIA_GSRK_NEON<31>(w3, w0, rk + 112);
ARIA_GSRK_NEON<67>(w0, w1, rk + 128);
ARIA_GSRK_NEON<67>(w1, w2, rk + 144);
ARIA_GSRK_NEON<67>(w2, w3, rk + 160);
ARIA_GSRK_NEON<67>(w3, w0, rk + 176);
ARIA_GSRK_NEON<97>(w0, w1, rk + 192);
if (keylen > 16)
{
ARIA_GSRK_NEON<97>(w1, w2, rk + 208);
ARIA_GSRK_NEON<97>(w2, w3, rk + 224);
if (keylen > 24)
{
ARIA_GSRK_NEON< 97>(w3, w0, rk + 240);
ARIA_GSRK_NEON<109>(w0, w1, rk + 256);
}
}
}
void ARIA_ProcessAndXorBlock_Xor_NEON(const byte* xorBlock, byte* outBlock)
{
vst1q_u32(reinterpret_cast<uint32_t*>(outBlock), veorq_u32(
vld1q_u32(reinterpret_cast<const uint32_t*>(outBlock)),
vld1q_u32(reinterpret_cast<const uint32_t*>(xorBlock))));
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_SSSE3_AVAILABLE)
inline byte ARIA_BRF(const word32 x, const int y) {
return GETBYTE(x, y);
}
void ARIA_ProcessAndXorBlock_Xor_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t)
{
const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
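// MASK reverses the bytes within each 32-bit lane (0x00010203 becomes
// 0x03020100), so the shuffled load of 'rk' below is effectively a
// byte-swapped (big-endian) load of the round key words.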
outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
// 'outBlock' may be unaligned.
_mm_storeu_si128(reinterpret_cast<__m128i*>(outBlock),
_mm_xor_si128(_mm_loadu_si128((const __m128i*)(outBlock)),
_mm_shuffle_epi8(_mm_load_si128((const __m128i*)(rk)), MASK)));
// 'outBlock' and 'xorBlock' may be unaligned.
if (xorBlock != NULLPTR)
{
_mm_storeu_si128((__m128i*)(outBlock),
_mm_xor_si128(
_mm_loadu_si128((const __m128i*)(outBlock)),
_mm_loadu_si128((const __m128i*)(xorBlock))));
}
}
#endif // CRYPTOPP_SSSE3_AVAILABLE
NAMESPACE_END

aria.cpp
View File

@@ -7,175 +7,34 @@
#include "misc.h"
#include "cpu.h"
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#if CRYPTOPP_SSE2_AVAILABLE
# define CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS 1
#endif
#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE
#if CRYPTOPP_SSSE3_AVAILABLE
# define CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS 1
#endif
#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
# define CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS 1
#endif
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(ARIATab)
ANONYMOUS_NAMESPACE_BEGIN
extern const word32 S1[256];
extern const word32 S2[256];
extern const word32 X1[256];
extern const word32 X2[256];
extern const word32 KRK[3][4];
CRYPTOPP_ALIGN_DATA(16)
const CryptoPP::word32 S1[256]={
0x00636363,0x007c7c7c,0x00777777,0x007b7b7b,0x00f2f2f2,0x006b6b6b,0x006f6f6f,0x00c5c5c5,
0x00303030,0x00010101,0x00676767,0x002b2b2b,0x00fefefe,0x00d7d7d7,0x00ababab,0x00767676,
0x00cacaca,0x00828282,0x00c9c9c9,0x007d7d7d,0x00fafafa,0x00595959,0x00474747,0x00f0f0f0,
0x00adadad,0x00d4d4d4,0x00a2a2a2,0x00afafaf,0x009c9c9c,0x00a4a4a4,0x00727272,0x00c0c0c0,
0x00b7b7b7,0x00fdfdfd,0x00939393,0x00262626,0x00363636,0x003f3f3f,0x00f7f7f7,0x00cccccc,
0x00343434,0x00a5a5a5,0x00e5e5e5,0x00f1f1f1,0x00717171,0x00d8d8d8,0x00313131,0x00151515,
0x00040404,0x00c7c7c7,0x00232323,0x00c3c3c3,0x00181818,0x00969696,0x00050505,0x009a9a9a,
0x00070707,0x00121212,0x00808080,0x00e2e2e2,0x00ebebeb,0x00272727,0x00b2b2b2,0x00757575,
0x00090909,0x00838383,0x002c2c2c,0x001a1a1a,0x001b1b1b,0x006e6e6e,0x005a5a5a,0x00a0a0a0,
0x00525252,0x003b3b3b,0x00d6d6d6,0x00b3b3b3,0x00292929,0x00e3e3e3,0x002f2f2f,0x00848484,
0x00535353,0x00d1d1d1,0x00000000,0x00ededed,0x00202020,0x00fcfcfc,0x00b1b1b1,0x005b5b5b,
0x006a6a6a,0x00cbcbcb,0x00bebebe,0x00393939,0x004a4a4a,0x004c4c4c,0x00585858,0x00cfcfcf,
0x00d0d0d0,0x00efefef,0x00aaaaaa,0x00fbfbfb,0x00434343,0x004d4d4d,0x00333333,0x00858585,
0x00454545,0x00f9f9f9,0x00020202,0x007f7f7f,0x00505050,0x003c3c3c,0x009f9f9f,0x00a8a8a8,
0x00515151,0x00a3a3a3,0x00404040,0x008f8f8f,0x00929292,0x009d9d9d,0x00383838,0x00f5f5f5,
0x00bcbcbc,0x00b6b6b6,0x00dadada,0x00212121,0x00101010,0x00ffffff,0x00f3f3f3,0x00d2d2d2,
0x00cdcdcd,0x000c0c0c,0x00131313,0x00ececec,0x005f5f5f,0x00979797,0x00444444,0x00171717,
0x00c4c4c4,0x00a7a7a7,0x007e7e7e,0x003d3d3d,0x00646464,0x005d5d5d,0x00191919,0x00737373,
0x00606060,0x00818181,0x004f4f4f,0x00dcdcdc,0x00222222,0x002a2a2a,0x00909090,0x00888888,
0x00464646,0x00eeeeee,0x00b8b8b8,0x00141414,0x00dedede,0x005e5e5e,0x000b0b0b,0x00dbdbdb,
0x00e0e0e0,0x00323232,0x003a3a3a,0x000a0a0a,0x00494949,0x00060606,0x00242424,0x005c5c5c,
0x00c2c2c2,0x00d3d3d3,0x00acacac,0x00626262,0x00919191,0x00959595,0x00e4e4e4,0x00797979,
0x00e7e7e7,0x00c8c8c8,0x00373737,0x006d6d6d,0x008d8d8d,0x00d5d5d5,0x004e4e4e,0x00a9a9a9,
0x006c6c6c,0x00565656,0x00f4f4f4,0x00eaeaea,0x00656565,0x007a7a7a,0x00aeaeae,0x00080808,
0x00bababa,0x00787878,0x00252525,0x002e2e2e,0x001c1c1c,0x00a6a6a6,0x00b4b4b4,0x00c6c6c6,
0x00e8e8e8,0x00dddddd,0x00747474,0x001f1f1f,0x004b4b4b,0x00bdbdbd,0x008b8b8b,0x008a8a8a,
0x00707070,0x003e3e3e,0x00b5b5b5,0x00666666,0x00484848,0x00030303,0x00f6f6f6,0x000e0e0e,
0x00616161,0x00353535,0x00575757,0x00b9b9b9,0x00868686,0x00c1c1c1,0x001d1d1d,0x009e9e9e,
0x00e1e1e1,0x00f8f8f8,0x00989898,0x00111111,0x00696969,0x00d9d9d9,0x008e8e8e,0x00949494,
0x009b9b9b,0x001e1e1e,0x00878787,0x00e9e9e9,0x00cecece,0x00555555,0x00282828,0x00dfdfdf,
0x008c8c8c,0x00a1a1a1,0x00898989,0x000d0d0d,0x00bfbfbf,0x00e6e6e6,0x00424242,0x00686868,
0x00414141,0x00999999,0x002d2d2d,0x000f0f0f,0x00b0b0b0,0x00545454,0x00bbbbbb,0x00161616
};
CRYPTOPP_ALIGN_DATA(16)
const CryptoPP::word32 S2[256]={
0xe200e2e2,0x4e004e4e,0x54005454,0xfc00fcfc,0x94009494,0xc200c2c2,0x4a004a4a,0xcc00cccc,
0x62006262,0x0d000d0d,0x6a006a6a,0x46004646,0x3c003c3c,0x4d004d4d,0x8b008b8b,0xd100d1d1,
0x5e005e5e,0xfa00fafa,0x64006464,0xcb00cbcb,0xb400b4b4,0x97009797,0xbe00bebe,0x2b002b2b,
0xbc00bcbc,0x77007777,0x2e002e2e,0x03000303,0xd300d3d3,0x19001919,0x59005959,0xc100c1c1,
0x1d001d1d,0x06000606,0x41004141,0x6b006b6b,0x55005555,0xf000f0f0,0x99009999,0x69006969,
0xea00eaea,0x9c009c9c,0x18001818,0xae00aeae,0x63006363,0xdf00dfdf,0xe700e7e7,0xbb00bbbb,
0x00000000,0x73007373,0x66006666,0xfb00fbfb,0x96009696,0x4c004c4c,0x85008585,0xe400e4e4,
0x3a003a3a,0x09000909,0x45004545,0xaa00aaaa,0x0f000f0f,0xee00eeee,0x10001010,0xeb00ebeb,
0x2d002d2d,0x7f007f7f,0xf400f4f4,0x29002929,0xac00acac,0xcf00cfcf,0xad00adad,0x91009191,
0x8d008d8d,0x78007878,0xc800c8c8,0x95009595,0xf900f9f9,0x2f002f2f,0xce00cece,0xcd00cdcd,
0x08000808,0x7a007a7a,0x88008888,0x38003838,0x5c005c5c,0x83008383,0x2a002a2a,0x28002828,
0x47004747,0xdb00dbdb,0xb800b8b8,0xc700c7c7,0x93009393,0xa400a4a4,0x12001212,0x53005353,
0xff00ffff,0x87008787,0x0e000e0e,0x31003131,0x36003636,0x21002121,0x58005858,0x48004848,
0x01000101,0x8e008e8e,0x37003737,0x74007474,0x32003232,0xca00caca,0xe900e9e9,0xb100b1b1,
0xb700b7b7,0xab00abab,0x0c000c0c,0xd700d7d7,0xc400c4c4,0x56005656,0x42004242,0x26002626,
0x07000707,0x98009898,0x60006060,0xd900d9d9,0xb600b6b6,0xb900b9b9,0x11001111,0x40004040,
0xec00ecec,0x20002020,0x8c008c8c,0xbd00bdbd,0xa000a0a0,0xc900c9c9,0x84008484,0x04000404,
0x49004949,0x23002323,0xf100f1f1,0x4f004f4f,0x50005050,0x1f001f1f,0x13001313,0xdc00dcdc,
0xd800d8d8,0xc000c0c0,0x9e009e9e,0x57005757,0xe300e3e3,0xc300c3c3,0x7b007b7b,0x65006565,
0x3b003b3b,0x02000202,0x8f008f8f,0x3e003e3e,0xe800e8e8,0x25002525,0x92009292,0xe500e5e5,
0x15001515,0xdd00dddd,0xfd00fdfd,0x17001717,0xa900a9a9,0xbf00bfbf,0xd400d4d4,0x9a009a9a,
0x7e007e7e,0xc500c5c5,0x39003939,0x67006767,0xfe00fefe,0x76007676,0x9d009d9d,0x43004343,
0xa700a7a7,0xe100e1e1,0xd000d0d0,0xf500f5f5,0x68006868,0xf200f2f2,0x1b001b1b,0x34003434,
0x70007070,0x05000505,0xa300a3a3,0x8a008a8a,0xd500d5d5,0x79007979,0x86008686,0xa800a8a8,
0x30003030,0xc600c6c6,0x51005151,0x4b004b4b,0x1e001e1e,0xa600a6a6,0x27002727,0xf600f6f6,
0x35003535,0xd200d2d2,0x6e006e6e,0x24002424,0x16001616,0x82008282,0x5f005f5f,0xda00dada,
0xe600e6e6,0x75007575,0xa200a2a2,0xef00efef,0x2c002c2c,0xb200b2b2,0x1c001c1c,0x9f009f9f,
0x5d005d5d,0x6f006f6f,0x80008080,0x0a000a0a,0x72007272,0x44004444,0x9b009b9b,0x6c006c6c,
0x90009090,0x0b000b0b,0x5b005b5b,0x33003333,0x7d007d7d,0x5a005a5a,0x52005252,0xf300f3f3,
0x61006161,0xa100a1a1,0xf700f7f7,0xb000b0b0,0xd600d6d6,0x3f003f3f,0x7c007c7c,0x6d006d6d,
0xed00eded,0x14001414,0xe000e0e0,0xa500a5a5,0x3d003d3d,0x22002222,0xb300b3b3,0xf800f8f8,
0x89008989,0xde00dede,0x71007171,0x1a001a1a,0xaf00afaf,0xba00baba,0xb500b5b5,0x81008181
};
CRYPTOPP_ALIGN_DATA(16)
const CryptoPP::word32 X1[256]={
0x52520052,0x09090009,0x6a6a006a,0xd5d500d5,0x30300030,0x36360036,0xa5a500a5,0x38380038,
0xbfbf00bf,0x40400040,0xa3a300a3,0x9e9e009e,0x81810081,0xf3f300f3,0xd7d700d7,0xfbfb00fb,
0x7c7c007c,0xe3e300e3,0x39390039,0x82820082,0x9b9b009b,0x2f2f002f,0xffff00ff,0x87870087,
0x34340034,0x8e8e008e,0x43430043,0x44440044,0xc4c400c4,0xdede00de,0xe9e900e9,0xcbcb00cb,
0x54540054,0x7b7b007b,0x94940094,0x32320032,0xa6a600a6,0xc2c200c2,0x23230023,0x3d3d003d,
0xeeee00ee,0x4c4c004c,0x95950095,0x0b0b000b,0x42420042,0xfafa00fa,0xc3c300c3,0x4e4e004e,
0x08080008,0x2e2e002e,0xa1a100a1,0x66660066,0x28280028,0xd9d900d9,0x24240024,0xb2b200b2,
0x76760076,0x5b5b005b,0xa2a200a2,0x49490049,0x6d6d006d,0x8b8b008b,0xd1d100d1,0x25250025,
0x72720072,0xf8f800f8,0xf6f600f6,0x64640064,0x86860086,0x68680068,0x98980098,0x16160016,
0xd4d400d4,0xa4a400a4,0x5c5c005c,0xcccc00cc,0x5d5d005d,0x65650065,0xb6b600b6,0x92920092,
0x6c6c006c,0x70700070,0x48480048,0x50500050,0xfdfd00fd,0xeded00ed,0xb9b900b9,0xdada00da,
0x5e5e005e,0x15150015,0x46460046,0x57570057,0xa7a700a7,0x8d8d008d,0x9d9d009d,0x84840084,
0x90900090,0xd8d800d8,0xabab00ab,0x00000000,0x8c8c008c,0xbcbc00bc,0xd3d300d3,0x0a0a000a,
0xf7f700f7,0xe4e400e4,0x58580058,0x05050005,0xb8b800b8,0xb3b300b3,0x45450045,0x06060006,
0xd0d000d0,0x2c2c002c,0x1e1e001e,0x8f8f008f,0xcaca00ca,0x3f3f003f,0x0f0f000f,0x02020002,
0xc1c100c1,0xafaf00af,0xbdbd00bd,0x03030003,0x01010001,0x13130013,0x8a8a008a,0x6b6b006b,
0x3a3a003a,0x91910091,0x11110011,0x41410041,0x4f4f004f,0x67670067,0xdcdc00dc,0xeaea00ea,
0x97970097,0xf2f200f2,0xcfcf00cf,0xcece00ce,0xf0f000f0,0xb4b400b4,0xe6e600e6,0x73730073,
0x96960096,0xacac00ac,0x74740074,0x22220022,0xe7e700e7,0xadad00ad,0x35350035,0x85850085,
0xe2e200e2,0xf9f900f9,0x37370037,0xe8e800e8,0x1c1c001c,0x75750075,0xdfdf00df,0x6e6e006e,
0x47470047,0xf1f100f1,0x1a1a001a,0x71710071,0x1d1d001d,0x29290029,0xc5c500c5,0x89890089,
0x6f6f006f,0xb7b700b7,0x62620062,0x0e0e000e,0xaaaa00aa,0x18180018,0xbebe00be,0x1b1b001b,
0xfcfc00fc,0x56560056,0x3e3e003e,0x4b4b004b,0xc6c600c6,0xd2d200d2,0x79790079,0x20200020,
0x9a9a009a,0xdbdb00db,0xc0c000c0,0xfefe00fe,0x78780078,0xcdcd00cd,0x5a5a005a,0xf4f400f4,
0x1f1f001f,0xdddd00dd,0xa8a800a8,0x33330033,0x88880088,0x07070007,0xc7c700c7,0x31310031,
0xb1b100b1,0x12120012,0x10100010,0x59590059,0x27270027,0x80800080,0xecec00ec,0x5f5f005f,
0x60600060,0x51510051,0x7f7f007f,0xa9a900a9,0x19190019,0xb5b500b5,0x4a4a004a,0x0d0d000d,
0x2d2d002d,0xe5e500e5,0x7a7a007a,0x9f9f009f,0x93930093,0xc9c900c9,0x9c9c009c,0xefef00ef,
0xa0a000a0,0xe0e000e0,0x3b3b003b,0x4d4d004d,0xaeae00ae,0x2a2a002a,0xf5f500f5,0xb0b000b0,
0xc8c800c8,0xebeb00eb,0xbbbb00bb,0x3c3c003c,0x83830083,0x53530053,0x99990099,0x61610061,
0x17170017,0x2b2b002b,0x04040004,0x7e7e007e,0xbaba00ba,0x77770077,0xd6d600d6,0x26260026,
0xe1e100e1,0x69690069,0x14140014,0x63630063,0x55550055,0x21210021,0x0c0c000c,0x7d7d007d
};
CRYPTOPP_ALIGN_DATA(16)
const CryptoPP::word32 X2[256]={
0x30303000,0x68686800,0x99999900,0x1b1b1b00,0x87878700,0xb9b9b900,0x21212100,0x78787800,
0x50505000,0x39393900,0xdbdbdb00,0xe1e1e100,0x72727200,0x09090900,0x62626200,0x3c3c3c00,
0x3e3e3e00,0x7e7e7e00,0x5e5e5e00,0x8e8e8e00,0xf1f1f100,0xa0a0a000,0xcccccc00,0xa3a3a300,
0x2a2a2a00,0x1d1d1d00,0xfbfbfb00,0xb6b6b600,0xd6d6d600,0x20202000,0xc4c4c400,0x8d8d8d00,
0x81818100,0x65656500,0xf5f5f500,0x89898900,0xcbcbcb00,0x9d9d9d00,0x77777700,0xc6c6c600,
0x57575700,0x43434300,0x56565600,0x17171700,0xd4d4d400,0x40404000,0x1a1a1a00,0x4d4d4d00,
0xc0c0c000,0x63636300,0x6c6c6c00,0xe3e3e300,0xb7b7b700,0xc8c8c800,0x64646400,0x6a6a6a00,
0x53535300,0xaaaaaa00,0x38383800,0x98989800,0x0c0c0c00,0xf4f4f400,0x9b9b9b00,0xededed00,
0x7f7f7f00,0x22222200,0x76767600,0xafafaf00,0xdddddd00,0x3a3a3a00,0x0b0b0b00,0x58585800,
0x67676700,0x88888800,0x06060600,0xc3c3c300,0x35353500,0x0d0d0d00,0x01010100,0x8b8b8b00,
0x8c8c8c00,0xc2c2c200,0xe6e6e600,0x5f5f5f00,0x02020200,0x24242400,0x75757500,0x93939300,
0x66666600,0x1e1e1e00,0xe5e5e500,0xe2e2e200,0x54545400,0xd8d8d800,0x10101000,0xcecece00,
0x7a7a7a00,0xe8e8e800,0x08080800,0x2c2c2c00,0x12121200,0x97979700,0x32323200,0xababab00,
0xb4b4b400,0x27272700,0x0a0a0a00,0x23232300,0xdfdfdf00,0xefefef00,0xcacaca00,0xd9d9d900,
0xb8b8b800,0xfafafa00,0xdcdcdc00,0x31313100,0x6b6b6b00,0xd1d1d100,0xadadad00,0x19191900,
0x49494900,0xbdbdbd00,0x51515100,0x96969600,0xeeeeee00,0xe4e4e400,0xa8a8a800,0x41414100,
0xdadada00,0xffffff00,0xcdcdcd00,0x55555500,0x86868600,0x36363600,0xbebebe00,0x61616100,
0x52525200,0xf8f8f800,0xbbbbbb00,0x0e0e0e00,0x82828200,0x48484800,0x69696900,0x9a9a9a00,
0xe0e0e000,0x47474700,0x9e9e9e00,0x5c5c5c00,0x04040400,0x4b4b4b00,0x34343400,0x15151500,
0x79797900,0x26262600,0xa7a7a700,0xdedede00,0x29292900,0xaeaeae00,0x92929200,0xd7d7d700,
0x84848400,0xe9e9e900,0xd2d2d200,0xbababa00,0x5d5d5d00,0xf3f3f300,0xc5c5c500,0xb0b0b000,
0xbfbfbf00,0xa4a4a400,0x3b3b3b00,0x71717100,0x44444400,0x46464600,0x2b2b2b00,0xfcfcfc00,
0xebebeb00,0x6f6f6f00,0xd5d5d500,0xf6f6f600,0x14141400,0xfefefe00,0x7c7c7c00,0x70707000,
0x5a5a5a00,0x7d7d7d00,0xfdfdfd00,0x2f2f2f00,0x18181800,0x83838300,0x16161600,0xa5a5a500,
0x91919100,0x1f1f1f00,0x05050500,0x95959500,0x74747400,0xa9a9a900,0xc1c1c100,0x5b5b5b00,
0x4a4a4a00,0x85858500,0x6d6d6d00,0x13131300,0x07070700,0x4f4f4f00,0x4e4e4e00,0x45454500,
0xb2b2b200,0x0f0f0f00,0xc9c9c900,0x1c1c1c00,0xa6a6a600,0xbcbcbc00,0xececec00,0x73737300,
0x90909000,0x7b7b7b00,0xcfcfcf00,0x59595900,0x8f8f8f00,0xa1a1a100,0xf9f9f900,0x2d2d2d00,
0xf2f2f200,0xb1b1b100,0x00000000,0x94949400,0x37373700,0x9f9f9f00,0xd0d0d000,0x2e2e2e00,
0x9c9c9c00,0x6e6e6e00,0x28282800,0x3f3f3f00,0x80808000,0xf0f0f000,0x3d3d3d00,0xd3d3d300,
0x25252500,0x8a8a8a00,0xb5b5b500,0xe7e7e700,0x42424200,0xb3b3b300,0xc7c7c700,0xeaeaea00,
0xf7f7f700,0x4c4c4c00,0x11111100,0x33333300,0x03030300,0xa2a2a200,0xacacac00,0x60606000
};
CRYPTOPP_ALIGN_DATA(16)
const CryptoPP::word32 KRK[3][4] = {
{0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0},
{0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0},
{0xdb92371d, 0x2126e970, 0x03249775, 0x04e8c90e}
};
ANONYMOUS_NAMESPACE_END
NAMESPACE_END
NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
using CryptoPP::ARIATab::S1;
using CryptoPP::ARIATab::S2;
using CryptoPP::ARIATab::X1;
using CryptoPP::ARIATab::X2;
using CryptoPP::ARIATab::KRK;
typedef BlockGetAndPut<word32, BigEndian, false, false> BigEndianBlock;
typedef BlockGetAndPut<word32, NativeByteOrder, true, true> NativeEndianBlock;
@@ -222,6 +81,15 @@ inline byte ARIA_BRF(const word32 x, const int y) {
#define ARIA_FO {SBL1_M(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])}
#define ARIA_FE {SBL2_M(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[2],t[3],t[0],t[1]) ARIA_MM(t[0],t[1],t[2],t[3])}
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
extern void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen);
extern void ARIA_ProcessAndXorBlock_Xor_NEON(const byte* xorBlock, byte* outBlock);
#endif
#if (CRYPTOPP_SSSE3_AVAILABLE)
extern void ARIA_ProcessAndXorBlock_Xor_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t);
#endif
// n-bit right shift of Y XORed to X
template <unsigned int N>
inline void ARIA_GSRK(const word32 X[4], const word32 Y[4], byte RK[16])
@@ -235,21 +103,6 @@ inline void ARIA_GSRK(const word32 X[4], const word32 Y[4], byte RK[16])
reinterpret_cast<word32*>(RK)[3] = (X[3]) ^ ((Y[(Q+3)%4])>>R) ^ ((Y[(Q+2)%4])<<(32-R));
}
#if CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS
template <unsigned int N>
inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16])
{
static const unsigned int Q1 = (4-(N/32)) % 4;
static const unsigned int Q2 = (3-(N/32)) % 4;
static const unsigned int R = N % 32;
vst1q_u32(reinterpret_cast<uint32_t*>(RK),
veorq_u32(X, veorq_u32(
vshrq_n_u32(vextq_u32(Y, Y, Q1), R),
vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R))));
}
#endif
void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const NameValuePairs &params)
{
CRYPTOPP_UNUSED(params);
@@ -280,144 +133,51 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
// w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t and u are 16 byte temp areas.
word32 *w0 = m_w.data(), *w1 = m_w.data()+8, *w2 = m_w.data()+12, *w3 = m_w.data()+16, *t = m_w.data()+20;
#if CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS
const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
if (HasSSSE3())
BigEndianBlock::Get(mk)(w0[0])(w0[1])(w0[2])(w0[3]);
t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1];
t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3];
ARIA_FO;
if (keylen == 32)
{
// 'mk' may be unaligned.
const __m128i w = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(mk)), MASK);
_mm_store_si128((__m128i*)w0, w);
_mm_store_si128((__m128i*)t, _mm_xor_si128(w, _mm_load_si128((const __m128i*)(KRK[q]))));
ARIA_FO;
if (keylen == 32)
{
// 'mk' may be unaligned.
_mm_store_si128(reinterpret_cast<__m128i*>(w1),
_mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(mk+16)), MASK));
}
else if (keylen == 24)
{
BigEndianBlock::Get(mk+16)(w1[0])(w1[1]);
w1[2] = w1[3] = 0;
}
else
{
w1[0]=w1[1]=w1[2]=w1[3]=0;
}
BigEndianBlock::Get(mk+16)(w1[0])(w1[1])(w1[2])(w1[3]);
}
else if (keylen == 24)
{
BigEndianBlock::Get(mk+16)(w1[0])(w1[1]);
w1[2] = w1[3] = 0;
}
else
#endif // CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS
{
BigEndianBlock::Get(mk)(w0[0])(w0[1])(w0[2])(w0[3]);
t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1];
t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3];
ARIA_FO;
if (keylen == 32)
{
BigEndianBlock::Get(mk+16)(w1[0])(w1[1])(w1[2])(w1[3]);
}
else if (keylen == 24)
{
BigEndianBlock::Get(mk+16)(w1[0])(w1[1]);
w1[2] = w1[3] = 0;
}
else
{
w1[0]=w1[1]=w1[2]=w1[3]=0;
}
w1[0]=w1[1]=w1[2]=w1[3]=0;
}
#if CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS
if (HasSSE2())
{
const __m128i x = _mm_xor_si128(
_mm_load_si128((const __m128i*)(w1)),
_mm_load_si128((const __m128i*)(t)));
_mm_store_si128((__m128i*)(w1), x);
w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
::memcpy(t, w1, 16);
q = (q==2) ? 0 : (q+1);
_mm_store_si128((__m128i*)(t), _mm_xor_si128(x,
_mm_load_si128((const __m128i*)(KRK[q]))));
q = (q==2) ? 0 : (q+1);
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
ARIA_FE;
ARIA_FE;
const __m128i y = _mm_xor_si128(
_mm_load_si128((const __m128i*)(w0)),
_mm_load_si128((const __m128i*)(t)));
_mm_store_si128((__m128i*)(w2), y);
t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
::memcpy(w2, t, 16);
q = (q==2) ? 0 : (q+1);
_mm_store_si128((__m128i*)(t), _mm_xor_si128(y,
_mm_load_si128((const __m128i*)(KRK[q]))));
q = (q==2) ? 0 : (q+1);
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
ARIA_FO;
ARIA_FO;
_mm_store_si128((__m128i*)(w3), _mm_xor_si128(
_mm_load_si128((const __m128i*)(w1)),
_mm_load_si128((const __m128i*)(t))));
}
else
#endif // CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS
{
w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
::memcpy(t, w1, 16);
w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
q = (q==2) ? 0 : (q+1);
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
ARIA_FE;
t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
::memcpy(w2, t, 16);
q = (q==2) ? 0 : (q+1);
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
ARIA_FO;
w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
}
#if CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS
#if CRYPTOPP_ARM_NEON_AVAILABLE
if (HasNEON())
{
const uint32x4_t w0 = vld1q_u32((const uint32_t*)(m_w.data()+0));
const uint32x4_t w1 = vld1q_u32((const uint32_t*)(m_w.data()+8));
const uint32x4_t w2 = vld1q_u32((const uint32_t*)(m_w.data()+12));
const uint32x4_t w3 = vld1q_u32((const uint32_t*)(m_w.data()+16));
ARIA_GSRK_NEON<19>(w0, w1, rk + 0);
ARIA_GSRK_NEON<19>(w1, w2, rk + 16);
ARIA_GSRK_NEON<19>(w2, w3, rk + 32);
ARIA_GSRK_NEON<19>(w3, w0, rk + 48);
ARIA_GSRK_NEON<31>(w0, w1, rk + 64);
ARIA_GSRK_NEON<31>(w1, w2, rk + 80);
ARIA_GSRK_NEON<31>(w2, w3, rk + 96);
ARIA_GSRK_NEON<31>(w3, w0, rk + 112);
ARIA_GSRK_NEON<67>(w0, w1, rk + 128);
ARIA_GSRK_NEON<67>(w1, w2, rk + 144);
ARIA_GSRK_NEON<67>(w2, w3, rk + 160);
ARIA_GSRK_NEON<67>(w3, w0, rk + 176);
ARIA_GSRK_NEON<97>(w0, w1, rk + 192);
if (keylen > 16)
{
ARIA_GSRK_NEON<97>(w1, w2, rk + 208);
ARIA_GSRK_NEON<97>(w2, w3, rk + 224);
if (keylen > 24)
{
ARIA_GSRK_NEON< 97>(w3, w0, rk + 240);
ARIA_GSRK_NEON<109>(w0, w1, rk + 256);
}
}
ARIA_UncheckedSetKey_Schedule_NEON(rk, m_w, keylen);
}
else
#endif // CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
{
ARIA_GSRK<19>(w0, w1, rk + 0);
ARIA_GSRK<19>(w1, w2, rk + 16);
@@ -453,53 +213,24 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
rk = m_rk.data();
r = R; q = Q;
#if CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS && !defined(__SUNPRO_CC)
if (HasSSE2())
a=reinterpret_cast<word32*>(rk); s=m_w.data()+24; z=a+r*4;
::memcpy(t, a, 16); ::memcpy(a, z, 16); ::memcpy(z, t, 16);
a+=4; z-=4;
for (; a<z; a+=4, z-=4)
{
a=reinterpret_cast<word32*>(rk); s=m_w.data()+24; z=a+r*4;
_mm_store_si128((__m128i*)t, _mm_load_si128((const __m128i*)a));
_mm_store_si128((__m128i*)a, _mm_load_si128((const __m128i*)z));
_mm_store_si128((__m128i*)z, _mm_load_si128((const __m128i*)t));
a+=4; z-=4;
for (; a<z; a+=4, z-=4)
{
ARIA_M(a[0],t[0]); ARIA_M(a[1],t[1]); ARIA_M(a[2],t[2]); ARIA_M(a[3],t[3]);
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
_mm_store_si128((__m128i*)s, _mm_load_si128((const __m128i*)t));
ARIA_M(z[0],t[0]); ARIA_M(z[1],t[1]); ARIA_M(z[2],t[2]); ARIA_M(z[3],t[3]);
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
_mm_store_si128((__m128i*)a, _mm_load_si128((const __m128i*)t));
_mm_store_si128((__m128i*)z, _mm_load_si128((const __m128i*)s));
}
ARIA_M(a[0],t[0]); ARIA_M(a[1],t[1]); ARIA_M(a[2],t[2]); ARIA_M(a[3],t[3]);
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
_mm_store_si128((__m128i*)z, _mm_load_si128((const __m128i*)t));
}
else
#endif
{
a=reinterpret_cast<word32*>(rk); s=m_w.data()+24; z=a+r*4;
::memcpy(t, a, 16); ::memcpy(a, z, 16); ::memcpy(z, t, 16);
::memcpy(s, t, 16);
a+=4; z-=4;
for (; a<z; a+=4, z-=4)
{
ARIA_M(a[0],t[0]); ARIA_M(a[1],t[1]); ARIA_M(a[2],t[2]); ARIA_M(a[3],t[3]);
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
::memcpy(s, t, 16);
ARIA_M(z[0],t[0]); ARIA_M(z[1],t[1]); ARIA_M(z[2],t[2]); ARIA_M(z[3],t[3]);
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
::memcpy(a, t, 16); ::memcpy(z, s, 16);
}
ARIA_M(a[0],t[0]); ARIA_M(a[1],t[1]); ARIA_M(a[2],t[2]); ARIA_M(a[3],t[3]);
ARIA_M(z[0],t[0]); ARIA_M(z[1],t[1]); ARIA_M(z[2],t[2]); ARIA_M(z[3],t[3]);
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
::memcpy(z, t, 16);
::memcpy(a, t, 16); ::memcpy(z, s, 16);
}
ARIA_M(a[0],t[0]); ARIA_M(a[1],t[1]); ARIA_M(a[2],t[2]); ARIA_M(a[3],t[3]);
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
::memcpy(z, t, 16);
}
}
@@ -540,39 +271,9 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b
#ifdef IS_LITTLE_ENDIAN
# if CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS
const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
if (HasSSSE3())
{
outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
// 'outBlock' may be unaligned.
_mm_storeu_si128(reinterpret_cast<__m128i*>(outBlock),
_mm_xor_si128(_mm_loadu_si128((const __m128i*)(outBlock)),
_mm_shuffle_epi8(_mm_load_si128((const __m128i*)(rk)), MASK)));
// 'outBlock' and 'xorBlock' may be unaligned.
if (xorBlock != NULLPTR)
{
_mm_storeu_si128((__m128i*)(outBlock),
_mm_xor_si128(
_mm_loadu_si128((const __m128i*)(outBlock)),
_mm_loadu_si128((const __m128i*)(xorBlock))));
}
ARIA_ProcessAndXorBlock_Xor_SSSE3(xorBlock, outBlock, rk, t);
return;
}
else
@@ -617,22 +318,17 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b
BigEndianBlock::Put(rk, t)(t[0])(t[1])(t[2])(t[3]);
#endif
#if CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS
#if CRYPTOPP_ARM_NEON_AVAILABLE
if (HasNEON())
{
if (xorBlock != NULLPTR)
{
vst1q_u32(reinterpret_cast<uint32_t*>(outBlock),
veorq_u32(
vld1q_u32((const uint32_t*)outBlock),
vld1q_u32((const uint32_t*)xorBlock)));
}
ARIA_ProcessAndXorBlock_Xor_NEON(xorBlock, outBlock);
}
else
#endif // CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
{
if (xorBlock != NULLPTR)
for (unsigned int n=0; n<16; ++n)
for (unsigned int n=0; n<ARIA::BLOCKSIZE; ++n)
outBlock[n] ^= xorBlock[n];
}
}

ariatab.cpp Normal file
View File

@@ -0,0 +1,168 @@
// ariatab.cpp - written and placed in the public domain by Jeffrey Walton
#include "pch.h"
#include "config.h"
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(ARIATab)
#define EXPORT_TABLE extern
CRYPTOPP_ALIGN_DATA(16)
EXPORT_TABLE
const word32 S1[256]={
0x00636363,0x007c7c7c,0x00777777,0x007b7b7b,0x00f2f2f2,0x006b6b6b,0x006f6f6f,0x00c5c5c5,
0x00303030,0x00010101,0x00676767,0x002b2b2b,0x00fefefe,0x00d7d7d7,0x00ababab,0x00767676,
0x00cacaca,0x00828282,0x00c9c9c9,0x007d7d7d,0x00fafafa,0x00595959,0x00474747,0x00f0f0f0,
0x00adadad,0x00d4d4d4,0x00a2a2a2,0x00afafaf,0x009c9c9c,0x00a4a4a4,0x00727272,0x00c0c0c0,
0x00b7b7b7,0x00fdfdfd,0x00939393,0x00262626,0x00363636,0x003f3f3f,0x00f7f7f7,0x00cccccc,
0x00343434,0x00a5a5a5,0x00e5e5e5,0x00f1f1f1,0x00717171,0x00d8d8d8,0x00313131,0x00151515,
0x00040404,0x00c7c7c7,0x00232323,0x00c3c3c3,0x00181818,0x00969696,0x00050505,0x009a9a9a,
0x00070707,0x00121212,0x00808080,0x00e2e2e2,0x00ebebeb,0x00272727,0x00b2b2b2,0x00757575,
0x00090909,0x00838383,0x002c2c2c,0x001a1a1a,0x001b1b1b,0x006e6e6e,0x005a5a5a,0x00a0a0a0,
0x00525252,0x003b3b3b,0x00d6d6d6,0x00b3b3b3,0x00292929,0x00e3e3e3,0x002f2f2f,0x00848484,
0x00535353,0x00d1d1d1,0x00000000,0x00ededed,0x00202020,0x00fcfcfc,0x00b1b1b1,0x005b5b5b,
0x006a6a6a,0x00cbcbcb,0x00bebebe,0x00393939,0x004a4a4a,0x004c4c4c,0x00585858,0x00cfcfcf,
0x00d0d0d0,0x00efefef,0x00aaaaaa,0x00fbfbfb,0x00434343,0x004d4d4d,0x00333333,0x00858585,
0x00454545,0x00f9f9f9,0x00020202,0x007f7f7f,0x00505050,0x003c3c3c,0x009f9f9f,0x00a8a8a8,
0x00515151,0x00a3a3a3,0x00404040,0x008f8f8f,0x00929292,0x009d9d9d,0x00383838,0x00f5f5f5,
0x00bcbcbc,0x00b6b6b6,0x00dadada,0x00212121,0x00101010,0x00ffffff,0x00f3f3f3,0x00d2d2d2,
0x00cdcdcd,0x000c0c0c,0x00131313,0x00ececec,0x005f5f5f,0x00979797,0x00444444,0x00171717,
0x00c4c4c4,0x00a7a7a7,0x007e7e7e,0x003d3d3d,0x00646464,0x005d5d5d,0x00191919,0x00737373,
0x00606060,0x00818181,0x004f4f4f,0x00dcdcdc,0x00222222,0x002a2a2a,0x00909090,0x00888888,
0x00464646,0x00eeeeee,0x00b8b8b8,0x00141414,0x00dedede,0x005e5e5e,0x000b0b0b,0x00dbdbdb,
0x00e0e0e0,0x00323232,0x003a3a3a,0x000a0a0a,0x00494949,0x00060606,0x00242424,0x005c5c5c,
0x00c2c2c2,0x00d3d3d3,0x00acacac,0x00626262,0x00919191,0x00959595,0x00e4e4e4,0x00797979,
0x00e7e7e7,0x00c8c8c8,0x00373737,0x006d6d6d,0x008d8d8d,0x00d5d5d5,0x004e4e4e,0x00a9a9a9,
0x006c6c6c,0x00565656,0x00f4f4f4,0x00eaeaea,0x00656565,0x007a7a7a,0x00aeaeae,0x00080808,
0x00bababa,0x00787878,0x00252525,0x002e2e2e,0x001c1c1c,0x00a6a6a6,0x00b4b4b4,0x00c6c6c6,
0x00e8e8e8,0x00dddddd,0x00747474,0x001f1f1f,0x004b4b4b,0x00bdbdbd,0x008b8b8b,0x008a8a8a,
0x00707070,0x003e3e3e,0x00b5b5b5,0x00666666,0x00484848,0x00030303,0x00f6f6f6,0x000e0e0e,
0x00616161,0x00353535,0x00575757,0x00b9b9b9,0x00868686,0x00c1c1c1,0x001d1d1d,0x009e9e9e,
0x00e1e1e1,0x00f8f8f8,0x00989898,0x00111111,0x00696969,0x00d9d9d9,0x008e8e8e,0x00949494,
0x009b9b9b,0x001e1e1e,0x00878787,0x00e9e9e9,0x00cecece,0x00555555,0x00282828,0x00dfdfdf,
0x008c8c8c,0x00a1a1a1,0x00898989,0x000d0d0d,0x00bfbfbf,0x00e6e6e6,0x00424242,0x00686868,
0x00414141,0x00999999,0x002d2d2d,0x000f0f0f,0x00b0b0b0,0x00545454,0x00bbbbbb,0x00161616
};
CRYPTOPP_ALIGN_DATA(16)
EXPORT_TABLE
const word32 S2[256]={
0xe200e2e2,0x4e004e4e,0x54005454,0xfc00fcfc,0x94009494,0xc200c2c2,0x4a004a4a,0xcc00cccc,
0x62006262,0x0d000d0d,0x6a006a6a,0x46004646,0x3c003c3c,0x4d004d4d,0x8b008b8b,0xd100d1d1,
0x5e005e5e,0xfa00fafa,0x64006464,0xcb00cbcb,0xb400b4b4,0x97009797,0xbe00bebe,0x2b002b2b,
0xbc00bcbc,0x77007777,0x2e002e2e,0x03000303,0xd300d3d3,0x19001919,0x59005959,0xc100c1c1,
0x1d001d1d,0x06000606,0x41004141,0x6b006b6b,0x55005555,0xf000f0f0,0x99009999,0x69006969,
0xea00eaea,0x9c009c9c,0x18001818,0xae00aeae,0x63006363,0xdf00dfdf,0xe700e7e7,0xbb00bbbb,
0x00000000,0x73007373,0x66006666,0xfb00fbfb,0x96009696,0x4c004c4c,0x85008585,0xe400e4e4,
0x3a003a3a,0x09000909,0x45004545,0xaa00aaaa,0x0f000f0f,0xee00eeee,0x10001010,0xeb00ebeb,
0x2d002d2d,0x7f007f7f,0xf400f4f4,0x29002929,0xac00acac,0xcf00cfcf,0xad00adad,0x91009191,
0x8d008d8d,0x78007878,0xc800c8c8,0x95009595,0xf900f9f9,0x2f002f2f,0xce00cece,0xcd00cdcd,
0x08000808,0x7a007a7a,0x88008888,0x38003838,0x5c005c5c,0x83008383,0x2a002a2a,0x28002828,
0x47004747,0xdb00dbdb,0xb800b8b8,0xc700c7c7,0x93009393,0xa400a4a4,0x12001212,0x53005353,
0xff00ffff,0x87008787,0x0e000e0e,0x31003131,0x36003636,0x21002121,0x58005858,0x48004848,
0x01000101,0x8e008e8e,0x37003737,0x74007474,0x32003232,0xca00caca,0xe900e9e9,0xb100b1b1,
0xb700b7b7,0xab00abab,0x0c000c0c,0xd700d7d7,0xc400c4c4,0x56005656,0x42004242,0x26002626,
0x07000707,0x98009898,0x60006060,0xd900d9d9,0xb600b6b6,0xb900b9b9,0x11001111,0x40004040,
0xec00ecec,0x20002020,0x8c008c8c,0xbd00bdbd,0xa000a0a0,0xc900c9c9,0x84008484,0x04000404,
0x49004949,0x23002323,0xf100f1f1,0x4f004f4f,0x50005050,0x1f001f1f,0x13001313,0xdc00dcdc,
0xd800d8d8,0xc000c0c0,0x9e009e9e,0x57005757,0xe300e3e3,0xc300c3c3,0x7b007b7b,0x65006565,
0x3b003b3b,0x02000202,0x8f008f8f,0x3e003e3e,0xe800e8e8,0x25002525,0x92009292,0xe500e5e5,
0x15001515,0xdd00dddd,0xfd00fdfd,0x17001717,0xa900a9a9,0xbf00bfbf,0xd400d4d4,0x9a009a9a,
0x7e007e7e,0xc500c5c5,0x39003939,0x67006767,0xfe00fefe,0x76007676,0x9d009d9d,0x43004343,
0xa700a7a7,0xe100e1e1,0xd000d0d0,0xf500f5f5,0x68006868,0xf200f2f2,0x1b001b1b,0x34003434,
0x70007070,0x05000505,0xa300a3a3,0x8a008a8a,0xd500d5d5,0x79007979,0x86008686,0xa800a8a8,
0x30003030,0xc600c6c6,0x51005151,0x4b004b4b,0x1e001e1e,0xa600a6a6,0x27002727,0xf600f6f6,
0x35003535,0xd200d2d2,0x6e006e6e,0x24002424,0x16001616,0x82008282,0x5f005f5f,0xda00dada,
0xe600e6e6,0x75007575,0xa200a2a2,0xef00efef,0x2c002c2c,0xb200b2b2,0x1c001c1c,0x9f009f9f,
0x5d005d5d,0x6f006f6f,0x80008080,0x0a000a0a,0x72007272,0x44004444,0x9b009b9b,0x6c006c6c,
0x90009090,0x0b000b0b,0x5b005b5b,0x33003333,0x7d007d7d,0x5a005a5a,0x52005252,0xf300f3f3,
0x61006161,0xa100a1a1,0xf700f7f7,0xb000b0b0,0xd600d6d6,0x3f003f3f,0x7c007c7c,0x6d006d6d,
0xed00eded,0x14001414,0xe000e0e0,0xa500a5a5,0x3d003d3d,0x22002222,0xb300b3b3,0xf800f8f8,
0x89008989,0xde00dede,0x71007171,0x1a001a1a,0xaf00afaf,0xba00baba,0xb500b5b5,0x81008181
};
CRYPTOPP_ALIGN_DATA(16)
EXPORT_TABLE
const word32 X1[256]={
0x52520052,0x09090009,0x6a6a006a,0xd5d500d5,0x30300030,0x36360036,0xa5a500a5,0x38380038,
0xbfbf00bf,0x40400040,0xa3a300a3,0x9e9e009e,0x81810081,0xf3f300f3,0xd7d700d7,0xfbfb00fb,
0x7c7c007c,0xe3e300e3,0x39390039,0x82820082,0x9b9b009b,0x2f2f002f,0xffff00ff,0x87870087,
0x34340034,0x8e8e008e,0x43430043,0x44440044,0xc4c400c4,0xdede00de,0xe9e900e9,0xcbcb00cb,
0x54540054,0x7b7b007b,0x94940094,0x32320032,0xa6a600a6,0xc2c200c2,0x23230023,0x3d3d003d,
0xeeee00ee,0x4c4c004c,0x95950095,0x0b0b000b,0x42420042,0xfafa00fa,0xc3c300c3,0x4e4e004e,
0x08080008,0x2e2e002e,0xa1a100a1,0x66660066,0x28280028,0xd9d900d9,0x24240024,0xb2b200b2,
0x76760076,0x5b5b005b,0xa2a200a2,0x49490049,0x6d6d006d,0x8b8b008b,0xd1d100d1,0x25250025,
0x72720072,0xf8f800f8,0xf6f600f6,0x64640064,0x86860086,0x68680068,0x98980098,0x16160016,
0xd4d400d4,0xa4a400a4,0x5c5c005c,0xcccc00cc,0x5d5d005d,0x65650065,0xb6b600b6,0x92920092,
0x6c6c006c,0x70700070,0x48480048,0x50500050,0xfdfd00fd,0xeded00ed,0xb9b900b9,0xdada00da,
0x5e5e005e,0x15150015,0x46460046,0x57570057,0xa7a700a7,0x8d8d008d,0x9d9d009d,0x84840084,
0x90900090,0xd8d800d8,0xabab00ab,0x00000000,0x8c8c008c,0xbcbc00bc,0xd3d300d3,0x0a0a000a,
0xf7f700f7,0xe4e400e4,0x58580058,0x05050005,0xb8b800b8,0xb3b300b3,0x45450045,0x06060006,
0xd0d000d0,0x2c2c002c,0x1e1e001e,0x8f8f008f,0xcaca00ca,0x3f3f003f,0x0f0f000f,0x02020002,
0xc1c100c1,0xafaf00af,0xbdbd00bd,0x03030003,0x01010001,0x13130013,0x8a8a008a,0x6b6b006b,
0x3a3a003a,0x91910091,0x11110011,0x41410041,0x4f4f004f,0x67670067,0xdcdc00dc,0xeaea00ea,
0x97970097,0xf2f200f2,0xcfcf00cf,0xcece00ce,0xf0f000f0,0xb4b400b4,0xe6e600e6,0x73730073,
0x96960096,0xacac00ac,0x74740074,0x22220022,0xe7e700e7,0xadad00ad,0x35350035,0x85850085,
0xe2e200e2,0xf9f900f9,0x37370037,0xe8e800e8,0x1c1c001c,0x75750075,0xdfdf00df,0x6e6e006e,
0x47470047,0xf1f100f1,0x1a1a001a,0x71710071,0x1d1d001d,0x29290029,0xc5c500c5,0x89890089,
0x6f6f006f,0xb7b700b7,0x62620062,0x0e0e000e,0xaaaa00aa,0x18180018,0xbebe00be,0x1b1b001b,
0xfcfc00fc,0x56560056,0x3e3e003e,0x4b4b004b,0xc6c600c6,0xd2d200d2,0x79790079,0x20200020,
0x9a9a009a,0xdbdb00db,0xc0c000c0,0xfefe00fe,0x78780078,0xcdcd00cd,0x5a5a005a,0xf4f400f4,
0x1f1f001f,0xdddd00dd,0xa8a800a8,0x33330033,0x88880088,0x07070007,0xc7c700c7,0x31310031,
0xb1b100b1,0x12120012,0x10100010,0x59590059,0x27270027,0x80800080,0xecec00ec,0x5f5f005f,
0x60600060,0x51510051,0x7f7f007f,0xa9a900a9,0x19190019,0xb5b500b5,0x4a4a004a,0x0d0d000d,
0x2d2d002d,0xe5e500e5,0x7a7a007a,0x9f9f009f,0x93930093,0xc9c900c9,0x9c9c009c,0xefef00ef,
0xa0a000a0,0xe0e000e0,0x3b3b003b,0x4d4d004d,0xaeae00ae,0x2a2a002a,0xf5f500f5,0xb0b000b0,
0xc8c800c8,0xebeb00eb,0xbbbb00bb,0x3c3c003c,0x83830083,0x53530053,0x99990099,0x61610061,
0x17170017,0x2b2b002b,0x04040004,0x7e7e007e,0xbaba00ba,0x77770077,0xd6d600d6,0x26260026,
0xe1e100e1,0x69690069,0x14140014,0x63630063,0x55550055,0x21210021,0x0c0c000c,0x7d7d007d
};
CRYPTOPP_ALIGN_DATA(16)
EXPORT_TABLE
const word32 X2[256]={
0x30303000,0x68686800,0x99999900,0x1b1b1b00,0x87878700,0xb9b9b900,0x21212100,0x78787800,
0x50505000,0x39393900,0xdbdbdb00,0xe1e1e100,0x72727200,0x09090900,0x62626200,0x3c3c3c00,
0x3e3e3e00,0x7e7e7e00,0x5e5e5e00,0x8e8e8e00,0xf1f1f100,0xa0a0a000,0xcccccc00,0xa3a3a300,
0x2a2a2a00,0x1d1d1d00,0xfbfbfb00,0xb6b6b600,0xd6d6d600,0x20202000,0xc4c4c400,0x8d8d8d00,
0x81818100,0x65656500,0xf5f5f500,0x89898900,0xcbcbcb00,0x9d9d9d00,0x77777700,0xc6c6c600,
0x57575700,0x43434300,0x56565600,0x17171700,0xd4d4d400,0x40404000,0x1a1a1a00,0x4d4d4d00,
0xc0c0c000,0x63636300,0x6c6c6c00,0xe3e3e300,0xb7b7b700,0xc8c8c800,0x64646400,0x6a6a6a00,
0x53535300,0xaaaaaa00,0x38383800,0x98989800,0x0c0c0c00,0xf4f4f400,0x9b9b9b00,0xededed00,
0x7f7f7f00,0x22222200,0x76767600,0xafafaf00,0xdddddd00,0x3a3a3a00,0x0b0b0b00,0x58585800,
0x67676700,0x88888800,0x06060600,0xc3c3c300,0x35353500,0x0d0d0d00,0x01010100,0x8b8b8b00,
0x8c8c8c00,0xc2c2c200,0xe6e6e600,0x5f5f5f00,0x02020200,0x24242400,0x75757500,0x93939300,
0x66666600,0x1e1e1e00,0xe5e5e500,0xe2e2e200,0x54545400,0xd8d8d800,0x10101000,0xcecece00,
0x7a7a7a00,0xe8e8e800,0x08080800,0x2c2c2c00,0x12121200,0x97979700,0x32323200,0xababab00,
0xb4b4b400,0x27272700,0x0a0a0a00,0x23232300,0xdfdfdf00,0xefefef00,0xcacaca00,0xd9d9d900,
0xb8b8b800,0xfafafa00,0xdcdcdc00,0x31313100,0x6b6b6b00,0xd1d1d100,0xadadad00,0x19191900,
0x49494900,0xbdbdbd00,0x51515100,0x96969600,0xeeeeee00,0xe4e4e400,0xa8a8a800,0x41414100,
0xdadada00,0xffffff00,0xcdcdcd00,0x55555500,0x86868600,0x36363600,0xbebebe00,0x61616100,
0x52525200,0xf8f8f800,0xbbbbbb00,0x0e0e0e00,0x82828200,0x48484800,0x69696900,0x9a9a9a00,
0xe0e0e000,0x47474700,0x9e9e9e00,0x5c5c5c00,0x04040400,0x4b4b4b00,0x34343400,0x15151500,
0x79797900,0x26262600,0xa7a7a700,0xdedede00,0x29292900,0xaeaeae00,0x92929200,0xd7d7d700,
0x84848400,0xe9e9e900,0xd2d2d200,0xbababa00,0x5d5d5d00,0xf3f3f300,0xc5c5c500,0xb0b0b000,
0xbfbfbf00,0xa4a4a400,0x3b3b3b00,0x71717100,0x44444400,0x46464600,0x2b2b2b00,0xfcfcfc00,
0xebebeb00,0x6f6f6f00,0xd5d5d500,0xf6f6f600,0x14141400,0xfefefe00,0x7c7c7c00,0x70707000,
0x5a5a5a00,0x7d7d7d00,0xfdfdfd00,0x2f2f2f00,0x18181800,0x83838300,0x16161600,0xa5a5a500,
0x91919100,0x1f1f1f00,0x05050500,0x95959500,0x74747400,0xa9a9a900,0xc1c1c100,0x5b5b5b00,
0x4a4a4a00,0x85858500,0x6d6d6d00,0x13131300,0x07070700,0x4f4f4f00,0x4e4e4e00,0x45454500,
0xb2b2b200,0x0f0f0f00,0xc9c9c900,0x1c1c1c00,0xa6a6a600,0xbcbcbc00,0xececec00,0x73737300,
0x90909000,0x7b7b7b00,0xcfcfcf00,0x59595900,0x8f8f8f00,0xa1a1a100,0xf9f9f900,0x2d2d2d00,
0xf2f2f200,0xb1b1b100,0x00000000,0x94949400,0x37373700,0x9f9f9f00,0xd0d0d000,0x2e2e2e00,
0x9c9c9c00,0x6e6e6e00,0x28282800,0x3f3f3f00,0x80808000,0xf0f0f000,0x3d3d3d00,0xd3d3d300,
0x25252500,0x8a8a8a00,0xb5b5b500,0xe7e7e700,0x42424200,0xb3b3b300,0xc7c7c700,0xeaeaea00,
0xf7f7f700,0x4c4c4c00,0x11111100,0x33333300,0x03030300,0xa2a2a200,0xacacac00,0x60606000
};
CRYPTOPP_ALIGN_DATA(16)
EXPORT_TABLE
const word32 KRK[3][4] = {
{0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0},
{0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0},
{0xdb92371d, 0x2126e970, 0x03249775, 0x04e8c90e}
};
NAMESPACE_END
NAMESPACE_END

View File

@ -50,7 +50,7 @@ void AuthenticatedSymmetricCipherBase::SetKey(const byte *userKey, size_t keylen
m_bufferedDataLength = 0;
m_state = State_Start;
SetKeyWithoutResync(userKey, keylength, params);
this->SetKeyWithoutResync(userKey, keylength, params);
m_state = State_KeySet;
size_t length;

View File

@ -506,11 +506,11 @@ void Benchmark2(double t, double hertz)
std::cout << "\n<TBODY style=\"background: white;\">";
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if CRYPTOPP_AESNI_AVAILABLE
if (HasCLMUL())
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES)");
else
#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
#elif CRYPTOPP_ARM_PMULL_AVAILABLE
if (HasPMULL())
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES)");
else
@ -595,11 +595,11 @@ void Benchmark2(double t, double hertz)
std::cout << "\n<TBODY style=\"background: yellow;\">";
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if CRYPTOPP_AESNI_AVAILABLE
if (HasCLMUL())
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM");
else
#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
#elif CRYPTOPP_ARM_PMULL_AVAILABLE
if (HasPMULL())
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM");
else

2182
blake2-simd.cpp Normal file

File diff suppressed because it is too large

3617
blake2.cpp

File diff suppressed because it is too large

343
config.h
View File

@ -80,8 +80,15 @@
// #endif
// File system code to write to GZIP archive.
// http://www.gzip.org/format.txt
#if !defined(GZIP_OS_CODE)
# define GZIP_OS_CODE 0
# if defined(__macintosh__)
# define GZIP_OS_CODE 7
# elif defined(__unix__) || defined(__linux__)
# define GZIP_OS_CODE 3
# else
# define GZIP_OS_CODE 0
# endif
#endif
// Try this if your CPU has 256K internal cache or a slow multiply instruction
@ -386,155 +393,7 @@ NAMESPACE_END
#define CRYPTOPP_UNCAUGHT_EXCEPTION_AVAILABLE
#endif
// Apple's Clang prior to 5.0 cannot handle SSE2 (and Apple does not use LLVM Clang numbering...)
#if defined(CRYPTOPP_APPLE_CLANG_VERSION) && (CRYPTOPP_APPLE_CLANG_VERSION < 50000)
# define CRYPTOPP_DISABLE_ASM
#endif
// Sun Studio 12 provides GCC inline assembly, http://blogs.oracle.com/x86be/entry/gcc_style_asm_inlining_support
// We can enable SSE2 for Sun Studio in the makefile with -D__SSE2__, but users may not compile with it.
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(__SSE2__) && defined(__x86_64__) && (__SUNPRO_CC >= 0x5100)
# define __SSE2__ 1
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && ((defined(_MSC_VER) && defined(_M_IX86)) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))))
// C++Builder 2010 does not allow "call label" where label is defined within inline assembly
#define CRYPTOPP_X86_ASM_AVAILABLE
#if !defined(CRYPTOPP_DISABLE_SSE2) && (defined(_MSC_VER) || CRYPTOPP_GCC_VERSION >= 30300 || defined(__SSE2__))
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 1
#else
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
#endif
#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSE3__) && defined(__SSSE3__)))
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 1
#else
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
#endif
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && defined(_MSC_VER) && defined(_M_X64)
#define CRYPTOPP_X64_MASM_AVAILABLE
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && defined(__GNUC__) && defined(__x86_64__)
#define CRYPTOPP_X64_ASM_AVAILABLE
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && (defined(_MSC_VER) || defined(__SSE2__)) && !defined(_M_ARM)
#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 1
#else
#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SSSE3) && !defined(_M_ARM) && (_MSC_VER >= 1500 || (defined(__SSSE3__) && defined(__SSSE3__)))
#define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 1
#else
#define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 0
#endif
// Intrinsics available in GCC 4.3 (http://gcc.gnu.org/gcc-4.3/changes.html) and
// MSVC 2008 (http://msdn.microsoft.com/en-us/library/bb892950%28v=vs.90%29.aspx)
// SunCC could generate SSE4 at 12.1, but the intrinsics are missing until 12.4.
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SSE4) && !defined(_M_ARM) && ((_MSC_VER >= 1500) || (defined(__SSE4_1__) && defined(__SSE4_2__)))
#define CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 1
#else
#define CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 0
#endif
// Don't disgorge AES-NI from CLMUL. There will be two to four subtle breaks
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_AESNI) && !defined(_M_ARM) && (_MSC_FULL_VER >= 150030729 || __INTEL_COMPILER >= 1110 || (defined(__AES__) && defined(__PCLMUL__)))
#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 1
#else
#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHA) && !defined(_M_ARM) && ((_MSC_VER >= 1900) || defined(__SHA__))
#define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 1
#else
#define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 0
#endif
// Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
#if !defined(CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
# if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM)
# define CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE 1
# endif
#endif
// Requires ARMv8 and ACLE 2.0. GCC requires 4.8 and above.
// LLVM Clang requires 3.5. Apple Clang is unknown at the moment.
// Microsoft plans to support ARM-64, but it's not clear how to detect it.
// TODO: Add MSC_VER and ARM-64 platform define when available
#if !defined(CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
# if defined(__ARM_FEATURE_CRC32)
# define CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE 1
# endif
#endif
// Requires ARMv8, ACLE 2.0 and Aarch64. GCC requires 4.8 and above.
// LLVM Clang requires 3.5. Apple Clang does not support it at the moment.
// Microsoft plans to support ARM-64, but it's not clear how to detect it.
// TODO: Add MSC_VER and ARM-64 platform define when available
#if !defined(CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
# if defined(__ARM_FEATURE_CRYPTO) && !defined(__apple_build_version__)
# if defined(__arm64__) || defined(__aarch64__)
# define CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE 1
# endif
# endif
#endif
// Requires ARMv8 and ACLE 2.0. GCC requires 4.8 and above.
// LLVM Clang requires 3.5. Apple Clang is unknown at the moment.
// Microsoft plans to support ARM-64, but it's not clear how to detect it.
// TODO: Add MSC_VER and ARM-64 platform define when available
#if !defined(CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_MSC_VERSION >= 1910)
# define CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE 1
# endif
#endif
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
#define CRYPTOPP_BOOL_ALIGN16 1
#else
#define CRYPTOPP_BOOL_ALIGN16 0
#endif
// how to allocate 16-byte aligned memory (for SSE2)
#if defined(_MSC_VER)
#define CRYPTOPP_MM_MALLOC_AVAILABLE
#elif defined(__APPLE__)
#define CRYPTOPP_APPLE_MALLOC_AVAILABLE
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
#define CRYPTOPP_MALLOC_ALIGNMENT_IS_16
#elif defined(__linux__) || defined(__sun__) || defined(__CYGWIN__)
#define CRYPTOPP_MEMALIGN_AVAILABLE
#else
#define CRYPTOPP_NO_ALIGNED_ALLOC
#endif
// Apple always provides 16-byte aligned memory, and tells us to use calloc
// http://developer.apple.com/library/mac/documentation/Performance/Conceptual/ManagingMemory/Articles/MemoryAlloc.html
// how to disable inlining
#if defined(_MSC_VER)
# define CRYPTOPP_NOINLINE_DOTDOTDOT
# define CRYPTOPP_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
# define CRYPTOPP_NOINLINE_DOTDOTDOT
# define CRYPTOPP_NOINLINE __attribute__((noinline))
#else
# define CRYPTOPP_NOINLINE_DOTDOTDOT ...
# define CRYPTOPP_NOINLINE
#endif
// How to declare class constants
#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
# define CRYPTOPP_CONSTANT(x) static const int x;
#else
# define CRYPTOPP_CONSTANT(x) enum {x};
#endif
// ***************** Platform and CPU features ********************
// Linux provides X32, which is 32-bit integers, longs and pointers on x86_64 using the full x86_64 register set.
// Detect via __ILP32__ (http://wiki.debian.org/X32Port). However, __ILP32__ shows up in more places than
@ -579,6 +438,190 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_ARM64 0
#endif
#if defined(_MSC_VER) || defined(__BORLANDC__)
# define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY 1
#else
# define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY 1
#endif
// ***************** IA32 CPU features ********************
#if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64)
// Apple Clang prior to 5.0 cannot handle SSE2
#if defined(CRYPTOPP_APPLE_CLANG_VERSION) && (CRYPTOPP_APPLE_CLANG_VERSION < 50000)
# define CRYPTOPP_DISABLE_ASM
#endif
// Sun Studio 12 provides GCC inline assembly, http://blogs.oracle.com/x86be/entry/gcc_style_asm_inlining_support
// We can enable SSE2 for Sun Studio in the makefile with -D__SSE2__, but users may not compile with it.
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(__SSE2__) && defined(__x86_64__) && (__SUNPRO_CC >= 0x5100)
# define __SSE2__ 1
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && ((defined(_MSC_VER) && defined(_M_IX86)) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))))
// C++Builder 2010 does not allow "call label" where label is defined within inline assembly
#define CRYPTOPP_X86_ASM_AVAILABLE
#if !defined(CRYPTOPP_DISABLE_SSE2) && (defined(_MSC_VER) || CRYPTOPP_GCC_VERSION >= 30300 || defined(__SSE2__))
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 1
#endif
#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || defined(__SSSE3__))
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 1
#endif
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && defined(_MSC_VER) && defined(_M_X64)
#define CRYPTOPP_X64_MASM_AVAILABLE 1
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && defined(__GNUC__) && defined(__x86_64__)
#define CRYPTOPP_X64_ASM_AVAILABLE 1
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && (defined(_MSC_VER) || defined(__SSE2__))
#define CRYPTOPP_SSE2_AVAILABLE 1
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SSSE3)
# if defined(__SSSE3__) || (_MSC_VER >= 1500) || (CRYPTOPP_GCC_VERSION >= 40300)
#define CRYPTOPP_SSSE3_AVAILABLE 1
# endif
#endif
// Intrinsics available in GCC 4.3 (http://gcc.gnu.org/gcc-4.3/changes.html) and
// MSVC 2008 (http://msdn.microsoft.com/en-us/library/bb892950%28v=vs.90%29.aspx)
// SunCC could generate SSE4 at 12.1, but the intrinsics are missing until 12.4.
#if !defined(CRYPTOPP_DISABLE_SSE4) && defined(CRYPTOPP_SSSE3_AVAILABLE) && \
(defined(__SSE4_1__) || (CRYPTOPP_MSC_VERSION >= 1500) || \
(CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1000) || \
(CRYPTOPP_LLVM_CLANG_VERSION >= 20300) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40000))
#define CRYPTOPP_SSE41_AVAILABLE 1
#endif
#if !defined(CRYPTOPP_DISABLE_SSE4) && defined(CRYPTOPP_SSSE3_AVAILABLE) && \
(defined(__SSE4_2__) || (CRYPTOPP_MSC_VERSION >= 1500) || \
(CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1000) || \
(CRYPTOPP_LLVM_CLANG_VERSION >= 20300) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40000))
#define CRYPTOPP_SSE42_AVAILABLE 1
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_CLMUL) && \
(defined(__PCLMUL__) || (_MSC_FULL_VER >= 150030729) || \
(CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1110) || \
(CRYPTOPP_LLVM_CLANG_VERSION >= 30200) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40300))
#define CRYPTOPP_CLMUL_AVAILABLE 1
#endif
#if !defined(CRYPTOPP_DISABLE_SSE4) && defined(CRYPTOPP_SSSE3_AVAILABLE) && \
(defined(__AES__) || (_MSC_FULL_VER >= 150030729) || \
(CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1110) || \
(CRYPTOPP_LLVM_CLANG_VERSION >= 30200) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40300))
#define CRYPTOPP_AESNI_AVAILABLE 1
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHA) && \
(defined(__SHA__) || (CRYPTOPP_MSC_VERSION >= 1900) || \
(CRYPTOPP_GCC_VERSION >= 40900) || (__INTEL_COMPILER >= 1300) || \
(CRYPTOPP_LLVM_CLANG_VERSION >= 30400) || (CRYPTOPP_APPLE_CLANG_VERSION >= 50100))
#define CRYPTOPP_SHANI_AVAILABLE 1
#endif
#endif // X86, X32, X64
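These compile-time macros pair with the runtime checks in cpu.h: a *-simd.cpp translation unit guards its intrinsics with the macro and is compiled with the matching makefile flag (SSE42_FLAG and friends), while the base file dispatches at runtime. A hedged sketch of the pattern; the function name is illustrative, not the symbol in crc-simd.cpp:

    // Sketch of a SIMD translation unit compiled with -msse4.2
    #include "pch.h"
    #include "config.h"
    #if CRYPTOPP_SSE42_AVAILABLE
    # include <nmmintrin.h>  // _mm_crc32_u8
    #endif

    NAMESPACE_BEGIN(CryptoPP)
    #if CRYPTOPP_SSE42_AVAILABLE
    word32 CRC32C_UpdateSSE42(word32 crc, const byte *data, size_t size)  // illustrative name
    {
        for (size_t i = 0; i < size; ++i)
            crc = _mm_crc32_u8(crc, data[i]);
        return crc;
    }
    #endif
    NAMESPACE_END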
// ***************** ARM CPU features ********************
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
// Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
#if !defined(CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
# if defined(__ARM_NEON__) || defined(__ARM_FEATURE_NEON) || (CRYPTOPP_MSC_VERSION >= 1700) || \
(CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
# define CRYPTOPP_ARM_NEON_AVAILABLE 1
# endif
#endif
// Requires ARMv8 and ACLE 2.0. GCC requires 4.8 and above.
// LLVM Clang requires 3.5. Apple Clang is unknown at the moment.
// Microsoft plans to support ARM-64, but it's not clear how to detect it.
// TODO: Add MSC_VER and ARM-64 platform define when available
#if !defined(CRYPTOPP_ARM_CRC32_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) && !defined(__apple_build_version__)
# if defined(__ARM_FEATURE_CRC32) || (CRYPTOPP_MSC_VERSION >= 1910) || \
defined(__aarch32__) || defined(__aarch64__)
# define CRYPTOPP_ARM_CRC32_AVAILABLE 1
# endif
#endif
// Requires ARMv8 and ACLE 2.0. GCC requires 4.8 and above.
// LLVM Clang requires 3.5. Apple Clang is unknown at the moment.
// Microsoft plans to support ARM-64, but it's not clear how to detect it.
// TODO: Add MSC_VER and ARM-64 platform define when available
#if !defined(CRYPTOPP_ARM_PMULL_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) && !defined(__apple_build_version__)
# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_MSC_VERSION >= 1910) || \
defined(__aarch32__) || defined(__aarch64__)
# define CRYPTOPP_ARM_PMULL_AVAILABLE 1
# endif
#endif
// Requires ARMv8 and ACLE 2.0. GCC requires 4.8 and above.
// LLVM Clang requires 3.5. Apple Clang is unknown at the moment.
// Microsoft plans to support ARM-64, but it's not clear how to detect it.
// TODO: Add MSC_VER and ARM-64 platform define when available
#if !defined(CRYPTOPP_ARM_CRYPTO_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_MSC_VERSION >= 1910) || \
defined(__aarch32__) || defined(__aarch64__)
# define CRYPTOPP_ARM_AES_AVAILABLE 1
# define CRYPTOPP_ARM_SHA_AVAILABLE 1
# define CRYPTOPP_ARM_CRYPTO_AVAILABLE 1
# endif
#endif
#endif // ARM32, ARM64
// ***************** Miscellaneous ********************
#if CRYPTOPP_SSE2_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || CRYPTOPP_ARM_NEON_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
#define CRYPTOPP_BOOL_ALIGN16 1
#else
#define CRYPTOPP_BOOL_ALIGN16 0
#endif
// how to allocate 16-byte aligned memory (for SSE2)
#if defined(_MSC_VER)
#define CRYPTOPP_MM_MALLOC_AVAILABLE
#elif defined(__APPLE__)
#define CRYPTOPP_APPLE_MALLOC_AVAILABLE
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
#define CRYPTOPP_MALLOC_ALIGNMENT_IS_16
#elif defined(__linux__) || defined(__sun__) || defined(__CYGWIN__)
#define CRYPTOPP_MEMALIGN_AVAILABLE
#else
#define CRYPTOPP_NO_ALIGNED_ALLOC
#endif
// Apple always provides 16-byte aligned memory, and tells us to use calloc
// http://developer.apple.com/library/mac/documentation/Performance/Conceptual/ManagingMemory/Articles/MemoryAlloc.html
// how to disable inlining
#if defined(_MSC_VER)
# define CRYPTOPP_NOINLINE_DOTDOTDOT
# define CRYPTOPP_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
# define CRYPTOPP_NOINLINE_DOTDOTDOT
# define CRYPTOPP_NOINLINE __attribute__((noinline))
#else
# define CRYPTOPP_NOINLINE_DOTDOTDOT ...
# define CRYPTOPP_NOINLINE
#endif
// How to declare class constants
#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
# define CRYPTOPP_CONSTANT(x) static const int x;
#else
# define CRYPTOPP_CONSTANT(x) enum {x};
#endif
// ***************** Initialization and Constructor priorities ********************
// CRYPTOPP_INIT_PRIORITY attempts to manage initialization of C++ static objects.

646
cpu.cpp
View File

@ -13,13 +13,9 @@
#include "misc.h"
#include "stdcpp.h"
#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
# include <arm_neon.h>
#endif
#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
#include <signal.h>
#include <setjmp.h>
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
#endif
NAMESPACE_BEGIN(CryptoPP)
@ -30,13 +26,26 @@ extern "C" {
};
#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
// ***************** IA-32 CPUs ********************
#ifdef CRYPTOPP_CPUID_AVAILABLE
#if _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
#if _MSC_VER >= 1500
bool CpuId(word32 input, word32 output[4])
inline bool CpuId(word32 func, word32 subfunc, word32 output[4])
{
__cpuid((int *)output, input);
__cpuidex((int *)output, func, subfunc);
return true;
}
#elif _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
inline bool CpuId(word32 func, word32 subfunc, word32 output[4])
{
if (subfunc != 0)
return false;
__cpuid((int *)output, func);
return true;
}
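The added subfunc argument matters for leaf 7, which banks its feature bits by subleaf. A minimal caller under the new signature, using only the documented bits already referenced below (EBX bit 18 = RDSEED, EBX bit 29 = SHA); variable names are illustrative:

    word32 regs[4] = {0};   // EAX, EBX, ECX, EDX
    if (CpuId(7, 0, regs))  // leaf 7, subleaf 0: structured extended feature flags
    {
        const bool hasRDSEED = (regs[1] /*EBX*/ & (1 << 18)) != 0;
        const bool hasSHA    = (regs[1] /*EBX*/ & (1 << 29)) != 0;
    }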
@ -59,15 +68,15 @@ extern "C"
}
#endif
bool CpuId(word32 input, word32 output[4])
inline bool CpuId(word32 func, word32 subfunc, word32 output[4])
{
#if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
__try
{
__asm
{
mov eax, input
mov ecx, 0
mov eax, func
mov ecx, subfunc
cpuid
mov edi, output
mov [edi], eax
@ -116,7 +125,7 @@ bool CpuId(word32 input, word32 output[4])
"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
# endif
: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
: "a" (input), "c" (0)
: "a" (func), "c" (subfunc)
: "cc"
);
}
@ -132,7 +141,7 @@ bool CpuId(word32 input, word32 output[4])
#endif
static bool TrySSE2()
static bool CPU_ProbeSSE2()
{
#if CRYPTOPP_BOOL_X64
return true;
@ -141,7 +150,7 @@ static bool TrySSE2()
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
AS2(por xmm0, xmm0) // executing SSE2 instruction
#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#elif CRYPTOPP_SSE2_AVAILABLE
__m128i x = _mm_setzero_si128();
return _mm_cvtsi128_si32(x) == 0;
#endif
@ -173,7 +182,7 @@ static bool TrySSE2()
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
__asm __volatile ("por %xmm0, %xmm0");
#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#elif CRYPTOPP_SSE2_AVAILABLE
__m128i x = _mm_setzero_si128();
result = _mm_cvtsi128_si32(x) == 0;
#endif
@ -189,8 +198,9 @@ static bool TrySSE2()
}
bool CRYPTOPP_SECTION_INIT g_x86DetectionDone = false;
bool CRYPTOPP_SECTION_INIT g_hasMMX = false, CRYPTOPP_SECTION_INIT g_hasISSE = false, CRYPTOPP_SECTION_INIT g_hasSSE2 = false, CRYPTOPP_SECTION_INIT g_hasSSSE3 = false;
bool CRYPTOPP_SECTION_INIT g_hasSSE4 = false, CRYPTOPP_SECTION_INIT g_hasAESNI = false, CRYPTOPP_SECTION_INIT g_hasCLMUL = false, CRYPTOPP_SECTION_INIT g_hasSHA = false;
bool CRYPTOPP_SECTION_INIT g_hasSSE2 = false, CRYPTOPP_SECTION_INIT g_hasSSSE3 = false;
bool CRYPTOPP_SECTION_INIT g_hasSSE41 = false, CRYPTOPP_SECTION_INIT g_hasSSE42 = false;
bool CRYPTOPP_SECTION_INIT g_hasAESNI = false, CRYPTOPP_SECTION_INIT g_hasCLMUL = false, CRYPTOPP_SECTION_INIT g_hasSHA = false;
bool CRYPTOPP_SECTION_INIT g_hasRDRAND = false, CRYPTOPP_SECTION_INIT g_hasRDSEED = false, CRYPTOPP_SECTION_INIT g_isP4 = false;
bool CRYPTOPP_SECTION_INIT g_hasPadlockRNG = false, CRYPTOPP_SECTION_INIT g_hasPadlockACE = false, CRYPTOPP_SECTION_INIT g_hasPadlockACE2 = false;
bool CRYPTOPP_SECTION_INIT g_hasPadlockPHE = false, CRYPTOPP_SECTION_INIT g_hasPadlockPMM = false;
@ -224,31 +234,19 @@ void DetectX86Features()
{
// Coverity finding CID 171239...
word32 cpuid0[4]={0}, cpuid1[4]={0}, cpuid2[4]={0};
if (!CpuId(0, cpuid0))
if (!CpuId(0, 0, cpuid0))
return;
if (!CpuId(1, cpuid1))
if (!CpuId(1, 0, cpuid1))
return;
g_hasMMX = (cpuid1[3] & (1 << 23)) != 0;
if ((cpuid1[3] & (1 << 26)) != 0)
g_hasSSE2 = TrySSE2();
g_hasSSE2 = CPU_ProbeSSE2();
g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9));
g_hasSSE4 = g_hasSSE2 && ((cpuid1[2] & (1<<19)) && (cpuid1[2] & (1<<20)));
g_hasSSE41 = g_hasSSE2 && (cpuid1[2] & (1<<19));
g_hasSSE42 = g_hasSSE2 && (cpuid1[2] & (1<<20));
g_hasAESNI = g_hasSSE2 && (cpuid1[2] & (1<<25));
g_hasCLMUL = g_hasSSE2 && (cpuid1[2] & (1<<1));
if ((cpuid1[3] & (1 << 25)) != 0)
g_hasISSE = true;
else
{
CpuId(0x080000000, cpuid2);
if (cpuid2[0] >= 0x080000001)
{
CpuId(0x080000001, cpuid2);
g_hasISSE = (cpuid2[3] & (1 << 22)) != 0;
}
}
if (IsIntel(cpuid0))
{
enum { RDRAND_FLAG = (1 << 30) };
@ -259,9 +257,9 @@ void DetectX86Features()
g_cacheLineSize = 8 * GETBYTE(cpuid1[1], 1);
g_hasRDRAND = !!(cpuid1[2] /*ECX*/ & RDRAND_FLAG);
if (cpuid0[0] /*EAX*/ >= 7)
if (cpuid1[0] /*EAX*/ >= 7)
{
if (CpuId(7, cpuid2))
if (CpuId(7, 0, cpuid2))
{
g_hasRDSEED = !!(cpuid2[1] /*EBX*/ & RDSEED_FLAG);
g_hasSHA = !!(cpuid2[1] /*EBX*/ & SHA_FLAG);
@ -274,13 +272,13 @@ void DetectX86Features()
enum { RDSEED_FLAG = (1 << 18) };
enum { SHA_FLAG = (1 << 29) };
CpuId(0x80000005, cpuid2);
CpuId(0x80000005, 0, cpuid2);
g_cacheLineSize = GETBYTE(cpuid2[2], 0);
g_hasRDRAND = !!(cpuid1[2] /*ECX*/ & RDRAND_FLAG);
if (cpuid0[0] /*EAX*/ >= 7)
if (cpuid1[0] /*EAX*/ >= 7)
{
if (CpuId(7, cpuid2))
if (CpuId(7, 0, cpuid2))
{
g_hasRDSEED = !!(cpuid2[1] /*EBX*/ & RDSEED_FLAG);
g_hasSHA = !!(cpuid2[1] /*EBX*/ & SHA_FLAG);
@ -295,16 +293,16 @@ void DetectX86Features()
enum { PHE_FLAGS = (0x3 << 10) };
enum { PMM_FLAGS = (0x3 << 12) };
CpuId(0xC0000000, cpuid0);
if (cpuid0[0] >= 0xC0000001)
CpuId(0xC0000000, 0, cpuid2);
if (cpuid2[0] >= 0xC0000001)
{
// Extended features available
CpuId(0xC0000001, cpuid0);
g_hasPadlockRNG = !!(cpuid0[3] /*EDX*/ & RNG_FLAGS);
g_hasPadlockACE = !!(cpuid0[3] /*EDX*/ & ACE_FLAGS);
g_hasPadlockACE2 = !!(cpuid0[3] /*EDX*/ & ACE2_FLAGS);
g_hasPadlockPHE = !!(cpuid0[3] /*EDX*/ & PHE_FLAGS);
g_hasPadlockPMM = !!(cpuid0[3] /*EDX*/ & PMM_FLAGS);
CpuId(0xC0000001, 0, cpuid2);
g_hasPadlockRNG = !!(cpuid2[3] /*EDX*/ & RNG_FLAGS);
g_hasPadlockACE = !!(cpuid2[3] /*EDX*/ & ACE_FLAGS);
g_hasPadlockACE2 = !!(cpuid2[3] /*EDX*/ & ACE2_FLAGS);
g_hasPadlockPHE = !!(cpuid2[3] /*EDX*/ & PHE_FLAGS);
g_hasPadlockPMM = !!(cpuid2[3] /*EDX*/ & PMM_FLAGS);
}
}
@ -314,436 +312,196 @@ void DetectX86Features()
g_x86DetectionDone = true;
}
// ***************** ARM-32, Aarch32 and Aarch64 CPUs ********************
#elif (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
// The ARM equivalent of CPUID probing is reading an MSR. The code requires Exception Level 1 (EL1) and above, but user space runs at EL0.
// Attempting to run the code results in a SIGILL and termination.
//
// #if defined(__arm64__) || defined(__aarch64__)
// word64 caps = 0; // Read ID_AA64ISAR0_EL1
// __asm __volatile("mrs %0, " "id_aa64isar0_el1" : "=r" (caps));
// #elif defined(__arm__) || defined(__aarch32__)
// word32 caps = 0; // Read ID_ISAR5_EL1
// __asm __volatile("mrs %0, " "id_isar5_el1" : "=r" (caps));
// #endif
//
// The following does not work well either. It appears to be missing constants, and it does not detect Aarch32 execution environments on Aarch64
// http://community.arm.com/groups/android-community/blog/2014/10/10/runtime-detection-of-cpu-features-on-an-armv8-a-cpu
//
#if defined(__linux__)
# include <sys/auxv.h>
# ifndef HWCAP_ASIMD
# define HWCAP_ASIMD (1 << 1)
# endif
# ifndef HWCAP_ARM_NEON
# define HWCAP_ARM_NEON 4096
# endif
# ifndef HWCAP_CRC32
# define HWCAP_CRC32 (1 << 7)
# endif
# ifndef HWCAP2_CRC32
# define HWCAP2_CRC32 (1 << 4)
# endif
# ifndef HWCAP_PMULL
# define HWCAP_PMULL (1 << 4)
# endif
# ifndef HWCAP2_PMULL
# define HWCAP2_PMULL (1 << 1)
# endif
# ifndef HWCAP_AES
# define HWCAP_AES (1 << 3)
# endif
# ifndef HWCAP2_AES
# define HWCAP2_AES (1 << 0)
# endif
# ifndef HWCAP_SHA1
# define HWCAP_SHA1 (1 << 5)
# endif
# ifndef HWCAP_SHA2
# define HWCAP_SHA2 (1 << 6)
# endif
# ifndef HWCAP2_SHA1
# define HWCAP2_SHA1 (1 << 2)
# endif
# ifndef HWCAP2_SHA2
# define HWCAP2_SHA2 (1 << 3)
# endif
#endif
#if defined(__APPLE__) && defined(__aarch64__)
# include <sys/utsname.h>
#endif
bool CRYPTOPP_SECTION_INIT g_ArmDetectionDone = false;
bool CRYPTOPP_SECTION_INIT g_hasNEON = false, CRYPTOPP_SECTION_INIT g_hasPMULL = false, CRYPTOPP_SECTION_INIT g_hasCRC32 = false;
bool CRYPTOPP_SECTION_INIT g_hasAES = false, CRYPTOPP_SECTION_INIT g_hasSHA1 = false, CRYPTOPP_SECTION_INIT g_hasSHA2 = false;
word32 CRYPTOPP_SECTION_INIT g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
extern "C"
// ARM does not have an unprivileged equivalent to CPUID on IA-32. We have to jump through some
// hoops to detect features on a wide array of platforms. Our strategy is two-part. First,
// attempt to *Query* the OS for a feature, like using getauxval on Linux. If that fails,
// then *Probe* the CPU by executing an instruction and observing a SIGILL if unsupported.
// The probes are in source files where compilation options like -march=armv8-a+crc make
// intrinsics available. They are expensive when compared to a standard OS feature query.
// Always perform the feature query first. For Linux see
// http://sourceware.org/ml/libc-help/2017-08/msg00012.html
// Avoid probes on Apple platforms because Apple's signal handling for SIGILLs appears broken.
// We are trying to figure out a way to feature test without probes. Also see
// http://stackoverflow.com/a/11197770/608639 and
// http://gist.github.com/erkanyildiz/390a480f27e86f8cd6ba
extern bool CPU_ProbeNEON();
extern bool CPU_ProbeCRC32();
extern bool CPU_ProbeAES();
extern bool CPU_ProbeSHA1();
extern bool CPU_ProbeSHA2();
extern bool CPU_ProbePMULL();
inline bool CPU_QueryNEON()
{
static jmp_buf s_jmpNoNEON;
static void SigIllHandlerNEON(int)
{
longjmp(s_jmpNoNEON, 1);
}
static jmp_buf s_jmpNoPMULL;
static void SigIllHandlerPMULL(int)
{
longjmp(s_jmpNoPMULL, 1);
}
static jmp_buf s_jmpNoCRC32;
static void SigIllHandlerCRC32(int)
{
longjmp(s_jmpNoCRC32, 1);
}
static jmp_buf s_jmpNoAES;
static void SigIllHandlerAES(int)
{
longjmp(s_jmpNoAES, 1);
}
static jmp_buf s_jmpNoSHA1;
static void SigIllHandlerSHA1(int)
{
longjmp(s_jmpNoSHA1, 1);
}
static jmp_buf s_jmpNoSHA2;
static void SigIllHandlerSHA2(int)
{
longjmp(s_jmpNoSHA2, 1);
}
};
#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
static bool TryNEON()
{
#if (CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
uint32_t v1[4] = {1,1,1,1};
uint32x4_t x1 = vld1q_u32(v1);
uint64_t v2[2] = {1,1};
uint64x2_t x2 = vld1q_u64(v2);
uint32x4_t x3 = vdupq_n_u32(2);
x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
uint64x2_t x4 = vdupq_n_u64(2);
x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);
result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerNEON);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpNoNEON))
result = false;
else
{
uint32_t v1[4] = {1,1,1,1};
uint32x4_t x1 = vld1q_u32(v1);
uint64_t v2[2] = {1,1};
uint64x2_t x2 = vld1q_u64(v2);
uint32x4_t x3 = {0,0,0,0};
x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
uint64x2_t x4 = {0,0};
x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);
// Hack... GCC optimizes away the code and returns true
result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
#if defined(__ANDROID__) && (defined(__aarch32__) || defined(__aarch64__))
if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD)
return true;
#elif defined(__ANDROID__) && defined(__arm__)
if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON)
return true;
#elif defined(__linux__) && defined(__aarch64__)
if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
return true;
#elif defined(__linux__) && defined(__aarch32__)
if (getauxval(AT_HWCAP2) & HWCAP2_ASIMD)
return true;
#elif defined(__linux__) && defined(__arm__)
if (getauxval(AT_HWCAP) & HWCAP_ARM_NEON)
return true;
#endif
return false;
#endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
}
static bool TryPMULL()
inline bool CPU_QueryCRC32()
{
#if (CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly128_t r1 = vmull_p64(a1, b1);
const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum}
const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum}
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerPMULL);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpNoPMULL))
result = false;
else
{
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly128_t r1 = vmull_p64(a1, b1);
const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum}
const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum}
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
#if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__))
if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32)
return true;
#elif defined(__linux__) && defined(__aarch64__)
if (getauxval(AT_HWCAP) & HWCAP_CRC32)
return true;
#elif defined(__linux__) && defined(__aarch32__)
if (getauxval(AT_HWCAP2) & HWCAP2_CRC32)
return true;
#endif
return false;
#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
}
static bool TryCRC32()
inline bool CPU_QueryPMULL()
{
#if (CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
word32 w=0, x=1; word16 y=2; byte z=3;
w = __crc32cw(w,x);
w = __crc32ch(w,y);
w = __crc32cb(w,z);
result = !!w;
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerCRC32);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpNoCRC32))
result = false;
else
{
word32 w=0, x=1; word16 y=2; byte z=3;
w = __crc32cw(w,x);
w = __crc32ch(w,y);
w = __crc32cb(w,z);
// Hack... GCC optimizes away the code and returns true
result = !!w;
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
#if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__))
if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_PMULL)
return true;
#elif defined(__linux__) && defined(__aarch64__)
if (getauxval(AT_HWCAP) & HWCAP_PMULL)
return true;
#elif defined(__linux__) && defined(__aarch32__)
if (getauxval(AT_HWCAP2) & HWCAP2_PMULL)
return true;
#endif
return false;
#endif // CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE
}
static bool TryAES()
inline bool CPU_QueryAES()
{
#if (CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
#if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__))
if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_AES)
return true;
#elif defined(__linux__) && defined(__aarch64__)
if (getauxval(AT_HWCAP) & HWCAP_AES)
return true;
#elif defined(__linux__) && defined(__aarch32__)
if (getauxval(AT_HWCAP2) & HWCAP2_AES)
return true;
#elif defined(__APPLE__)
struct utsname systemInfo;
systemInfo.machine[0] = '\0';
uname(&systemInfo);
std::string machine(systemInfo.machine);
if (machine.substr(0, 7) == "iPhone6" || machine.substr(0, 7) == "iPhone7" ||
machine.substr(0, 7) == "iPhone8" || machine.substr(0, 7) == "iPhone9" ||
machine.substr(0, 5) == "iPad4" || machine.substr(0, 5) == "iPad5" ||
machine.substr(0, 5) == "iPad6" || machine.substr(0, 5) == "iPad7")
{
// AES encrypt and decrypt
uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0);
uint8x16_t r1 = vaeseq_u8(data, key);
uint8x16_t r2 = vaesdq_u8(data, key);
result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7));
return true;
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerAES);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpNoAES))
result = false;
else
{
uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0);
uint8x16_t r1 = vaeseq_u8(data, key);
uint8x16_t r2 = vaesdq_u8(data, key);
// Hack... GCC optimizes away the code and returns true
result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7));
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
#endif
return false;
#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
}
static bool TrySHA1()
inline bool CPU_QuerySHA1()
{
#if (CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerSHA1);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpNoSHA1))
result = false;
else
{
uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
// Hack... GCC optimizes away the code and returns true
result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
#if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__))
if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_SHA1)
return true;
#elif defined(__linux__) && defined(__aarch64__)
if (getauxval(AT_HWCAP) & HWCAP_SHA1)
return true;
#elif defined(__linux__) && defined(__aarch32__)
if (getauxval(AT_HWCAP2) & HWCAP2_SHA1)
return true;
#endif
return false;
#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
}
static bool TrySHA2()
inline bool CPU_QuerySHA2()
{
#if (CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerSHA2);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpNoSHA2))
result = false;
else
{
uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
// Hack... GCC optimizes away the code and returns true
result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
#if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__))
if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_SHA2)
return true;
#elif defined(__linux__) && defined(__aarch64__)
if (getauxval(AT_HWCAP) & HWCAP_SHA2)
return true;
#elif defined(__linux__) && defined(__aarch32__)
if (getauxval(AT_HWCAP2) & HWCAP2_SHA2)
return true;
#endif
return false;
#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
}
void DetectArmFeatures()
{
g_hasNEON = TryNEON();
g_hasPMULL = TryPMULL();
g_hasCRC32 = TryCRC32();
g_hasAES = TryAES();
g_hasSHA1 = TrySHA1();
g_hasSHA2 = TrySHA2();
g_hasNEON = CPU_QueryNEON() || CPU_ProbeNEON();
g_hasCRC32 = CPU_QueryCRC32() || CPU_ProbeCRC32();
g_hasPMULL = CPU_QueryPMULL() || CPU_ProbePMULL();
g_hasAES = CPU_QueryAES() || CPU_ProbeAES();
g_hasSHA1 = CPU_QuerySHA1() || CPU_ProbeSHA1();
g_hasSHA2 = CPU_QuerySHA2() || CPU_ProbeSHA2();
g_ArmDetectionDone = true;
}
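For the probe half, a reconstruction of what CPU_ProbeNEON() plausibly looks like in neon.cpp, assuming it carries the SIGILL/longjmp pattern of the removed TryNEON() into a file compiled with NEON support; a sketch, not the shipped code. It assumes <arm_neon.h>, <signal.h> and <setjmp.h> are included as above.

    extern "C" { typedef void (*SigHandler)(int); }
    static jmp_buf s_jmpSIGILL;
    static void SigIllHandler(int) { longjmp(s_jmpSIGILL, 1); }

    bool CPU_ProbeNEON()
    {
    #if (CRYPTOPP_ARM_NEON_AVAILABLE)
        // longjmp and clobber warnings. Volatile is required.
        volatile bool result = true;
        volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
        if (oldHandler == SIG_ERR)
            return false;
        if (setjmp(s_jmpSIGILL))
            result = false;
        else
        {
            uint32x4_t x = vdupq_n_u32(2);         // executes a NEON instruction
            result = (vgetq_lane_u32(x, 0) == 2);  // use the result so the probe is not elided
        }
        signal(SIGILL, oldHandler);
        return result;
    #else
        return false;
    #endif
    }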

171
cpu.h
View File

@ -17,70 +17,6 @@
# pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
// ARM32 and ARM64 Headers
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
# if defined(__GNUC__)
# include <stdint.h>
# endif
# if defined(__ARM_NEON) || defined(__aarch32__) || defined(__aarch64__) || defined(_MSC_VER)
# include <arm_neon.h>
# endif
# if defined(__GNUC__) && !defined(__apple_build_version__)
# if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRC32) || defined(__ARM_FEATURE_CRYPTO)
# include <arm_acle.h>
# endif
# endif
#endif // ARM32 and ARM64 Headers
// Used when supplying ASM due to missing intrinsics
#if defined(__clang__)
# define GCC_INLINE inline
# define GCC_INLINE_ATTRIB __attribute__((__gnu_inline__, __always_inline__))
#elif (CRYPTOPP_GCC_VERSION >= 30300) || defined(__INTEL_COMPILER)
# define GCC_INLINE __inline
# define GCC_INLINE_ATTRIB __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
# define GCC_INLINE inline
# define GCC_INLINE_ATTRIB
# endif
// X86/X64/X32 Headers
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
// GCC X86 super-include
#if (CRYPTOPP_GCC_VERSION >= 40800)
# include <x86intrin.h>
#endif
#if (CRYPTOPP_MSC_VERSION >= 1400)
# include <intrin.h>
#endif
// Baseline include
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
# include <emmintrin.h> // __m64, __m128i, _mm_set_epi64x
#endif
#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE
# include <tmmintrin.h> // _mm_shuffle_pi8, _mm_shuffle_epi8
#endif // tmmintrin.h
#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
# include <smmintrin.h> // _mm_blend_epi16
# include <nmmintrin.h> // _mm_crc32_u{8|16|32}
#endif // smmintrin.h
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
# include <wmmintrin.h> // aesenc, aesdec, etc
#endif // wmmintrin.h
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
# include <immintrin.h> // RDRAND, RDSEED, AVX, SHA
#endif // immintrin.h
#endif // X86/X64/X32 Headers
// Applies to both X86/X32/X64 and ARM32/ARM64. And we've got MIPS devices on the way.
#if defined(_MSC_VER) || defined(__BORLANDC__)
# define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
#else
# define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
#endif
// Applies to both X86/X32/X64 and ARM32/ARM64
#if defined(CRYPTOPP_LLVM_CLANG_VERSION) || defined(CRYPTOPP_APPLE_CLANG_VERSION) || defined(CRYPTOPP_CLANG_INTEGRATED_ASSEMBLER)
#define NEW_LINE "\n"
@ -121,11 +57,10 @@ NAMESPACE_BEGIN(CryptoPP)
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
// These should not be used directly
extern CRYPTOPP_DLL bool g_x86DetectionDone;
extern CRYPTOPP_DLL bool g_hasMMX;
extern CRYPTOPP_DLL bool g_hasISSE;
extern CRYPTOPP_DLL bool g_hasSSE2;
extern CRYPTOPP_DLL bool g_hasSSSE3;
extern CRYPTOPP_DLL bool g_hasSSE4;
extern CRYPTOPP_DLL bool g_hasSSE41;
extern CRYPTOPP_DLL bool g_hasSSE42;
extern CRYPTOPP_DLL bool g_hasAESNI;
extern CRYPTOPP_DLL bool g_hasCLMUL;
extern CRYPTOPP_DLL bool g_hasSHA;
@ -140,39 +75,9 @@ extern CRYPTOPP_DLL bool g_hasPadlockPMM;
extern CRYPTOPP_DLL word32 g_cacheLineSize;
CRYPTOPP_DLL void CRYPTOPP_API DetectX86Features();
CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 input, word32 output[4]);
CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 func, word32 subfunc, word32 output[4]);
#endif // CRYPTOPP_DOXYGEN_PROCESSING
//! \brief Determines MMX availability
//! \returns true if MMX is determined to be available, false otherwise
//! \details MMX, SSE and SSE2 are core processor features for x86_64, and
//! the function always returns true for the platform.
inline bool HasMMX()
{
#if CRYPTOPP_BOOL_X64
return true;
#else
if (!g_x86DetectionDone)
DetectX86Features();
return g_hasMMX;
#endif
}
//! \brief Determines SSE availability
//! \returns true if SSE is determined to be available, false otherwise
//! \details MMX, SSE and SSE2 are core processor features for x86_64, and
//! the function always returns true for the platform.
inline bool HasISSE()
{
#if CRYPTOPP_BOOL_X64
return true;
#else
if (!g_x86DetectionDone)
DetectX86Features();
return g_hasISSE;
#endif
}
//! \brief Determines SSE2 availability
//! \returns true if SSE2 is determined to be available, false otherwise
//! \details MMX, SSE and SSE2 are core processor features for x86_64, and
@ -199,14 +104,24 @@ inline bool HasSSSE3()
return g_hasSSSE3;
}
//! \brief Determines SSE4 availability
//! \returns true if SSE4.1 and SSE4.2 are determined to be available, false otherwise
//! \details HasSSE4() is a runtime check performed using CPUID which requires both SSE4.1 and SSE4.2
inline bool HasSSE4()
//! \brief Determines SSE4.1 availability
//! \returns true if SSE4.1 is determined to be available, false otherwise
//! \details HasSSE41() is a runtime check performed using CPUID
inline bool HasSSE41()
{
if (!g_x86DetectionDone)
DetectX86Features();
return g_hasSSE4;
return g_hasSSE41;
}
//! \brief Determines SSE4.2 availability
//! \returns true if SSE4.2 is determined to be available, false otherwise
//! \details HasSSE42() is a runtime check performed using CPUID
inline bool HasSSE42()
{
if (!g_x86DetectionDone)
DetectX86Features();
return g_hasSSE42;
}
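Call sites dispatch on the finer-grained checks; a minimal sketch, reusing the illustrative SSE4.2 routine from the config.h notes above plus a hypothetical portable fallback:

    word32 crc = 0xFFFFFFFF;
    if (HasSSE42())
        crc = CRC32C_UpdateSSE42(crc, data, size);  // unit compiled with -msse4.2
    else
        crc = CRC32C_UpdateC(crc, data, size);      // portable fallback, illustrative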
//! \brief Determines AES-NI availability
@ -341,20 +256,25 @@ void CRYPTOPP_API DetectArmFeatures();
//! \brief Determine if an ARM processor has Advanced SIMD available
//! \returns true if the hardware is capable of Advanced SIMD at runtime, false otherwise.
//! \details Advanced SIMD instructions are available under Aarch64 (ARM-64) and Aarch32 (ARM-32).
//! \details Advanced SIMD instructions are available under most ARMv7, Aarch32 and Aarch64.
//! \details Runtime support requires compile time support. When compiling with GCC, you may
//! need to compile with <tt>-mfpu=neon</tt> (32-bit) or <tt>-march=armv8-a</tt>
//! (64-bit). Also see ARM's <tt>__ARM_NEON</tt> preprocessor macro.
inline bool HasNEON()
{
// ASIMD is a core feature on Aarch32 and Aarch64 like SSE2 is a core feature on x86_64
#if defined(__aarch32__) || defined(__aarch64__)
return true;
#else
if (!g_ArmDetectionDone)
DetectArmFeatures();
return g_hasNEON;
#endif
}
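// Illustrative only: the NEON workers live in separate *-simd.cpp files
// built with NEON flags, so a caller needs both the compile-time macro
// and the runtime probe. A sketch (the portable name is hypothetical):
//
//   #if (CRYPTOPP_ARM_NEON_AVAILABLE)
//       if (HasNEON())
//           { GCM_Xor16_NEON(a, b, c); return; }   // from gcm-simd.cpp
//   #endif
//   GCM_Xor16(a, b, c);   // portable fallback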
//! \brief Determine if an ARM processor provides Polynomial Multiplication (long)
//! \brief Determine if an ARM processor provides Polynomial Multiplication
//! \returns true if the hardware is capable of polynomial multiplications at runtime, false otherwise.
//! \details The multiplication instructions are available under Aarch64 (ARM-64) and Aarch32 (ARM-32).
//! \details The multiplication instructions are available under Aarch32 and Aarch64.
//! \details Runtime support requires compile time support. When compiling with GCC, you may
//! need to compile with <tt>-march=armv8-a+crypto</tt>; while Apple requires
//! <tt>-arch arm64</tt>. Also see ARM's <tt>__ARM_FEATURE_CRYPTO</tt> preprocessor macro.
@ -367,62 +287,74 @@ inline bool HasPMULL()
//! \brief Determine if an ARM processor has CRC32 available
//! \returns true if the hardware is capable of CRC32 at runtime, false otherwise.
//! \details CRC32 instructions provide access to the processor's CRC32 and CRC32-C instructions.
//! They are provided by ARM C Language Extensions 2.0 (ACLE 2.0) and available under Aarch64
//! (ARM-64) and Aarch32 (ARM-32) running on Aarch64 (i.e., an AArch32 execution environment).
//! \details The CRC32 extension provides access to the processor's CRC-32 and CRC-32C instructions.
//! They are provided by ARM C Language Extensions 2.0 (ACLE 2.0) and available under Aarch32 and Aarch64.
//! \details Runtime support requires compile time support. When compiling with GCC, you may
//! need to compile with <tt>-march=armv8-a+crc</tt>; while Apple requires
//! <tt>-arch arm64</tt>. Also see ARM's <tt>__ARM_FEATURE_CRC32</tt> preprocessor macro.
inline bool HasCRC32()
{
#if defined(__aarch32__) || defined(__aarch64__)
if (!g_ArmDetectionDone)
DetectArmFeatures();
return g_hasCRC32;
#else
return false;
#endif
}
//! \brief Determine if an ARM processor has AES available
//! \returns true if the hardware is capable of AES at runtime, false otherwise.
//! \details AES is part of the Crypto extensions from ARM C Language Extensions 2.0 (ACLE 2.0)
//! and available under Aarch64 (ARM-64) and Aarch32 (ARM-32) running on Aarch64 (i.e., an
//! AArch32 execution environment).
//! \details AES is part of the optional Crypto extensions on Aarch32 and Aarch64. They are
//! accessed using ARM C Language Extensions 2.0 (ACLE 2.0).
//! \details Runtime support requires compile time support. When compiling with GCC, you may
//! need to compile with <tt>-march=armv8-a+crypto</tt>; while Apple requires
//! <tt>-arch arm64</tt>. Also see ARM's <tt>__ARM_FEATURE_CRYPTO</tt> preprocessor macro.
inline bool HasAES()
{
#if defined(__aarch32__) || defined(__aarch64__)
if (!g_ArmDetectionDone)
DetectArmFeatures();
return g_hasAES;
#else
return false;
#endif
}
//! \brief Determine if an ARM processor has SHA1 available
//! \returns true if the hardware is capable of SHA1 at runtime, false otherwise.
//! \details SHA1 is part of the Crypto extensions from ARM C Language Extensions 2.0 (ACLE 2.0)
//! and available under Aarch64 (ARM-64) and Aarch32 (ARM-32) running on Aarch64 (i.e., an
//! AArch32 execution environment).
//! \details SHA1 is part of the optional Crypto extensions on Aarch32 and Aarch64. They are
//! accessed using ARM C Language Extensions 2.0 (ACLE 2.0).
//! \details Runtime support requires compile time support. When compiling with GCC, you may
//! need to compile with <tt>-march=armv8-a+crypto</tt>; while Apple requires
//! <tt>-arch arm64</tt>. Also see ARM's <tt>__ARM_FEATURE_CRYPTO</tt> preprocessor macro.
inline bool HasSHA1()
{
#if defined(__aarch32__) || defined(__aarch64__)
if (!g_ArmDetectionDone)
DetectArmFeatures();
return g_hasSHA1;
#else
return false;
#endif
}
//! \brief Determine if an ARM processor has SHA2 available
//! \returns true if the hardware is capable of SHA2 at runtime, false otherwise.
//! \details SHA2 is part of the Crypto extensions from ARM C Language Extensions 2.0 (ACLE 2.0)
//! and available under Aarch64 (ARM-64) and Aarch32 (ARM-32) running on Aarch64 (i.e., an
//! AArch32 execution environment).
//! \details SHA2 is part of the optional Crypto extensions on Aarch32 and Aarch64. They are
//! accessed using ARM C Language Extensions 2.0 (ACLE 2.0).
//! \details Runtime support requires compile time support. When compiling with GCC, you may
//! need to compile with <tt>-march=armv8-a+crypto</tt>; while Apple requires
//! <tt>-arch arm64</tt>. Also see ARM's <tt>__ARM_FEATURE_CRYPTO</tt> preprocessor macro.
inline bool HasSHA2()
{
#if defined(__aarch32__) || defined(__aarch64__)
if (!g_ArmDetectionDone)
DetectArmFeatures();
return g_hasSHA2;
#else
return false;
#endif
}
//! \brief Provides the cache line size at runtime
@ -457,7 +389,6 @@ inline int GetCacheLineSize()
#define ASC(x, y) x label##y*newline*
#define AS_HEX(y) 0##y##h
#elif defined(_MSC_VER) || defined(__BORLANDC__)
#define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
#define AS1(x) __asm {x}
#define AS2(x, y) __asm {x, y}
#define AS3(x, y, z) __asm {x, y, z}
@ -468,8 +399,6 @@ inline int GetCacheLineSize()
#define CRYPTOPP_NAKED __declspec(naked)
#define AS_HEX(y) 0x##y
#else
#define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
// define these in two steps to allow arguments to be expanded
#define GNU_AS1(x) #x ";" NEW_LINE
#define GNU_AS2(x, y) #x ", " #y ";" NEW_LINE
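// A sketch of why two steps are needed, assuming AS1 forwards to
// GNU_AS1 as the comment above implies: with #define AS1(x) GNU_AS1(x)
// and #define ROUNDS 10, the argument of AS1(ROUNDS) is macro-expanded
// during the forwarding call, so it stringizes to "10;" rather than
// "ROUNDS;".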

crc-simd.cpp Normal file

@ -0,0 +1,156 @@
// crc-simd.cpp - written and placed in the public domain by
// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
// This source file uses intrinsics to gain access to SSE4.2 and
// ARMv8a CRC-32 and CRC-32C instructions. A separate source file
// is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
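// For example (build detail inferred from the makefile and test script,
// not enforced here), this translation unit is expected to be compiled
// roughly as:
//   g++ $CXXFLAGS -msse4.2 -c crc-simd.cpp            # x86/x64
//   g++ $CXXFLAGS -march=armv8-a+crc -c crc-simd.cpp  # ARMv8
// while the rest of the library keeps the base flags.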
#include "pch.h"
#include "config.h"
#include "misc.h"
// Clang and GCC hoops...
#if !(defined(__ARM_FEATURE_CRC32) || defined(_MSC_VER))
# undef CRYPTOPP_ARM_CRC32_AVAILABLE
#endif
#if (CRYPTOPP_SSE42_AVAILABLE)
# include "nmmintrin.h"
#endif
// Don't include <arm_acle.h> when using Apple Clang. Early Apple compilers
// fail to compile with <arm_acle.h> included. Later Apple compilers compile
// intrinsics without <arm_acle.h> included.
#if (CRYPTOPP_ARM_CRC32_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
# include "arm_acle.h"
#endif
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
#endif
#ifndef EXCEPTION_EXECUTE_HANDLER
# define EXCEPTION_EXECUTE_HANDLER 1
#endif
NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
extern "C" {
typedef void (*SigHandler)(int);
static jmp_buf s_jmpSIGILL;
static void SigIllHandler(int)
{
longjmp(s_jmpSIGILL, 1);
}
};
#endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
bool CPU_ProbeCRC32()
{
#if (CRYPTOPP_ARM_CRC32_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
word32 w=0, x=1; word16 y=2; byte z=3;
w = __crc32w(w,x);
w = __crc32h(w,y);
w = __crc32b(w,z);
w = __crc32cw(w,x);
w = __crc32ch(w,y);
w = __crc32cb(w,z);
result = !!w;
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
#else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpSIGILL))
result = false;
else
{
word32 w=0, x=1; word16 y=2; byte z=3;
w = __crc32w(w,x);
w = __crc32h(w,y);
w = __crc32b(w,z);
w = __crc32cw(w,x);
w = __crc32ch(w,y);
w = __crc32cb(w,z);
// Hack... the dependency on w keeps GCC from optimizing away the probe and returning true unconditionally
result = !!w;
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
return false;
#endif // CRYPTOPP_ARM_CRC32_AVAILABLE
}
#endif // ARM32 or ARM64
#if (CRYPTOPP_ARM_CRC32_AVAILABLE)
void CRC32_Update_ARMV8(const byte *s, size_t n, word32& c)
{
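// Consume an unaligned head byte-wise, then whole 32-bit words, then
// the tail byte-wise. The CRC-32C and SSE4.2 variants below follow the
// same pattern.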
for(; !IsAligned<word32>(s) && n > 0; s++, n--)
c = __crc32b(c, *s);
for(; n > 4; s+=4, n-=4)
c = __crc32w(c, *(const word32 *)(void*)s);
for(; n > 0; s++, n--)
c = __crc32b(c, *s);
}
void CRC32C_Update_ARMV8(const byte *s, size_t n, word32& c)
{
for(; !IsAligned<word32>(s) && n > 0; s++, n--)
c = __crc32cb(c, *s);
for(; n > 4; s+=4, n-=4)
c = __crc32cw(c, *(const word32 *)(void*)s);
for(; n > 0; s++, n--)
c = __crc32cb(c, *s);
}
#endif
#if (CRYPTOPP_SSE42_AVAILABLE)
void CRC32C_Update_SSE42(const byte *s, size_t n, word32& c)
{
for(; !IsAligned<word32>(s) && n > 0; s++, n--)
c = _mm_crc32_u8(c, *s);
for(; n > 4; s+=4, n-=4)
c = _mm_crc32_u32(c, *(const word32 *)(void*)s);
for(; n > 0; s++, n--)
c = _mm_crc32_u8(c, *s);
}
#endif
NAMESPACE_END

crc.cpp

@ -1,44 +1,23 @@
// crc.cpp - originally written and placed in the public domain by Wei Dai
#include "pch.h"
#include "config.h"
#include "crc.h"
#include "misc.h"
#include "cpu.h"
NAMESPACE_BEGIN(CryptoPP)
// Visual Studio needs VS2008 (1500)
// http://msdn.microsoft.com/en-us/library/bb531394%28v=vs.90%29.aspx
#if defined(_MSC_VER) && (_MSC_VER < 1500)
# undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
// crc-simd.cpp
#if (CRYPTOPP_ARM_CRC32_AVAILABLE)
extern void CRC32_Update_ARMV8(const byte *s, size_t n, word32& c);
extern void CRC32C_Update_ARMV8(const byte *s, size_t n, word32& c);
#endif
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
#if (CRYPTOPP_GCC_VERSION >= 40300 || __INTEL_COMPILER >= 1000 || __SUNPRO_CC >= 0x5110 || CRYPTOPP_LLVM_CLANG_VERSION >= 20300 || CRYPTOPP_APPLE_CLANG_VERSION >= 40000) && !defined(__SSE4_2__) && !defined(_MSC_VER)
GCC_INLINE unsigned int GCC_INLINE_ATTRIB
MM_CRC32_U8(unsigned int crc, unsigned char val)
{
asm ("crc32 %1, %0" : "+r"(crc) : "r"(val));
return crc;
}
GCC_INLINE unsigned int GCC_INLINE_ATTRIB
MM_CRC32_U16(unsigned int crc, unsigned short val)
{
asm ("crc32 %1, %0" : "+r"(crc) : "r"(val));
return crc;
}
GCC_INLINE unsigned int GCC_INLINE_ATTRIB
MM_CRC32_U32(unsigned int crc, unsigned int val)
{
asm ("crc32 %1, %0" : "+r"(crc) : "r"(val));
return crc;
}
#else
#define MM_CRC32_U8(a,b) _mm_crc32_u8(a,b)
#define MM_CRC32_U16(a,b) _mm_crc32_u16(a,b)
#define MM_CRC32_U32(a,b) _mm_crc32_u32(a,b)
// crc-simd.cpp
#if (CRYPTOPP_SSE42_AVAILABLE)
extern void CRC32C_Update_SSE42(const byte *s, size_t n, word32& c);
#endif
#endif // X86/X32/X64
/* Table of CRC-32's of all single byte values (made by makecrc.c) */
const word32 CRC32::m_tab[] = {
@ -158,18 +137,10 @@ CRC32::CRC32()
void CRC32::Update(const byte *s, size_t n)
{
#if (CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE)
#if (CRYPTOPP_ARM_CRC32_AVAILABLE)
if (HasCRC32())
{
for(; !IsAligned<word32>(s) && n > 0; s++, n--)
m_crc = __crc32b(m_crc, *s);
for(; n > 4; s+=4, n-=4)
m_crc = __crc32w(m_crc, *(const word32 *)(void*)s);
for(; n > 0; s++, n--)
m_crc = __crc32b(m_crc, *s);
CRC32_Update_ARMV8(s, n, m_crc);
return;
}
#endif
@ -326,32 +297,16 @@ CRC32C::CRC32C()
void CRC32C::Update(const byte *s, size_t n)
{
#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
if (HasSSE4())
#if (CRYPTOPP_SSE42_AVAILABLE)
if (HasSSE42())
{
for(; !IsAligned<word32>(s) && n > 0; s++, n--)
m_crc = MM_CRC32_U8(m_crc, *s);
for(; n > 4; s+=4, n-=4)
m_crc = MM_CRC32_U32(m_crc, *(const word32 *)(void*)s);
for(; n > 0; s++, n--)
m_crc = MM_CRC32_U8(m_crc, *s);
CRC32C_Update_SSE42(s, n, m_crc);
return;
}
#elif (CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE)
#elif (CRYPTOPP_ARM_CRC32_AVAILABLE)
if (HasCRC32())
{
for(; !IsAligned<word32>(s) && n > 0; s++, n--)
m_crc = __crc32cb(m_crc, *s);
for(; n > 4; s+=4, n-=4)
m_crc = __crc32cw(m_crc, *(const word32 *)(void*)s);
for(; n > 0; s++, n--)
m_crc = __crc32cb(m_crc, *s);
CRC32C_Update_ARMV8(s, n, m_crc);
return;
}
#endif
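The dispatch above stays behind the public HashTransformation interface, so the split into crc-simd.cpp is invisible to callers. A minimal usage sketch (ordinary Crypto++ API, not part of this diff):
#include "crc.h"
#include <cstdio>
int main()
{
    CryptoPP::CRC32C crc;
    const CryptoPP::byte data[3] = {'a', 'b', 'c'};
    crc.Update(data, sizeof(data));
    CryptoPP::byte digest[CryptoPP::CRC32C::DIGESTSIZE];
    crc.Final(digest);
    // The SSE4.2 or ARMv8 path is chosen at runtime inside Update()
    for (unsigned int i = 0; i < sizeof(digest); ++i)
        std::printf("%02x", digest[i]);
    std::printf("\n");
    return 0;
}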

cryptdll.vcxproj

@ -1,5 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Microsoft documentation for VCXPROJ file format is located at -->
<!-- the following URL. The documentation leaves a lot to be desired. -->
<!-- https://msdn.microsoft.com/en-us/library/2208a1f2.aspx -->
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
@ -18,6 +21,9 @@
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<!-- Microsoft documentation clearly shows the Global property group -->
<!-- precedes the import of Cpp.Default.props and Cpp.props -->
<!-- https://msdn.microsoft.com/en-us/library/2208a1f2.aspx -->
<PropertyGroup Label="Globals">
<ProjectGuid>{94a428a1-9ba8-4db2-b76e-bd2e3c08f257}</ProjectGuid>
<RootNamespace>cryptdll</RootNamespace>
@ -203,6 +209,7 @@
<ClCompile Include="fips140.cpp" />
<ClCompile Include="fipstest.cpp" />
<ClCompile Include="gcm.cpp" />
<ClCompile Include="gcm-simd.cpp" />
<ClCompile Include="gf2n.cpp" />
<ClCompile Include="gfpcrypt.cpp" />
<ClCompile Include="hex.cpp" />
@ -222,10 +229,12 @@
<ClCompile Include="randpool.cpp" />
<ClCompile Include="rdtables.cpp" />
<ClCompile Include="rijndael.cpp" />
<ClCompile Include="rijndael-simd.cpp" />
<ClCompile Include="rng.cpp" />
<ClCompile Include="rsa.cpp" />
<ClCompile Include="rw.cpp" />
<ClCompile Include="sha.cpp" />
<ClCompile Include="sha-simd.cpp" />
<ClCompile Include="simple.cpp" />
<ClCompile Include="skipjack.cpp" />
<ClCompile Include="strciphr.cpp" />

cryptdll.vcxproj.filters

@ -89,6 +89,9 @@
<ClCompile Include="gcm.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="gcm-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="gf2n.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -152,6 +155,9 @@
<ClCompile Include="rijndael.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="rijndael-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="rng.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -164,6 +170,9 @@
<ClCompile Include="sha.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="sha-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="simple.cpp">
<Filter>Source Files</Filter>
</ClCompile>

cryptest.nmake

@ -45,12 +45,11 @@
###########################################################################################
# If you use 'make sources' from Linux makefile, then add 'winpipes.cpp'. Platform specific
# classes, like 'rdrand.cpp', should not be included. Add them under the X86 and X64 rules.
# If you use 'make sources' from Linux makefile, then add 'winpipes.cpp' to the list below.
LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp network.cpp oaep.cpp osrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha.cpp sha3.cpp shacal2.cpp shark.cpp sharkbox.cpp skipjack.cpp socketft.cpp sosemanuk.cpp square.cpp squaretb.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp
LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp network.cpp oaep.cpp osrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp skipjack.cpp socketft.cpp sosemanuk.cpp square.cpp squaretb.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp
LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj channels.obj cmac.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj network.obj oaep.obj osrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj seal.obj seed.obj serpent.obj sha.obj sha3.obj shacal2.obj shark.obj sharkbox.obj skipjack.obj socketft.obj sosemanuk.obj square.obj squaretb.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj
LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj channels.obj cmac.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj network.obj oaep.obj osrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj seal.obj seed.obj serpent.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj skipjack.obj socketft.obj sosemanuk.obj square.obj squaretb.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj
TEST_SRCS = bench1.cpp bench2.cpp test.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp fipsalgt.cpp dlltest.cpp fipstest.cpp
@ -128,6 +127,8 @@ LDLIBS = $(LDLIBS) ws2_32.lib kernel32.lib
!IF "$(PLATFORM)" == "ARM" || "$(PLATFORM)" == "arm" || "$(PLATFORM)" == "ARM64" || "$(PLATFORM)" == "arm64"
# CXXFLAGS = $(CXXFLAGS) /D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 /DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP
CXXFLAGS = $(CXXFLAGS) /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP
LIB_SRCS = $(LIB_SRCS) neon.cpp
LIB_OBJS = $(LIB_OBJS) neon.obj
# CXXFLAGS = $(CXXFLAGS) /DWINAPI_FAMILY=WINAPI_FAMILY_APP
# LDLIBS = $(LDLIBS) ws2_32.lib
!ENDIF

cryptest.sh

@ -1171,37 +1171,75 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
echo
OBJFILE=sha.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
X86_SSE2=$(echo -n "$X86_CPU_FLAGS" | "$GREP" -i -c sse2)
X86_SHA256_HASH_BLOCKS=$(echo -n "$DISASS_TEXT" | "$EGREP" -c 'X86_SHA256_HashBlocks')
X86_SHA256_HASH_BLOCKS=$(echo -n "$DISASS_TEXT" | "$EGREP" -c 'SHA256_HashMultipleBlocks_SSE2')
if [[ ("$X86_SHA256_HASH_BLOCKS" -ne "0") ]]; then
COUNT=$(echo -n "$DISASS_TEXT" | "$EGREP" -i -c '(rol.*0x|ror.*0x)')
if [[ ("$COUNT" -le "600") ]]; then
if [[ ("$COUNT" -le "250") ]]; then
FAILED=1
echo "ERROR: failed to generate rotate immediate instruction (X86_SHA256_HashBlocks)" | tee -a "$TEST_RESULTS"
echo "ERROR: failed to generate rotate immediate instruction (SHA256_HashMultipleBlocks_SSE2)" | tee -a "$TEST_RESULTS"
fi
else
COUNT=$(echo -n "$DISASS_TEXT" | "$EGREP" -i -c '(rol.*0x|ror.*0x)')
if [[ ("$COUNT" -le "1000") ]]; then
if [[ ("$COUNT" -le "500") ]]; then
FAILED=1
echo "ERROR: failed to generate rotate immediate instruction" | tee -a "$TEST_RESULTS"
fi
fi
if [[ ("$X86_SSE2" -ne "0" && "$X86_SHA256_HASH_BLOCKS" -eq "0") ]]; then
echo "ERROR: failed to use X86_SHA256_HashBlocks" | tee -a "$TEST_RESULTS"
echo "ERROR: failed to use SHA256_HashMultipleBlocks_SSE2" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0" && "$X86_SHA256_HASH_BLOCKS" -ne "0") ]]; then
echo "Verified rotate immediate machine instructions (X86_SHA256_HashBlocks)" | tee -a "$TEST_RESULTS"
echo "Verified rotate immediate machine instructions (SHA256_HashMultipleBlocks_SSE2)" | tee -a "$TEST_RESULTS"
elif [[ ("$FAILED" -eq "0") ]]; then
echo "Verified rotate immediate machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
############################################
# Test CRC-32C code generation
"$CXX" -DCRYPTOPP_ADHOC_MAIN -msse4.2 adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
X86_CRC32=1
fi
if [[ ("$X86_CRC32" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: X86 CRC32 code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=crc-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32l)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate crc32l instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified crc32b and crc32l machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
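# Note: the adhoc compile check above only verifies that the compiler
# accepts SSE4.2 intrinsics. A standalone equivalent (illustrative only,
# not used by this script) would be:
#   printf '#include <nmmintrin.h>\nint main(){return (int)_mm_crc32_u8(0,0);}\n' > probe.cpp
#   ${CXX:-g++} -msse4.2 probe.cpp -o probe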
############################################
# Test AES-NI code generation
@ -1216,8 +1254,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
echo "Testing: X86 AES-NI code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=rijndael.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
OBJFILE=rijndael-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
@ -1278,8 +1316,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
echo "Testing: X86 carryless multiply code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=gcm.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
OBJFILE=gcm-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
@ -1321,7 +1359,7 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
echo
OBJFILE=rdrand.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
@ -1348,44 +1386,6 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
fi
fi
############################################
# X86 CRC32 code generation
"$CXX" -DCRYPTOPP_ADHOC_MAIN -msse4.2 adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
X86_CRC32=1
fi
if [[ ("$X86_CRC32" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: X86 CRC32 code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=crc.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32l)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate crc32l instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified crc32l and crc32b machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
############################################
# X86 SHA code generation
@ -1400,8 +1400,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t
echo "Testing: X86 SHA code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=sha.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
OBJFILE=sha-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
@ -1469,7 +1469,7 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]
echo "Testing: ARM NEON code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=aria.o; rm -f "$OBJFILE" 2>/dev/null
OBJFILE=aria-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
@ -1516,51 +1516,21 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]
fi
fi
############################################
# ARM carryless multiply code generation
ARM_PMULL=$(echo -n "$ARM_CPU_FLAGS" | "$GREP" -i -c pmull)
if [[ ("$ARM_PMULL" -ne "0" || "$HAVE_ARM_CRYPTO" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: ARM carryless multiply code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=gcm.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -v pmull2 | "$GREP" -i -c pmull)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate pmull instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c pmull2)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate pmull2 instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified pmull and pmull2 machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
############################################
# ARM CRC32 code generation
ARM_CRC32=$(echo -n "$ARM_CPU_FLAGS" | "$GREP" -i -c crc32)
if [[ ("$ARM_CRC32" -ne "0") ]]; then
"$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crc adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
ARM_CRC32=1
fi
if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_CRC32" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: ARM CRC32 code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=crc.o; rm -f "$OBJFILE" 2>/dev/null
OBJFILE=crc-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
@ -1595,6 +1565,179 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]
echo "Verified crc32cb, crc32cw, crc32b and crc32w machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
############################################
# ARM carryless multiply code generation
"$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
ARM_PMULL=1
fi
if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_PMULL" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: ARM carryless multiply code generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=gcm-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -v pmull2 | "$GREP" -i -c pmull)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate pmull instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c pmull2)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate pmull2 instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified pmull and pmull2 machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
############################################
# ARM AES code generation
"$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
ARM_AES=1
fi
if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_AES" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: ARM AES generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=rijndael-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aese)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate aese instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesmc)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate aesmc instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesd)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate aesd instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesimc)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate aesimc instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified aese, aesd, aesmc, aesimc machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
############################################
# ARM SHA code generation
"$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1
if [[ "$?" -eq "0" ]]; then
ARM_SHA=1
fi
if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_SHA" -ne "0") ]]; then
echo
echo "************************************" | tee -a "$TEST_RESULTS"
echo "Testing: ARM SHA generation" | tee -a "$TEST_RESULTS"
echo
OBJFILE=sha-simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0
FAILED=0
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1c)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1c instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1m)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1m instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1p)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1p instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1h)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1h instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1su0)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1su0 instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1su1)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha1su1 instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -v sha256h2 | "$GREP" -i -c sha256h)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha256h instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256h2)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha256h2 instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256su0)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha256su0 instruction" | tee -a "$TEST_RESULTS"
fi
COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256su1)
if [[ ("$COUNT" -eq "0") ]]; then
FAILED=1
echo "ERROR: failed to generate sha256su1 instruction" | tee -a "$TEST_RESULTS"
fi
if [[ ("$FAILED" -eq "0") ]]; then
echo "Verified sha1c, sha1m, sha1p, sha1su0, sha1su1, sha256h, sha256h2, sha256su0, sha256su1 machine instructions" | tee -a "$TEST_RESULTS"
fi
fi
fi
############################################

cryptest.vcxproj

@ -1,5 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Microsoft documentation for VCXPROJ file format is located at -->
<!-- the following URL. The documentation leaves a lot to be desired. -->
<!-- https://msdn.microsoft.com/en-us/library/2208a1f2.aspx -->
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
@ -34,6 +37,9 @@
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<!-- Microsoft documentation clearly shows the Global property group -->
<!-- precedes the import of Cpp.Default.props and Cpp.props -->
<!-- https://msdn.microsoft.com/en-us/library/2208a1f2.aspx -->
<PropertyGroup Label="Globals">
<ProjectGuid>{09cdac08-e6ae-48a9-8de7-0fbc779eebde}</ProjectGuid>
<RootNamespace>cryptest</RootNamespace>

cryptlib.vcxproj

@ -1,5 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Microsoft documentation for VCXPROJ file format is located at -->
<!-- the following URL. The documentation leaves a lot to be desired. -->
<!-- https://msdn.microsoft.com/en-us/library/2208a1f2.aspx -->
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
@ -34,6 +37,9 @@
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<!-- Microsoft documentation clearly shows the Global property group -->
<!-- precedes the import of Cpp.Default.props and Cpp.props -->
<!-- https://msdn.microsoft.com/en-us/library/2208a1f2.aspx -->
<PropertyGroup Label="Globals">
<ProjectGuid>{c39f4b46-6e89-4074-902e-ca57073044d2}</ProjectGuid>
<RootNamespace>cryptlib</RootNamespace>
@ -166,6 +172,8 @@
<ClCompile Include="algparam.cpp" />
<ClCompile Include="arc4.cpp" />
<ClCompile Include="aria.cpp" />
<ClCompile Include="aria-simd.cpp" />
<ClCompile Include="ariatab.cpp" />
<ClCompile Include="asn.cpp" />
<ClCompile Include="authenc.cpp" />
<ClCompile Include="base32.cpp" />
@ -173,6 +181,7 @@
<ClCompile Include="basecode.cpp" />
<ClCompile Include="bfinit.cpp" />
<ClCompile Include="blake2.cpp" />
<ClCompile Include="blake2-simd.cpp" />
<ClCompile Include="blowfish.cpp" />
<ClCompile Include="blumshub.cpp" />
<ClCompile Include="camellia.cpp" />
@ -184,6 +193,7 @@
<ClCompile Include="channels.cpp" />
<ClCompile Include="cmac.cpp" />
<ClCompile Include="crc.cpp" />
<ClCompile Include="crc-simd.cpp" />
<ClCompile Include="default.cpp" />
<ClCompile Include="des.cpp" />
<ClCompile Include="dessp.cpp" />
@ -210,6 +220,7 @@
<ClCompile Include="fips140.cpp" />
<ClCompile Include="fipstest.cpp" />
<ClCompile Include="gcm.cpp" />
<ClCompile Include="gcm-simd.cpp" />
<ClCompile Include="gf256.cpp" />
<ClCompile Include="gf2_32.cpp" />
<ClCompile Include="gf2n.cpp" />
@ -259,6 +270,7 @@
<ClCompile Include="rdrand.cpp" />
<ClCompile Include="rdtables.cpp" />
<ClCompile Include="rijndael.cpp" />
<ClCompile Include="rijndael-simd.cpp" />
<ClCompile Include="ripemd.cpp" />
<ClCompile Include="rng.cpp" />
<ClCompile Include="rsa.cpp" />
@ -269,8 +281,10 @@
<ClCompile Include="seed.cpp" />
<ClCompile Include="serpent.cpp" />
<ClCompile Include="sha.cpp" />
<ClCompile Include="sha-simd.cpp" />
<ClCompile Include="sha3.cpp" />
<ClCompile Include="shacal2.cpp" />
<ClCompile Include="shacal2-simd.cpp" />
<ClCompile Include="shark.cpp" />
<ClCompile Include="sharkbox.cpp" />
<ClCompile Include="simple.cpp" />

cryptlib.vcxproj.filters

@ -32,6 +32,12 @@
<ClCompile Include="aria.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="aria-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="ariatab.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="asn.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -53,6 +59,9 @@
<ClCompile Include="blake2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="blake2-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="blowfish.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -89,6 +98,9 @@
<ClCompile Include="crc.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="crc-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="cryptlib.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -152,6 +164,9 @@
<ClCompile Include="gcm.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="gcm-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="gf256.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -290,6 +305,9 @@
<ClCompile Include="rijndael.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="rijndael-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="ripemd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -320,12 +338,18 @@
<ClCompile Include="sha.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="sha-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="sha3.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="shacal2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="shacal2-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="shark.cpp">
<Filter>Source Files</Filter>
</ClCompile>

dlltest.vcxproj

@ -1,5 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Microsoft documentation for VCXPROJ file format is located at -->
<!-- the following URL. The documentation leaves a lot to be desired. -->
<!-- https://msdn.microsoft.com/en-us/library/2208a1f2.aspx -->
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
@ -18,6 +21,9 @@
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<!-- Microsoft documentation clearly shows the Global property group -->
<!-- precedes the import of Cpp.Default.props and Cpp.props -->
<!-- https://msdn.microsoft.com/en-us/library/2208a1f2.aspx -->
<PropertyGroup Label="Globals">
<ProjectGuid>{1974a53a-9863-41c9-886d-b2b8c2fc3c8b}</ProjectGuid>
<RootNamespace>dlltest</RootNamespace>

dlltest.vcxproj.filters Normal file

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{d7fe0401-fa2d-40cd-80b9-b91f937996a3}</UniqueIdentifier>
<Extensions>.cpp</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="dlltest.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>

gcm-simd.cpp Normal file

@ -0,0 +1,610 @@
// gcm-simd.cpp - written and placed in the public domain by
// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
// This source file uses intrinsics to gain access to CLMUL and
// ARMv8a PMULL instructions. A separate source file is needed
// because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
#include "pch.h"
#include "config.h"
#include "misc.h"
// The Clang 3.3 integrated assembler crashes on Linux. Other versions produce incorrect results.
// Clang has never handled Intel ASM very well. I wish LLVM would fix it.
#if defined(__clang__)
# undef CRYPTOPP_X86_ASM_AVAILABLE
# undef CRYPTOPP_X32_ASM_AVAILABLE
# undef CRYPTOPP_X64_ASM_AVAILABLE
# undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#endif
// Clang and GCC hoops...
#if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER))
# undef CRYPTOPP_ARM_PMULL_AVAILABLE
#endif
#if (CRYPTOPP_CLMUL_AVAILABLE)
# include "tmmintrin.h"
# include "wmmintrin.h"
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "arm_neon.h"
#endif
// Don't include <arm_acle.h> when using Apple Clang. Early Apple compilers
// fail to compile with <arm_acle.h> included. Later Apple compilers compile
// intrinsics without <arm_acle.h> included.
#if (CRYPTOPP_ARM_PMULL_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
# include "arm_acle.h"
#endif
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
#endif
#ifndef EXCEPTION_EXECUTE_HANDLER
# define EXCEPTION_EXECUTE_HANDLER 1
#endif
// Clang __m128i casts
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
ANONYMOUS_NAMESPACE_BEGIN
// GCC 4.8 is missing PMULL gear
#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
# if (CRYPTOPP_GCC_VERSION >= 40800) && (CRYPTOPP_GCC_VERSION < 49000)
inline poly128_t VMULL_P64(poly64_t a, poly64_t b)
{
return __builtin_aarch64_crypto_pmulldi_ppp (a, b);
}
inline poly128_t VMULL_HIGH_P64(poly64x2_t a, poly64x2_t b)
{
return __builtin_aarch64_crypto_pmullv2di_ppp (a, b);
}
# else
inline poly128_t VMULL_P64(poly64_t a, poly64_t b)
{
return vmull_p64(a, b);
}
inline poly128_t VMULL_HIGH_P64(poly64x2_t a, poly64x2_t b)
{
return vmull_high_p64(a, b);
}
# endif
#endif
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) && CRYPTOPP_ARM_PMULL_AVAILABLE
#if defined(__GNUC__)
// Schneiders, Hovsmith and O'Rourke used this trick.
// It results in much better code generation in production code
// by avoiding D-register spills when using vgetq_lane_u64. The
// problem does not surface under minimal test cases.
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
uint64x2_t r;
__asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
:"=w" (r) : "w" (a), "w" (b) );
return r;
}
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
uint64x2_t r;
__asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
:"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
return r;
}
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
uint64x2_t r;
__asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
:"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
return r;
}
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
uint64x2_t r;
__asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t"
:"=w" (r) : "w" (a), "w" (b) );
return r;
}
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
uint64x2_t r;
__asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
:"=w" (r) : "w" (a), "w" (b), "I" (c) );
return r;
}
// https://github.com/weidai11/cryptopp/issues/366
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
uint64x2_t r;
__asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
:"=w" (r) : "w" (a), "w" (b), "I" (C) );
return r;
}
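// Note (added for clarity): the template form fixes the extraction
// offset at compile time, which the "I" immediate constraint requires;
// the run-time 'c' overload above only compiles when the argument folds
// to a constant.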
#endif // GCC and compatibles
#if defined(_MSC_VER)
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
}
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
}
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
}
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
}
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c);
}
// https://github.com/weidai11/cryptopp/issues/366
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C);
}
#endif // Microsoft and compatibles
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
extern "C" {
typedef void (*SigHandler)(int);
static jmp_buf s_jmpSIGILL;
static void SigIllHandler(int)
{
longjmp(s_jmpSIGILL, 1);
}
};
#endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
bool CPU_ProbePMULL()
{
#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly128_t r1 = vmull_p64(a1, b1);
const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum}
const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum}
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpSIGILL))
result = false;
else
{
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly128_t r1 = VMULL_P64(a1, b1);
const poly128_t r2 = VMULL_HIGH_P64((poly64x2_t)(a2), (poly64x2_t)(b2));
// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum}
const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum}
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
return false;
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
}
#endif // ARM32 or ARM64
#if CRYPTOPP_ARM_NEON_AVAILABLE
void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c)
{
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<uint64x2_t>()));
CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<uint64x2_t>()));
CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<uint64x2_t>()));
*(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c);
}
#endif
#if CRYPTOPP_ARM_PMULL_AVAILABLE
ANONYMOUS_NAMESPACE_BEGIN
CRYPTOPP_ALIGN_DATA(16)
const word64 s_clmulConstants64[] = {
W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), // Used for ARM and x86; polynomial coefficients
W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), // Unused for ARM; used for x86 _mm_shuffle_epi8
W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f) // Unused for ARM; used for x86 _mm_shuffle_epi8
};
const uint64x2_t *s_clmulConstants = (const uint64x2_t *)s_clmulConstants64;
const unsigned int s_clmulTableSizeInBlocks = 8;
ANONYMOUS_NAMESPACE_END
uint64x2_t GCM_Reduce_PMULL(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const uint64x2_t &r)
{
c1 = veorq_u64(c1, VEXT_U8<8>(vdupq_n_u64(0), c0));
c1 = veorq_u64(c1, PMULL_01(c0, r));
c0 = VEXT_U8<8>(c0, vdupq_n_u64(0));
c0 = vshlq_n_u64(veorq_u64(c0, c1), 1);
c0 = PMULL_00(c0, r);
c2 = veorq_u64(c2, c0);
c2 = veorq_u64(c2, VEXT_U8<8>(c1, vdupq_n_u64(0)));
c1 = vshrq_n_u64(vcombine_u64(vget_low_u64(c1), vget_low_u64(c2)), 63);
c2 = vshlq_n_u64(c2, 1);
return veorq_u64(c2, c1);
}
uint64x2_t GCM_Multiply_PMULL(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r)
{
const uint64x2_t c0 = PMULL_00(x, h);
const uint64x2_t c1 = veorq_u64(PMULL_10(x, h), PMULL_01(x, h));
const uint64x2_t c2 = PMULL_11(x, h);
return GCM_Reduce_PMULL(c0, c1, c2, r);
}
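// GCM_Multiply_PMULL above is the schoolbook 128x128 -> 256 bit
// carryless multiply: c0 = x_lo*h_lo, c1 = the two cross products,
// c2 = x_hi*h_hi, which GCM_Reduce_PMULL folds back into 128 bits.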
void GCM_SetKeyWithoutResync_PMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize)
{
const uint64x2_t r = s_clmulConstants[0];
const uint64x2_t t = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(hashKey)));
const uint64x2_t h0 = vextq_u64(t, t, 1);
uint64x2_t h = h0;
unsigned int i;
for (i=0; i<tableSize-32; i+=32)
{
const uint64x2_t h1 = GCM_Multiply_PMULL(h, h0, r);
vst1_u64((uint64_t *)(mulTable+i), vget_low_u64(h));
vst1q_u64((uint64_t *)(mulTable+i+16), h1);
vst1q_u64((uint64_t *)(mulTable+i+8), h);
vst1_u64((uint64_t *)(mulTable+i+8), vget_low_u64(h1));
h = GCM_Multiply_PMULL(h1, h0, r);
}
const uint64x2_t h1 = GCM_Multiply_PMULL(h, h0, r);
vst1_u64((uint64_t *)(mulTable+i), vget_low_u64(h));
vst1q_u64((uint64_t *)(mulTable+i+16), h1);
vst1q_u64((uint64_t *)(mulTable+i+8), h);
vst1_u64((uint64_t *)(mulTable+i+8), vget_low_u64(h1));
}
size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
{
const uint64x2_t* table = reinterpret_cast<const uint64x2_t*>(mtable);
uint64x2_t x = vreinterpretq_u64_u8(vld1q_u8(hbuffer));
const uint64x2_t r = s_clmulConstants[0];
const size_t BLOCKSIZE = 16;
while (len >= BLOCKSIZE)
{
size_t s = UnsignedMin(len/BLOCKSIZE, s_clmulTableSizeInBlocks), i=0;
uint64x2_t d1, d2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-1)*BLOCKSIZE)));
uint64x2_t c0 = vdupq_n_u64(0);
uint64x2_t c1 = vdupq_n_u64(0);
uint64x2_t c2 = vdupq_n_u64(0);
while (true)
{
const uint64x2_t h0 = vld1q_u64((const uint64_t*)(table+i));
const uint64x2_t h1 = vld1q_u64((const uint64_t*)(table+i+1));
const uint64x2_t h2 = veorq_u64(h0, h1);
if (++i == s)
{
const uint64x2_t t1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data)));
d1 = veorq_u64(vextq_u64(t1, t1, 1), x);
c0 = veorq_u64(c0, PMULL_00(d1, h0));
c2 = veorq_u64(c2, PMULL_10(d1, h1));
d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)),
vget_low_u32(vreinterpretq_u32_u64(d1))));
c1 = veorq_u64(c1, PMULL_00(d1, h2));
break;
}
d1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8)));
c0 = veorq_u64(c0, PMULL_10(d2, h0));
c2 = veorq_u64(c2, PMULL_10(d1, h1));
d2 = veorq_u64(d2, d1);
c1 = veorq_u64(c1, PMULL_10(d2, h2));
if (++i == s)
{
const uint64x2_t t2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data)));
d1 = veorq_u64(vextq_u64(t2, t2, 1), x);
c0 = veorq_u64(c0, PMULL_01(d1, h0));
c2 = veorq_u64(c2, PMULL_11(d1, h1));
d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)),
vget_low_u32(vreinterpretq_u32_u64(d1))));
c1 = veorq_u64(c1, PMULL_01(d1, h2));
break;
}
const uint64x2_t t3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8)));
d2 = vextq_u64(t3, t3, 1);
c0 = veorq_u64(c0, PMULL_01(d1, h0));
c2 = veorq_u64(c2, PMULL_01(d2, h1));
d1 = veorq_u64(d1, d2);
c1 = veorq_u64(c1, PMULL_01(d1, h2));
}
data += s*16;
len -= s*16;
c1 = veorq_u64(veorq_u64(c1, c0), c2);
x = GCM_Reduce_PMULL(c0, c1, c2, r);
}
vst1q_u64(reinterpret_cast<uint64_t *>(hbuffer), x);
return len;
}
void GCM_ReverseHashBufferIfNeeded_PMULL(byte *hashBuffer)
{
if (GetNativeByteOrder() != BIG_ENDIAN_ORDER)
{
const uint8x16_t x = vrev64q_u8(vld1q_u8(hashBuffer));
vst1q_u8(hashBuffer, vextq_u8(x, x, 8));
}
}
#endif
#if CRYPTOPP_CLMUL_AVAILABLE
ANONYMOUS_NAMESPACE_BEGIN
CRYPTOPP_ALIGN_DATA(16)
const word64 s_clmulConstants64[] = {
W64LIT(0xe100000000000000), W64LIT(0xc200000000000000),
W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607),
W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)};
const __m128i *s_clmulConstants = CONST_M128_CAST(s_clmulConstants64);
const unsigned int s_cltableSizeInBlocks = 8;
ANONYMOUS_NAMESPACE_END
#if 0
// preserved for testing
void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
{
word64 Z0=0, Z1=0, V0, V1;
typedef BlockGetAndPut<word64, BigEndian> Block;
Block::Get(a)(V0)(V1);
for (int i=0; i<16; i++)
{
for (int j=0x80; j!=0; j>>=1)
{
int x = b[i] & j;
Z0 ^= x ? V0 : 0;
Z1 ^= x ? V1 : 0;
x = (int)V1 & 1;
V1 = (V1>>1) | (V0<<63);
V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
}
}
Block::Put(NULLPTR, c)(Z0)(Z1);
}
__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
{
word64 A[1] = {ByteReverse(((word64*)&a)[i&1])};
word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])};
PolynomialMod2 pa((byte *)A, 8);
PolynomialMod2 pb((byte *)B, 8);
PolynomialMod2 c = pa*pb;
__m128i output;
for (int i=0; i<16; i++)
((byte *)&output)[i] = c.GetByte(i);
return output;
}
#endif // Testing
__m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
{
/*
The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most
significant half of c0 as a polynomial, which, due to GCM's bit reflection, is in the
rightmost bit positions and at the lowest byte addresses.
c1 ^= c0t * 0xc200000000000000
c2t ^= c0t
t = shift (c1t ^ c0b) left 1 bit
c2 ^= t * 0xe100000000000000
c2t ^= c1b
shift c2 left 1 bit and xor in lowest bit of c1t
*/
c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));
c0 = _mm_srli_si128(c0, 8);
c0 = _mm_xor_si128(c0, c1);
c0 = _mm_slli_epi64(c0, 1);
c0 = _mm_clmulepi64_si128(c0, r, 0);
c2 = _mm_xor_si128(c2, c0);
c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8));
c1 = _mm_unpacklo_epi64(c1, c2);
c1 = _mm_srli_epi64(c1, 63);
c2 = _mm_slli_epi64(c2, 1);
return _mm_xor_si128(c2, c1);
}
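// Schoolbook multiply: four 64x64 carry-less products are folded into the
// three accumulators c0 (low), c1 (middle) and c2 (high), then reduced
// modulo the GCM polynomial.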
__m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r)
{
const __m128i c0 = _mm_clmulepi64_si128(x,h,0);
const __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10));
const __m128i c2 = _mm_clmulepi64_si128(x,h,0x11);
return GCM_Reduce_CLMUL(c0, c1, c2, r);
}
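// Fill the key table with successive powers of the hash key H. The
// overlapping stores below interleave the 8-byte halves of two adjacent
// powers in each 32-byte slot, the layout GCM_AuthenticateBlocks_CLMUL
// expects.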
void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize)
{
const __m128i r = s_clmulConstants[0];
const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(hashKey)), s_clmulConstants[1]);
__m128i h = h0;
unsigned int i;
for (i=0; i<tableSize-32; i+=32)
{
const __m128i h1 = GCM_Multiply_CLMUL(h, h0, r);
_mm_storel_epi64(M128_CAST(mulTable+i), h);
_mm_storeu_si128(M128_CAST(mulTable+i+16), h1);
_mm_storeu_si128(M128_CAST(mulTable+i+8), h);
_mm_storel_epi64(M128_CAST(mulTable+i+8), h1);
h = GCM_Multiply_CLMUL(h1, h0, r);
}
const __m128i h1 = GCM_Multiply_CLMUL(h, h0, r);
_mm_storel_epi64(M128_CAST(mulTable+i), h);
_mm_storeu_si128(M128_CAST(mulTable+i+16), h1);
_mm_storeu_si128(M128_CAST(mulTable+i+8), h);
_mm_storel_epi64(M128_CAST(mulTable+i+8), h1);
}
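// Process up to s_cltableSizeInBlocks (8) blocks per pass. Partial products
// accumulate in c0/c1/c2 so the expensive reduction runs once per pass
// rather than once per block.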
size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
{
const __m128i *table = CONST_M128_CAST(mtable);
__m128i x = _mm_load_si128(M128_CAST(hbuffer));
const __m128i r = s_clmulConstants[0], mask1 = s_clmulConstants[1], mask2 = s_clmulConstants[2];
while (len >= 16)
{
size_t s = UnsignedMin(len/16, s_cltableSizeInBlocks), i=0;
__m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-1)*16)), mask2);
__m128i c0 = _mm_setzero_si128();
__m128i c1 = _mm_setzero_si128();
__m128i c2 = _mm_setzero_si128();
while (true)
{
const __m128i h0 = _mm_load_si128(table+i);
const __m128i h1 = _mm_load_si128(table+i+1);
const __m128i h2 = _mm_xor_si128(h0, h1);
if (++i == s)
{
d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data)), mask1);
d1 = _mm_xor_si128(d1, x);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1));
d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2)));
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0));
break;
}
d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-i)*16-8)), mask2);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1));
d2 = _mm_xor_si128(d2, d1);
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h2, 1));
if (++i == s)
{
d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data)), mask1);
d1 = _mm_xor_si128(d1, x);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 0x11));
d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2)));
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10));
break;
}
d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-i)*16-8)), mask1);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
d1 = _mm_xor_si128(d1, d2);
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10));
}
data += s*16;
len -= s*16;
c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2);
x = GCM_Reduce_CLMUL(c0, c1, c2, r);
}
_mm_store_si128(M128_CAST(hbuffer), x);
return len;
}
void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer)
{
// SSSE3 instruction, but only used with CLMUL
__m128i &x = *M128_CAST(hashBuffer);
x = _mm_shuffle_epi8(x, s_clmulConstants[1]);
}
#endif
NAMESPACE_END

gcm.cpp (530 changed lines)

@ -9,10 +9,6 @@
#include "pch.h"
#include "config.h"
#if CRYPTOPP_MSC_VERSION
# pragma warning(disable: 4189)
#endif
#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM
@ -31,15 +27,15 @@
// # undef CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
// #endif
// Clang casts
#define M128I_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
#include "gcm.h"
#include "cpu.h"
NAMESPACE_BEGIN(CryptoPP)
#if (CRYPTOPP_SSE2_AVAILABLE)
# include "emmintrin.h"
#endif
#if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64)
// Different assemblers accept different mnemonics: 'movd eax, xmm0' vs
// 'movd rax, xmm0' vs 'mov eax, xmm0' vs 'mov rax, xmm0'
@ -55,101 +51,13 @@ NAMESPACE_BEGIN(CryptoPP)
#endif
#endif // CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) && CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
#if defined(__GNUC__)
// Schneiders, Hovsmith and O'Rourke used this trick.
// It results in much better code generation in production code
// by avoiding D-register spills when using vgetq_lane_u64. The
// problem does not surface under minimal test cases.
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
uint64x2_t r;
__asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
:"=w" (r) : "w" (a), "w" (b) );
return r;
}
// Clang __m128i casts
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
uint64x2_t r;
__asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
:"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
return r;
}
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
uint64x2_t r;
__asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
:"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
return r;
}
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
uint64x2_t r;
__asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t"
:"=w" (r) : "w" (a), "w" (b) );
return r;
}
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
uint64x2_t r;
__asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
:"=w" (r) : "w" (a), "w" (b), "I" (c) );
return r;
}
// https://github.com/weidai11/cryptopp/issues/366
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
uint64x2_t r;
__asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
:"=w" (r) : "w" (a), "w" (b), "I" (C) );
return r;
}
#endif // GCC and compatibles
#if defined(_MSC_VER)
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
}
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
}
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
}
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
}
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c);
}
// https://github.com/weidai11/cryptopp/issues/366
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C);
}
#endif // Microsoft and compatibles
#endif // CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
#if CRYPTOPP_ARM_NEON_AVAILABLE
extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c);
#endif
word16 GCM_Base::s_reductionTable[256];
volatile bool GCM_Base::s_reductionTableInitialized = false;
@ -159,74 +67,6 @@ void GCM_Base::GCTR::IncrementCounterBy256()
IncrementCounterByOne(m_counterArray+BlockSize()-4, 3);
}
#if 0
// preserved for testing
void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
{
word64 Z0=0, Z1=0, V0, V1;
typedef BlockGetAndPut<word64, BigEndian> Block;
Block::Get(a)(V0)(V1);
for (int i=0; i<16; i++)
{
for (int j=0x80; j!=0; j>>=1)
{
int x = b[i] & j;
Z0 ^= x ? V0 : 0;
Z1 ^= x ? V1 : 0;
x = (int)V1 & 1;
V1 = (V1>>1) | (V0<<63);
V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
}
}
Block::Put(NULLPTR, c)(Z0)(Z1);
}
__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
{
word64 A[1] = {ByteReverse(((word64*)&a)[i&1])};
word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])};
PolynomialMod2 pa((byte *)A, 8);
PolynomialMod2 pb((byte *)B, 8);
PolynomialMod2 c = pa*pb;
__m128i output;
for (int i=0; i<16; i++)
((byte *)&output)[i] = c.GetByte(i);
return output;
}
#endif
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
{
// SunCC 5.14 crash (bewildering since asserts are not in effect in release builds)
// Also see http://github.com/weidai11/cryptopp/issues/226 and http://github.com/weidai11/cryptopp/issues/284
# if __SUNPRO_CC
*M128I_CAST(a) = _mm_xor_si128(*M128I_CAST(b), *M128I_CAST(c));
# elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<__m128i>()));
CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<__m128i>()));
CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<__m128i>()));
*M128I_CAST(a) = _mm_xor_si128(*M128I_CAST(b), *M128I_CAST(c));
# else
asm ("movdqa %1, %%xmm0; pxor %2, %%xmm0; movdqa %%xmm0, %0;" : "=m" (a[0]) : "m"(b[0]), "m"(c[0]));
# endif
}
#endif
#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
inline static void NEON_Xor16(byte *a, const byte *b, const byte *c)
{
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<uint64x2_t>()));
CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<uint64x2_t>()));
CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<uint64x2_t>()));
*(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c);
}
#endif
inline static void Xor16(byte *a, const byte *b, const byte *c)
{
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
@ -236,120 +76,73 @@ inline static void Xor16(byte *a, const byte *b, const byte *c)
((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
}
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
CRYPTOPP_ALIGN_DATA(16)
static const word64 s_clmulConstants64[] = {
W64LIT(0xe100000000000000), W64LIT(0xc200000000000000),
W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607),
W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)};
static const __m128i *s_clmulConstants = CONST_M128I_CAST(s_clmulConstants64);
static const unsigned int s_clmulTableSizeInBlocks = 8;
inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
#if CRYPTOPP_SSE2_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
inline static void GCM_Xor16_SSE2(byte *a, const byte *b, const byte *c)
{
/*
The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most
significant half of c0 as a polynomial, which, due to GCM's bit reflection, is in the
rightmost bit positions and at the lowest byte addresses.
c1 ^= c0t * 0xc200000000000000
c2t ^= c0t
t = shift (c1t ^ c0b) left 1 bit
c2 ^= t * 0xe100000000000000
c2t ^= c1b
shift c2 left 1 bit and xor in lowest bit of c1t
*/
#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301
c2 = _mm_xor_si128(c2, _mm_move_epi64(c0));
#else
c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
#endif
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));
c0 = _mm_srli_si128(c0, 8);
c0 = _mm_xor_si128(c0, c1);
c0 = _mm_slli_epi64(c0, 1);
c0 = _mm_clmulepi64_si128(c0, r, 0);
c2 = _mm_xor_si128(c2, c0);
c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8));
c1 = _mm_unpacklo_epi64(c1, c2);
c1 = _mm_srli_epi64(c1, 63);
c2 = _mm_slli_epi64(c2, 1);
return _mm_xor_si128(c2, c1);
}
inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r)
{
const __m128i c0 = _mm_clmulepi64_si128(x,h,0);
const __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10));
const __m128i c2 = _mm_clmulepi64_si128(x,h,0x11);
return CLMUL_Reduce(c0, c1, c2, r);
// SunCC 5.14 crash (bewildering since asserts are not in effect in release builds)
// Also see http://github.com/weidai11/cryptopp/issues/226 and http://github.com/weidai11/cryptopp/issues/284
# if __SUNPRO_CC
*M128_CAST(a) = _mm_xor_si128(*M128_CAST(b), *M128_CAST(c));
# elif CRYPTOPP_SSE2_AVAILABLE
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<__m128i>()));
CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<__m128i>()));
CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<__m128i>()));
*M128_CAST(a) = _mm_xor_si128(*M128_CAST(b), *M128_CAST(c));
# else
asm ("movdqa %1, %%xmm0; pxor %2, %%xmm0; movdqa %%xmm0, %0;" : "=m" (a[0]) : "m"(b[0]), "m"(c[0]));
# endif
}
#endif
#if CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
#if CRYPTOPP_CLMUL_AVAILABLE
extern void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
extern size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
const unsigned int s_cltableSizeInBlocks = 8;
extern void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer);
#endif // CRYPTOPP_CLMUL_AVAILABLE
CRYPTOPP_ALIGN_DATA(16)
static const word64 s_clmulConstants64[] = {
W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), // Used for ARM and x86; polynomial coefficients
W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), // Unused for ARM; used for x86 _mm_shuffle_epi8
W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f) // Unused for ARM; used for x86 _mm_shuffle_epi8
};
static const uint64x2_t *s_clmulConstants = (const uint64x2_t *)s_clmulConstants64;
static const unsigned int s_clmulTableSizeInBlocks = 8;
inline uint64x2_t PMULL_Reduce(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const uint64x2_t &r)
{
// See the comments for CLMUL_Reduce
c1 = veorq_u64(c1, VEXT_U8<8>(vdupq_n_u64(0), c0));
c1 = veorq_u64(c1, PMULL_01(c0, r));
c0 = VEXT_U8<8>(c0, vdupq_n_u64(0));
c0 = vshlq_n_u64(veorq_u64(c0, c1), 1);
c0 = PMULL_00(c0, r);
c2 = veorq_u64(c2, c0);
c2 = veorq_u64(c2, VEXT_U8<8>(c1, vdupq_n_u64(0)));
c1 = vshrq_n_u64(vcombine_u64(vget_low_u64(c1), vget_low_u64(c2)), 63);
c2 = vshlq_n_u64(c2, 1);
return veorq_u64(c2, c1);
}
inline uint64x2_t PMULL_GF_Mul(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r)
{
const uint64x2_t c0 = PMULL_00(x, h);
const uint64x2_t c1 = veorq_u64(PMULL_10(x, h), PMULL_01(x, h));
const uint64x2_t c2 = PMULL_11(x, h);
return PMULL_Reduce(c0, c1, c2, r);
}
#if CRYPTOPP_ARM_PMULL_AVAILABLE
extern void GCM_ReverseHashBufferIfNeeded_PMULL(byte *hashBuffer);
#endif
#if CRYPTOPP_ARM_PMULL_AVAILABLE
extern void GCM_SetKeyWithoutResync_PMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
extern size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
const unsigned int s_cltableSizeInBlocks = 8;
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs &params)
{
BlockCipher &blockCipher = AccessBlockCipher();
blockCipher.SetKey(userKey, keylength, params);
// GCM is only defined for 16-byte block ciphers at the moment.
// However, variable blocksize support means we have to defer
// blocksize checks to runtime after the key is set. Also see
// https://github.com/weidai11/cryptopp/issues/408.
const unsigned int blockSize = blockCipher.BlockSize();
CRYPTOPP_ASSERT(blockSize == REQUIRED_BLOCKSIZE);
if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE)
throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
int tableSize, i, j, k;
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if CRYPTOPP_CLMUL_AVAILABLE
if (HasCLMUL())
{
// Avoid "parameter not used" error and suppress Coverity finding
(void)params.GetIntValue(Name::TableSize(), tableSize);
tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
tableSize = s_cltableSizeInBlocks * blockSize;
CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
}
else
#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
#elif CRYPTOPP_ARM_PMULL_AVAILABLE
if (HasPMULL())
{
// Avoid "parameter not used" error and suppress Coverity finding
(void)params.GetIntValue(Name::TableSize(), tableSize);
tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
tableSize = s_cltableSizeInBlocks * blockSize;
CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
}
else
#endif
@ -359,61 +152,28 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
else
tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024;
#if defined(_MSC_VER) && (_MSC_VER < 1400)
//#if defined(_MSC_VER) && (_MSC_VER < 1400)
// VC 2003 workaround: compiler generates bad code for 64K tables
tableSize = 2*1024;
#endif
//tableSize = 2*1024;
//#endif
}
m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize);
m_buffer.resize(3*blockSize + tableSize);
byte *mulTable = MulTable();
byte *hashKey = HashKey();
memset(hashKey, 0, REQUIRED_BLOCKSIZE);
blockCipher.ProcessBlock(hashKey);
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if CRYPTOPP_CLMUL_AVAILABLE
if (HasCLMUL())
{
const __m128i r = s_clmulConstants[0];
__m128i h0 = _mm_shuffle_epi8(_mm_load_si128(M128I_CAST(hashKey)), s_clmulConstants[1]);
__m128i h = h0;
for (i=0; i<tableSize; i+=32)
{
__m128i h1 = CLMUL_GF_Mul(h, h0, r);
_mm_storel_epi64(M128I_CAST(mulTable+i), h);
_mm_storeu_si128(M128I_CAST(mulTable+i+16), h1);
_mm_storeu_si128(M128I_CAST(mulTable+i+8), h);
_mm_storel_epi64(M128I_CAST(mulTable+i+8), h1);
h = CLMUL_GF_Mul(h1, h0, r);
}
GCM_SetKeyWithoutResync_CLMUL(hashKey, mulTable, tableSize);
return;
}
#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
#elif CRYPTOPP_ARM_PMULL_AVAILABLE
if (HasPMULL())
{
const uint64x2_t r = s_clmulConstants[0];
const uint64x2_t t = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(hashKey)));
const uint64x2_t h0 = vextq_u64(t, t, 1);
uint64x2_t h = h0;
for (i=0; i<tableSize-32; i+=32)
{
const uint64x2_t h1 = PMULL_GF_Mul(h, h0, r);
vst1_u64((uint64_t *)(mulTable+i), vget_low_u64(h));
vst1q_u64((uint64_t *)(mulTable+i+16), h1);
vst1q_u64((uint64_t *)(mulTable+i+8), h);
vst1_u64((uint64_t *)(mulTable+i+8), vget_low_u64(h1));
h = PMULL_GF_Mul(h1, h0, r);
}
const uint64x2_t h1 = PMULL_GF_Mul(h, h0, r);
vst1_u64((uint64_t *)(mulTable+i), vget_low_u64(h));
vst1q_u64((uint64_t *)(mulTable+i+16), h1);
vst1q_u64((uint64_t *)(mulTable+i+8), h);
vst1_u64((uint64_t *)(mulTable+i+8), vget_low_u64(h1));
GCM_SetKeyWithoutResync_PMULL(hashKey, mulTable, tableSize);
return;
}
#endif
@ -437,17 +197,17 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
for (i=0; i<16; i++)
{
memset(mulTable+i*256*16, 0, 16);
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_SSE2_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2())
for (j=2; j<=0x80; j*=2)
for (k=1; k<j; k++)
SSE2_Xor16(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
GCM_Xor16_SSE2(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
else
#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
#elif CRYPTOPP_ARM_NEON_AVAILABLE
if (HasNEON())
for (j=2; j<=0x80; j*=2)
for (k=1; k<j; k++)
NEON_Xor16(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
GCM_Xor16_NEON(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
else
#endif
for (j=2; j<=0x80; j*=2)
@ -489,22 +249,22 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
{
memset(mulTable+i*256, 0, 16);
memset(mulTable+1024+i*256, 0, 16);
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_SSE2_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2())
for (j=2; j<=8; j*=2)
for (k=1; k<j; k++)
{
SSE2_Xor16(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
SSE2_Xor16(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
GCM_Xor16_SSE2(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
GCM_Xor16_SSE2(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
}
else
#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
#elif CRYPTOPP_ARM_NEON_AVAILABLE
if (HasNEON())
for (j=2; j<=8; j*=2)
for (k=1; k<j; k++)
{
NEON_Xor16(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
NEON_Xor16(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
GCM_Xor16_NEON(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
GCM_Xor16_NEON(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
}
else
#endif
@ -520,20 +280,15 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
inline void GCM_Base::ReverseHashBufferIfNeeded()
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if CRYPTOPP_CLMUL_AVAILABLE
if (HasCLMUL())
{
__m128i &x = *M128I_CAST(HashBuffer());
x = _mm_shuffle_epi8(x, s_clmulConstants[1]);
GCM_ReverseHashBufferIfNeeded_CLMUL(HashBuffer());
}
#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
#elif CRYPTOPP_ARM_PMULL_AVAILABLE
if (HasPMULL())
{
if (GetNativeByteOrder() != BIG_ENDIAN_ORDER)
{
const uint8x16_t x = vrev64q_u8(vld1q_u8(HashBuffer()));
vst1q_u8(HashBuffer(), vextq_u8(x, x, 8));
}
GCM_ReverseHashBufferIfNeeded_PMULL(HashBuffer());
}
#endif
}
@ -588,8 +343,8 @@ unsigned int GCM_Base::OptimalDataAlignment() const
return
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
HasSSE2() ? 16 :
#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
HasNEON() ? 16 :
#elif CRYPTOPP_ARM_NEON_AVAILABLE
HasNEON() ? 4 :
#endif
GetBlockCipher().OptimalDataAlignment();
}
@ -611,141 +366,16 @@ void GCM_AuthenticateBlocks_64K(const byte *data, size_t blocks, word64 *hashBuf
size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if CRYPTOPP_CLMUL_AVAILABLE
if (HasCLMUL())
{
const __m128i *mulTable = CONST_M128I_CAST(MulTable());
__m128i x = _mm_load_si128(M128I_CAST(HashBuffer()));
const __m128i r = s_clmulConstants[0], mask1 = s_clmulConstants[1], mask2 = s_clmulConstants[2];
while (len >= 16)
{
size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
__m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data+(s-1)*16)), mask2);
__m128i c0 = _mm_setzero_si128();
__m128i c1 = _mm_setzero_si128();
__m128i c2 = _mm_setzero_si128();
while (true)
{
__m128i h0 = _mm_load_si128(mulTable+i);
__m128i h1 = _mm_load_si128(mulTable+i+1);
__m128i h2 = _mm_xor_si128(h0, h1);
if (++i == s)
{
d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data)), mask1);
d1 = _mm_xor_si128(d1, x);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1));
d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2)));
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0));
break;
}
d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data+(s-i)*16-8)), mask2);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1));
d2 = _mm_xor_si128(d2, d1);
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h2, 1));
if (++i == s)
{
d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data)), mask1);
d1 = _mm_xor_si128(d1, x);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 0x11));
d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2)));
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10));
break;
}
d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data+(s-i)*16-8)), mask1);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
d1 = _mm_xor_si128(d1, d2);
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10));
}
data += s*16;
len -= s*16;
c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2);
x = CLMUL_Reduce(c0, c1, c2, r);
}
_mm_store_si128(M128I_CAST(HashBuffer()), x);
return len;
return GCM_AuthenticateBlocks_CLMUL(data, len, MulTable(), HashBuffer());
}
#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
#elif CRYPTOPP_ARM_PMULL_AVAILABLE
if (HasPMULL())
{
const uint64x2_t *mulTable = (const uint64x2_t *)MulTable();
uint64x2_t x = vreinterpretq_u64_u8(vld1q_u8(HashBuffer()));
const uint64x2_t r = s_clmulConstants[0];
while (len >= 16)
{
size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
uint64x2_t d1, d2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-1)*16)));
uint64x2_t c0 = vdupq_n_u64(0);
uint64x2_t c1 = vdupq_n_u64(0);
uint64x2_t c2 = vdupq_n_u64(0);
while (true)
{
const uint64x2_t h0 = vld1q_u64((const uint64_t*)(mulTable+i));
const uint64x2_t h1 = vld1q_u64((const uint64_t*)(mulTable+i+1));
const uint64x2_t h2 = veorq_u64(h0, h1);
if (++i == s)
{
const uint64x2_t t1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data)));
d1 = veorq_u64(vextq_u64(t1, t1, 1), x);
c0 = veorq_u64(c0, PMULL_00(d1, h0));
c2 = veorq_u64(c2, PMULL_10(d1, h1));
d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)),
vget_low_u32(vreinterpretq_u32_u64(d1))));
c1 = veorq_u64(c1, PMULL_00(d1, h2));
break;
}
d1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8)));
c0 = veorq_u64(c0, PMULL_10(d2, h0));
c2 = veorq_u64(c2, PMULL_10(d1, h1));
d2 = veorq_u64(d2, d1);
c1 = veorq_u64(c1, PMULL_10(d2, h2));
if (++i == s)
{
const uint64x2_t t2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data)));
d1 = veorq_u64(vextq_u64(t2, t2, 1), x);
c0 = veorq_u64(c0, PMULL_01(d1, h0));
c2 = veorq_u64(c2, PMULL_11(d1, h1));
d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)),
vget_low_u32(vreinterpretq_u32_u64(d1))));
c1 = veorq_u64(c1, PMULL_01(d1, h2));
break;
}
const uint64x2_t t3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8)));
d2 = vextq_u64(t3, t3, 1);
c0 = veorq_u64(c0, PMULL_01(d1, h0));
c2 = veorq_u64(c2, PMULL_01(d2, h1));
d1 = veorq_u64(d1, d2);
c1 = veorq_u64(c1, PMULL_01(d1, h2));
}
data += s*16;
len -= s*16;
c1 = veorq_u64(veorq_u64(c1, c0), c2);
x = PMULL_Reduce(c0, c1, c2, r);
}
vst1q_u64((uint64_t *)HashBuffer(), x);
return len;
}
return GCM_AuthenticateBlocks_PMULL(data, len, MulTable(), HashBuffer());
}
#endif
typedef BlockGetAndPut<word64, NativeByteOrder> Block;
@ -755,7 +385,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
switch (2*(m_buffer.size()>=64*1024)
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
+ HasSSE2()
//#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
//#elif CRYPTOPP_ARM_NEON_AVAILABLE
// + HasNEON()
#endif
)

neon.cpp (new file, 108 lines)

@ -0,0 +1,108 @@
// neon.cpp - written and placed in the public domain by
// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
// This source file uses intrinsics to gain access to ARMv7a and
// ARMv8a NEON instructions. A separate source file is needed
// because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
#include "pch.h"
#include "config.h"
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "arm_neon.h"
#endif
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
#endif
#ifndef EXCEPTION_EXECUTE_HANDLER
# define EXCEPTION_EXECUTE_HANDLER 1
#endif
NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
extern "C" {
typedef void (*SigHandler)(int);
static jmp_buf s_jmpSIGILL;
static void SigIllHandler(int)
{
longjmp(s_jmpSIGILL, 1);
}
};
#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
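// CPU_ProbeNEON executes a handful of NEON load and lane operations to
// detect availability at runtime. An illegal-instruction fault is caught
// with SEH under MSVC, and with a SIGILL handler plus setjmp/longjmp
// elsewhere.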
bool CPU_ProbeNEON()
{
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
uint32_t v1[4] = {1,1,1,1};
uint32x4_t x1 = vld1q_u32(v1);
uint64_t v2[2] = {1,1};
uint64x2_t x2 = vld1q_u64(v2);
uint32x4_t x3 = vdupq_n_u32(2);
x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
uint64x2_t x4 = vdupq_n_u64(2);
x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);
result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpSIGILL))
result = false;
else
{
uint32_t v1[4] = {1,1,1,1};
uint32x4_t x1 = vld1q_u32(v1);
uint64_t v2[2] = {1,1};
uint64x2_t x2 = vld1q_u64(v2);
uint32x4_t x3 = {0,0,0,0};
x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
uint64x2_t x4 = {0,0};
x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);
// Hack... GCC optimizes away the code and returns true
result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
return false;
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
}
NAMESPACE_END

rijndael-simd.cpp (new file, 705 lines)

@ -0,0 +1,705 @@
// rijndael-simd.cpp - written and placed in the public domain by
// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
// This source file uses intrinsics to gain access to AES-NI and
// ARMv8a AES instructions. A separate source file is needed
// because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
//
// ARMv8a AES code based on CriticalBlue code from Johannes Schneiders,
// Skip Hovsmith and Barry O'Rourke for the mbedTLS project. Stepping
// mbedTLS under a debugger helped us determine problems with our
// subkey generation and scheduling.
#include "pch.h"
#include "config.h"
#include "misc.h"
// Clang and GCC hoops...
#if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER))
# undef CRYPTOPP_ARM_AES_AVAILABLE
#endif
#if (CRYPTOPP_AESNI_AVAILABLE)
// Hack... We are supposed to use <nmmintrin.h>. GCC 4.8, LLVM Clang 3.5
// and Apple Clang 6.0 conflate SSE4.1 and SSE4.2. If we use <nmmintrin.h>
// then the compile fails with "SSE4.2 instruction set not enabled". Also see
// https://gcc.gnu.org/ml/gcc-help/2017-08/msg00015.html.
# include "smmintrin.h"
# include "wmmintrin.h"
#endif
#if (CRYPTOPP_ARM_AES_AVAILABLE)
# include "arm_neon.h"
#endif
// Don't include <arm_acle.h> when using Apple Clang. Early Apple compilers
// fail to compile with <arm_acle.h> included. Later Apple compilers compile
// intrinsics without <arm_acle.h> included.
#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
# include "arm_acle.h"
#endif
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
#endif
#ifndef EXCEPTION_EXECUTE_HANDLER
# define EXCEPTION_EXECUTE_HANDLER 1
#endif
// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif
// Clang __m128i casts
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
extern "C" {
typedef void (*SigHandler)(int);
static jmp_buf s_jmpSIGILL;
static void SigIllHandler(int)
{
longjmp(s_jmpSIGILL, 1);
}
};
#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
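// CPU_ProbeAES executes a round of AES encrypt and decrypt instructions
// under the same SIGILL/SEH guard used by the NEON probe to verify the
// ARMv8 Crypto extensions at runtime.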
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
bool CPU_ProbeAES()
{
#if (CRYPTOPP_ARM_AES_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
// AES encrypt and decrypt
uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0);
uint8x16_t r1 = vaeseq_u8(data, key);
uint8x16_t r2 = vaesdq_u8(data, key);
r1 = vaesmcq_u8(r1);
r2 = vaesimcq_u8(r2);
result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7));
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpSIGILL))
result = false;
else
{
uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0);
uint8x16_t r1 = vaeseq_u8(data, key);
uint8x16_t r2 = vaesdq_u8(data, key);
r1 = vaesmcq_u8(r1);
r2 = vaesimcq_u8(r2);
// Hack... GCC optimizes away the code and returns true
result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7));
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
return false;
#endif // CRYPTOPP_ARM_AES_AVAILABLE
}
#endif // ARM32 or ARM64
#if (CRYPTOPP_ARM_AES_AVAILABLE)
inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
{
const byte *keys = reinterpret_cast<const byte*>(subkeys);
// Unroll the loop by hand; it profits 0.3 to 0.5 cpb. The nine unrolled
// rounds below cover AES-128; the loop that follows runs only for
// AES-192 and AES-256.
block = vaeseq_u8(block, vld1q_u8(keys+0));
block = vaesmcq_u8(block);
block = vaeseq_u8(block, vld1q_u8(keys+16));
block = vaesmcq_u8(block);
block = vaeseq_u8(block, vld1q_u8(keys+32));
block = vaesmcq_u8(block);
block = vaeseq_u8(block, vld1q_u8(keys+48));
block = vaesmcq_u8(block);
block = vaeseq_u8(block, vld1q_u8(keys+64));
block = vaesmcq_u8(block);
block = vaeseq_u8(block, vld1q_u8(keys+80));
block = vaesmcq_u8(block);
block = vaeseq_u8(block, vld1q_u8(keys+96));
block = vaesmcq_u8(block);
block = vaeseq_u8(block, vld1q_u8(keys+112));
block = vaesmcq_u8(block);
block = vaeseq_u8(block, vld1q_u8(keys+128));
block = vaesmcq_u8(block);
unsigned int i=9;
for ( ; i<rounds-1; ++i)
{
// AES single round encryption
block = vaeseq_u8(block, vld1q_u8(keys+i*16));
// AES mix columns
block = vaesmcq_u8(block);
}
// AES single round encryption
block = vaeseq_u8(block, vld1q_u8(keys+i*16));
// Final Add (bitwise Xor)
block = veorq_u8(block, vld1q_u8(keys+(i+1)*16));
}
inline void ARMV8_Enc_4_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
uint8x16_t &block3, const word32 *subkeys, unsigned int rounds)
{
const byte *keys = reinterpret_cast<const byte*>(subkeys);
unsigned int i=0;
for ( ; i<rounds-1; ++i)
{
// AES single round encryption
block0 = vaeseq_u8(block0, vld1q_u8(keys+i*16));
// AES mix columns
block0 = vaesmcq_u8(block0);
// AES single round encryption
block1 = vaeseq_u8(block1, vld1q_u8(keys+i*16));
// AES mix columns
block1 = vaesmcq_u8(block1);
// AES single round encryption
block2 = vaeseq_u8(block2, vld1q_u8(keys+i*16));
// AES mix columns
block2 = vaesmcq_u8(block2);
// AES single round encryption
block3 = vaeseq_u8(block3, vld1q_u8(keys+i*16));
// AES mix columns
block3 = vaesmcq_u8(block3);
}
// AES single round encryption
block0 = vaeseq_u8(block0, vld1q_u8(keys+i*16));
block1 = vaeseq_u8(block1, vld1q_u8(keys+i*16));
block2 = vaeseq_u8(block2, vld1q_u8(keys+i*16));
block3 = vaeseq_u8(block3, vld1q_u8(keys+i*16));
// Final Add (bitwise Xor)
block0 = veorq_u8(block0, vld1q_u8(keys+(i+1)*16));
block1 = veorq_u8(block1, vld1q_u8(keys+(i+1)*16));
block2 = veorq_u8(block2, vld1q_u8(keys+(i+1)*16));
block3 = veorq_u8(block3, vld1q_u8(keys+(i+1)*16));
}
inline void ARMV8_Dec_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
{
const byte *keys = reinterpret_cast<const byte*>(subkeys);
// Unroll the loop by hand; it profits 0.3 to 0.5 cpb. The nine unrolled
// rounds below cover AES-128; the loop that follows runs only for
// AES-192 and AES-256.
block = vaesdq_u8(block, vld1q_u8(keys+0));
block = vaesimcq_u8(block);
block = vaesdq_u8(block, vld1q_u8(keys+16));
block = vaesimcq_u8(block);
block = vaesdq_u8(block, vld1q_u8(keys+32));
block = vaesimcq_u8(block);
block = vaesdq_u8(block, vld1q_u8(keys+48));
block = vaesimcq_u8(block);
block = vaesdq_u8(block, vld1q_u8(keys+64));
block = vaesimcq_u8(block);
block = vaesdq_u8(block, vld1q_u8(keys+80));
block = vaesimcq_u8(block);
block = vaesdq_u8(block, vld1q_u8(keys+96));
block = vaesimcq_u8(block);
block = vaesdq_u8(block, vld1q_u8(keys+112));
block = vaesimcq_u8(block);
block = vaesdq_u8(block, vld1q_u8(keys+128));
block = vaesimcq_u8(block);
unsigned int i=9;
for ( ; i<rounds-1; ++i)
{
// AES single round decryption
block = vaesdq_u8(block, vld1q_u8(keys+i*16));
// AES inverse mix columns
block = vaesimcq_u8(block);
}
// AES single round decryption
block = vaesdq_u8(block, vld1q_u8(keys+i*16));
// Final Add (bitwise Xor)
block = veorq_u8(block, vld1q_u8(keys+(i+1)*16));
}
inline void ARMV8_Dec_4_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
uint8x16_t &block3, const word32 *subkeys, unsigned int rounds)
{
const byte *keys = reinterpret_cast<const byte*>(subkeys);
unsigned int i=0;
for ( ; i<rounds-1; ++i)
{
// AES single round decryption
block0 = vaesdq_u8(block0, vld1q_u8(keys+i*16));
// AES inverse mix columns
block0 = vaesimcq_u8(block0);
// AES single round decryption
block1 = vaesdq_u8(block1, vld1q_u8(keys+i*16));
// AES inverse mix columns
block1 = vaesimcq_u8(block1);
// AES single round decryption
block2 = vaesdq_u8(block2, vld1q_u8(keys+i*16));
// AES inverse mix columns
block2 = vaesimcq_u8(block2);
// AES single round decryption
block3 = vaesdq_u8(block3, vld1q_u8(keys+i*16));
// AES inverse mix columns
block3 = vaesimcq_u8(block3);
}
// AES single round decryption
block0 = vaesdq_u8(block0, vld1q_u8(keys+i*16));
block1 = vaesdq_u8(block1, vld1q_u8(keys+i*16));
block2 = vaesdq_u8(block2, vld1q_u8(keys+i*16));
block3 = vaesdq_u8(block3, vld1q_u8(keys+i*16));
// Final Add (bitwise Xor)
block0 = veorq_u8(block0, vld1q_u8(keys+(i+1)*16));
block1 = veorq_u8(block1, vld1q_u8(keys+(i+1)*16));
block2 = veorq_u8(block2, vld1q_u8(keys+(i+1)*16));
block3 = veorq_u8(block3, vld1q_u8(keys+(i+1)*16));
}
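// s_one holds the value 1 positioned in the last byte of the block. The
// per-byte vaddq_u8 bumps the low byte of a big-endian CTR-mode counter.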
const word32 s_one[] = {0, 0, 0, 1<<24};
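// Generic driver shared by encryption and decryption. It honors the
// BlockTransformation flags (counter input, xor input/output, reverse
// direction, fixed pointers), pushing four blocks at a time through func4
// and the tail through func1.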
template <typename F1, typename F4>
size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F4 func4, const word32 *subkeys, unsigned int rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
size_t blockSize = 16;
size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
size_t xorIncrement = xorBlocks ? blockSize : 0;
size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
if (flags & BlockTransformation::BT_ReverseDirection)
{
inBlocks += length - blockSize;
xorBlocks += length - blockSize;
outBlocks += length - blockSize;
inIncrement = 0-inIncrement;
xorIncrement = 0-xorIncrement;
outIncrement = 0-outIncrement;
}
if (flags & BlockTransformation::BT_AllowParallel)
{
while (length >= 4*blockSize)
{
uint8x16_t block0, block1, block2, block3, temp;
block0 = vld1q_u8(inBlocks);
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
uint32x4_t be = vld1q_u32(s_one);
block1 = vaddq_u8(block0, vreinterpretq_u8_u32(be));
block2 = vaddq_u8(block1, vreinterpretq_u8_u32(be));
block3 = vaddq_u8(block2, vreinterpretq_u8_u32(be));
temp = vaddq_u8(block3, vreinterpretq_u8_u32(be));
vst1q_u8(const_cast<byte*>(inBlocks), temp);
}
else
{
inBlocks += inIncrement;
block1 = vld1q_u8(inBlocks);
inBlocks += inIncrement;
block2 = vld1q_u8(inBlocks);
inBlocks += inIncrement;
block3 = vld1q_u8(inBlocks);
inBlocks += inIncrement;
}
if (flags & BlockTransformation::BT_XorInput)
{
block0 = veorq_u8(block0, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
block1 = veorq_u8(block1, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
block2 = veorq_u8(block2, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
block3 = veorq_u8(block3, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
}
func4(block0, block1, block2, block3, subkeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
block0 = veorq_u8(block0, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
block1 = veorq_u8(block1, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
block2 = veorq_u8(block2, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
block3 = veorq_u8(block3, vld1q_u8(xorBlocks));
xorBlocks += xorIncrement;
}
vst1q_u8(outBlocks, block0);
outBlocks += outIncrement;
vst1q_u8(outBlocks, block1);
outBlocks += outIncrement;
vst1q_u8(outBlocks, block2);
outBlocks += outIncrement;
vst1q_u8(outBlocks, block3);
outBlocks += outIncrement;
length -= 4*blockSize;
}
}
while (length >= blockSize)
{
uint8x16_t block = vld1q_u8(inBlocks);
if (flags & BlockTransformation::BT_XorInput)
block = veorq_u8(block, vld1q_u8(xorBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter)
const_cast<byte *>(inBlocks)[15]++;
func1(block, subkeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
block = veorq_u8(block, vld1q_u8(xorBlocks));
vst1q_u8(outBlocks, block);
inBlocks += inIncrement;
outBlocks += outIncrement;
xorBlocks += xorIncrement;
length -= blockSize;
}
return length;
}
size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return Rijndael_AdvancedProcessBlocks_ARMV8(ARMV8_Enc_Block, ARMV8_Enc_4_Blocks,
subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return Rijndael_AdvancedProcessBlocks_ARMV8(ARMV8_Dec_Block, ARMV8_Dec_4_Blocks,
subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_AES_AVAILABLE
#if (CRYPTOPP_AESNI_AVAILABLE)
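// AES-NI round helpers. AES uses 10, 12 or 14 rounds, so the encrypt and
// decrypt loops can process two rounds per iteration, with the final full
// round and the last (no MixColumns) round peeled off.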
inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
{
block = _mm_aesenc_si128(block, subkeys[i]);
block = _mm_aesenc_si128(block, subkeys[i+1]);
}
block = _mm_aesenc_si128(block, subkeys[rounds-1]);
block = _mm_aesenclast_si128(block, subkeys[rounds]);
}
inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
block0 = _mm_xor_si128(block0, rk);
block1 = _mm_xor_si128(block1, rk);
block2 = _mm_xor_si128(block2, rk);
block3 = _mm_xor_si128(block3, rk);
for (unsigned int i=1; i<rounds; i++)
{
rk = subkeys[i];
block0 = _mm_aesenc_si128(block0, rk);
block1 = _mm_aesenc_si128(block1, rk);
block2 = _mm_aesenc_si128(block2, rk);
block3 = _mm_aesenc_si128(block3, rk);
}
rk = subkeys[rounds];
block0 = _mm_aesenclast_si128(block0, rk);
block1 = _mm_aesenclast_si128(block1, rk);
block2 = _mm_aesenclast_si128(block2, rk);
block3 = _mm_aesenclast_si128(block3, rk);
}
inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
{
block = _mm_aesdec_si128(block, subkeys[i]);
block = _mm_aesdec_si128(block, subkeys[i+1]);
}
block = _mm_aesdec_si128(block, subkeys[rounds-1]);
block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}
inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
block0 = _mm_xor_si128(block0, rk);
block1 = _mm_xor_si128(block1, rk);
block2 = _mm_xor_si128(block2, rk);
block3 = _mm_xor_si128(block3, rk);
for (unsigned int i=1; i<rounds; i++)
{
rk = subkeys[i];
block0 = _mm_aesdec_si128(block0, rk);
block1 = _mm_aesdec_si128(block1, rk);
block2 = _mm_aesdec_si128(block2, rk);
block3 = _mm_aesdec_si128(block3, rk);
}
rk = subkeys[rounds];
block0 = _mm_aesdeclast_si128(block0, rk);
block1 = _mm_aesdeclast_si128(block1, rk);
block2 = _mm_aesdeclast_si128(block2, rk);
block3 = _mm_aesdeclast_si128(block3, rk);
}
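// Same counter constant as the ARM path: the value 1 in the last byte of
// the block, used to bump the low byte of a big-endian CTR-mode counter.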
CRYPTOPP_ALIGN_DATA(16)
static const word32 s_one[] = {0, 0, 0, 1<<24};
template <typename F1, typename F4>
inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
size_t blockSize = 16;
size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
size_t xorIncrement = xorBlocks ? blockSize : 0;
size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
MAYBE_CONST __m128i *subkeys = reinterpret_cast<MAYBE_CONST __m128i*>(subKeys);
if (flags & BlockTransformation::BT_ReverseDirection)
{
inBlocks += length - blockSize;
xorBlocks += length - blockSize;
outBlocks += length - blockSize;
inIncrement = 0-inIncrement;
xorIncrement = 0-xorIncrement;
outIncrement = 0-outIncrement;
}
if (flags & BlockTransformation::BT_AllowParallel)
{
while (length >= 4*blockSize)
{
__m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3;
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const __m128i be1 = *CONST_M128_CAST(s_one);
block1 = _mm_add_epi32(block0, be1);
block2 = _mm_add_epi32(block1, be1);
block3 = _mm_add_epi32(block2, be1);
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1));
}
else
{
inBlocks += inIncrement;
block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
}
if (flags & BlockTransformation::BT_XorInput)
{
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(xorBlocks);
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
}
func4(block0, block1, block2, block3, subkeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
}
_mm_storeu_si128(M128_CAST(outBlocks), block0);
outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block1);
outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block2);
outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block3);
outBlocks += outIncrement;
length -= 4*blockSize;
}
}
while (length >= blockSize)
{
__m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
if (flags & BlockTransformation::BT_XorInput)
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
if (flags & BlockTransformation::BT_InBlockIsCounter)
const_cast<byte *>(inBlocks)[15]++;
func1(block, subkeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
_mm_storeu_si128(M128_CAST(outBlocks), block);
inBlocks += inIncrement;
outBlocks += outIncrement;
xorBlocks += xorIncrement;
length -= blockSize;
}
return length;
}
size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(MAYBE_CONST word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return Rijndael_AdvancedProcessBlocks_AESNI(AESNI_Enc_Block, AESNI_Enc_4_Blocks,
subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(MAYBE_CONST word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return Rijndael_AdvancedProcessBlocks_AESNI(AESNI_Dec_Block, AESNI_Dec_4_Blocks,
subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
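// Key schedule expansion using AESKEYGENASSIST. keyLen is 16, 24 or 32
// bytes, and rk must have room for the 4*(rounds+1) word32 subkeys.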
void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk)
{
const unsigned rounds = static_cast<unsigned int>(keyLen/4 + 6);
static const word32 rcLE[] = {
0x01, 0x02, 0x04, 0x08,
0x10, 0x20, 0x40, 0x80,
0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
const word32 *ro = rcLE, *rc = rcLE;
CRYPTOPP_UNUSED(ro);
__m128i temp = _mm_loadu_si128(M128_CAST(userKey+keyLen-16));
std::memcpy(rk, userKey, keyLen);
// keySize: m_key allocates 4*(rounds+1) word32's.
const size_t keySize = 4*(rounds+1);
const word32* end = rk + keySize;
while (true)
{
CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE));
rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
if (rk + keyLen/4 + 4 == end)
break;
if (keyLen == 24)
{
rk[10] = rk[ 4] ^ rk[ 9];
rk[11] = rk[ 5] ^ rk[10];
CRYPTOPP_ASSERT(keySize >= 12);
temp = _mm_insert_epi32(temp, rk[11], 3);
}
else if (keyLen == 32)
{
CRYPTOPP_ASSERT(keySize >= 12);
temp = _mm_insert_epi32(temp, rk[11], 3);
rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
rk[13] = rk[ 5] ^ rk[12];
rk[14] = rk[ 6] ^ rk[13];
rk[15] = rk[ 7] ^ rk[14];
CRYPTOPP_ASSERT(keySize >= 16);
temp = _mm_insert_epi32(temp, rk[15], 3);
}
else
{
CRYPTOPP_ASSERT(keySize >= 8);
temp = _mm_insert_epi32(temp, rk[7], 3);
}
rk += keyLen/4;
}
}
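// For decryption the round keys are used in reverse order, and the inner
// round keys get InvMixColumns applied (the equivalent inverse cipher).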
void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds)
{
unsigned int i, j;
__m128i temp;
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
// __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
// SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
vec_swap(*(__m128i *)(key), *(__m128i *)(key+4*rounds));
#else
std::swap(*M128_CAST(key), *M128_CAST(key+4*rounds));
#endif
for (i = 4, j = 4*rounds-4; i < j; i += 4, j -= 4)
{
temp = _mm_aesimc_si128(*M128_CAST(key+i));
*M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+j));
*M128_CAST(key+j) = temp;
}
*M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+i));
}
#endif // CRYPTOPP_AESNI_AVAILABLE
NAMESPACE_END

rijndael.cpp

@ -5,7 +5,7 @@
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
/*
August 2017: Added support for ARMv8 AES instructions via compiler intrinsics.
July 2017: Added support for ARM AES instructions via compiler intrinsics.
*/
/*
@ -85,13 +85,6 @@ NAMESPACE_BEGIN(CryptoPP)
# define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
#endif
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
static void Rijndael_Enc_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock,
const word32 *subKeys, unsigned int rounds);
static void Rijndael_Dec_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock,
const word32 *subKeys, unsigned int rounds);
#endif
// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
@ -229,123 +222,68 @@ void Rijndael::Base::FillDecTable()
s_TdFilled = true;
}
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
AssertValidKeyLength(keylen);
#if (CRYPTOPP_AESNI_AVAILABLE)
extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
extern void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds);
m_rounds = keylen/4 + 6;
extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
#if (CRYPTOPP_ARM_AES_AVAILABLE)
extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
{
AssertValidKeyLength(keyLen);
m_rounds = keyLen/4 + 6;
m_key.New(4*(m_rounds+1));
word32 *rk = m_key;
#if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
#if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
// MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
if (HasAESNI() && HasSSE4())
if (HasAESNI() && HasSSE41())
{
static const word32 rcLE[] = {
0x01, 0x02, 0x04, 0x08,
0x10, 0x20, 0x40, 0x80,
0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
// Coverity finding, appears to be false positive. Assert the condition.
const word32 *ro = rcLE, *rc = rcLE;
CRYPTOPP_UNUSED(ro);
__m128i temp = _mm_loadu_si128(M128I_CAST(userKey+keylen-16));
memcpy(rk, userKey, keylen);
while (true)
{
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE));
rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
if (rk + keylen/4 + 4 == m_key.end())
break;
if (keylen == 24)
{
rk[10] = rk[ 4] ^ rk[ 9];
rk[11] = rk[ 5] ^ rk[10];
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(m_key.size() >= 12);
temp = _mm_insert_epi32(temp, rk[11], 3);
}
else if (keylen == 32)
{
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(m_key.size() >= 12);
temp = _mm_insert_epi32(temp, rk[11], 3);
rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
rk[13] = rk[ 5] ^ rk[12];
rk[14] = rk[ 6] ^ rk[13];
rk[15] = rk[ 7] ^ rk[14];
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(m_key.size() >= 16);
temp = _mm_insert_epi32(temp, rk[15], 3);
}
else
{
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(m_key.size() >= 8);
temp = _mm_insert_epi32(temp, rk[7], 3);
}
rk += keylen/4;
}
// TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
// Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
if (!IsForwardTransformation())
{
rk = m_key;
unsigned int i, j;
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
// __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
// SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
vec_swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
#else
std::swap(*M128I_CAST(rk), *M128I_CAST(rk+4*m_rounds));
#endif
for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
{
temp = _mm_aesimc_si128(*M128I_CAST(rk+i));
*M128I_CAST(rk+i) = _mm_aesimc_si128(*M128I_CAST(rk+j));
*M128I_CAST(rk+j) = temp;
}
*M128I_CAST(rk+i) = _mm_aesimc_si128(*M128I_CAST(rk+i));
}
Rijndael_UncheckedSetKeyRev_SSE4_AESNI(m_key, m_rounds);
return;
}
#endif
GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
const word32 *rc = rcon;
word32 temp;
while (true)
{
temp = rk[keylen/4-1];
temp = rk[keyLen/4-1];
word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
if (rk + keyLen/4 + 4 == m_key.end())
break;
if (keyLen == 24)
{
rk[10] = rk[ 4] ^ rk[ 9];
rk[11] = rk[ 5] ^ rk[10];
}
else if (keyLen == 32)
{
temp = rk[11];
rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
@@ -353,7 +291,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
rk[14] = rk[ 6] ^ rk[13];
rk[15] = rk[ 7] ^ rk[14];
}
rk += keyLen/4;
}
rk = m_key;
@@ -394,11 +332,11 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
}
#if CRYPTOPP_AESNI_AVAILABLE
if (HasAESNI())
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
if (HasAES())
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
@@ -406,20 +344,22 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
# if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
if (HasSSE2())
# else
if (HasAESNI())
# endif
{
(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
#if (CRYPTOPP_ARM_AES_AVAILABLE)
if (HasAES())
{
(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
@@ -494,17 +434,18 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_AESNI_AVAILABLE
if (HasAESNI())
{
(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
#if (CRYPTOPP_ARM_AES_AVAILABLE)
if (HasAES())
{
(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
@@ -1115,191 +1056,6 @@ static inline bool AliasedWithTable(const byte *begin, const byte *end)
return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
{
block = _mm_aesenc_si128(block, subkeys[i]);
block = _mm_aesenc_si128(block, subkeys[i+1]);
}
block = _mm_aesenc_si128(block, subkeys[rounds-1]);
block = _mm_aesenclast_si128(block, subkeys[rounds]);
}
inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
block0 = _mm_xor_si128(block0, rk);
block1 = _mm_xor_si128(block1, rk);
block2 = _mm_xor_si128(block2, rk);
block3 = _mm_xor_si128(block3, rk);
for (unsigned int i=1; i<rounds; i++)
{
rk = subkeys[i];
block0 = _mm_aesenc_si128(block0, rk);
block1 = _mm_aesenc_si128(block1, rk);
block2 = _mm_aesenc_si128(block2, rk);
block3 = _mm_aesenc_si128(block3, rk);
}
rk = subkeys[rounds];
block0 = _mm_aesenclast_si128(block0, rk);
block1 = _mm_aesenclast_si128(block1, rk);
block2 = _mm_aesenclast_si128(block2, rk);
block3 = _mm_aesenclast_si128(block3, rk);
}
inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
{
block = _mm_aesdec_si128(block, subkeys[i]);
block = _mm_aesdec_si128(block, subkeys[i+1]);
}
block = _mm_aesdec_si128(block, subkeys[rounds-1]);
block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}
inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
block0 = _mm_xor_si128(block0, rk);
block1 = _mm_xor_si128(block1, rk);
block2 = _mm_xor_si128(block2, rk);
block3 = _mm_xor_si128(block3, rk);
for (unsigned int i=1; i<rounds; i++)
{
rk = subkeys[i];
block0 = _mm_aesdec_si128(block0, rk);
block1 = _mm_aesdec_si128(block1, rk);
block2 = _mm_aesdec_si128(block2, rk);
block3 = _mm_aesdec_si128(block3, rk);
}
rk = subkeys[rounds];
block0 = _mm_aesdeclast_si128(block0, rk);
block1 = _mm_aesdeclast_si128(block1, rk);
block2 = _mm_aesdeclast_si128(block2, rk);
block3 = _mm_aesdeclast_si128(block3, rk);
}
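// s_one increments a big-endian counter block with a little-endian add:
// loaded into an __m128i, the counter's last byte is the most significant
// byte of the high 32-bit lane, so adding 1<<24 in that lane bumps the
// counter by one (used by the BT_InBlockIsCounter paths below).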
CRYPTOPP_ALIGN_DATA(16)
static const word32 s_one[] = {0, 0, 0, 1<<24};
template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
size_t blockSize = 16;
size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
size_t xorIncrement = xorBlocks ? blockSize : 0;
size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
if (flags & BlockTransformation::BT_ReverseDirection)
{
CRYPTOPP_ASSERT(length % blockSize == 0);
inBlocks += length - blockSize;
xorBlocks += length - blockSize;
outBlocks += length - blockSize;
inIncrement = 0-inIncrement;
xorIncrement = 0-xorIncrement;
outIncrement = 0-outIncrement;
}
if (flags & BlockTransformation::BT_AllowParallel)
{
while (length >= 4*blockSize)
{
__m128i block0 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks)), block1, block2, block3;
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const __m128i be1 = *CONST_M128I_CAST(s_one);
block1 = _mm_add_epi32(block0, be1);
block2 = _mm_add_epi32(block1, be1);
block3 = _mm_add_epi32(block2, be1);
_mm_storeu_si128(M128I_CAST(inBlocks), _mm_add_epi32(block3, be1));
}
else
{
inBlocks += inIncrement;
block1 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks));
inBlocks += inIncrement;
block2 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks));
inBlocks += inIncrement;
block3 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks));
inBlocks += inIncrement;
}
if (flags & BlockTransformation::BT_XorInput)
{
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(xorBlocks);
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
xorBlocks += xorIncrement;
}
func4(block0, block1, block2, block3, subkeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
xorBlocks += xorIncrement;
}
_mm_storeu_si128(M128I_CAST(outBlocks), block0);
outBlocks += outIncrement;
_mm_storeu_si128(M128I_CAST(outBlocks), block1);
outBlocks += outIncrement;
_mm_storeu_si128(M128I_CAST(outBlocks), block2);
outBlocks += outIncrement;
_mm_storeu_si128(M128I_CAST(outBlocks), block3);
outBlocks += outIncrement;
length -= 4*blockSize;
}
}
while (length >= blockSize)
{
__m128i block = _mm_loadu_si128(CONST_M128I_CAST(inBlocks));
if (flags & BlockTransformation::BT_XorInput)
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
if (flags & BlockTransformation::BT_InBlockIsCounter)
const_cast<byte *>(inBlocks)[15]++;
func1(block, subkeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
_mm_storeu_si128(M128I_CAST(outBlocks), block);
inBlocks += inIncrement;
outBlocks += outIncrement;
xorBlocks += xorIncrement;
length -= blockSize;
}
return length;
}
#endif
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
struct Locals
{
word32 subkeys[4*12], workspace[8];
@@ -1314,13 +1070,24 @@ const size_t s_aliasBlockSize = 256;
const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { }
#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
// Do nothing
Rijndael::Enc::Enc() { }
#endif
#if CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_AESNI_AVAILABLE
if (HasAESNI())
return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
if (HasAES())
return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
@@ -1375,116 +1142,21 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_AESNI_AVAILABLE
if (HasAESNI())
return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
if (HasAES())
return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
void Rijndael_Enc_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock,
const word32 *subKeys, unsigned int rounds)
{
uint8x16_t data = vld1q_u8(inBlock);
const byte *keys = reinterpret_cast<const byte*>(subKeys);
// Unroll the loop, profit 0.3 to 0.5 cpb.
data = vaeseq_u8(data, vld1q_u8(keys+0));
data = vaesmcq_u8(data);
data = vaeseq_u8(data, vld1q_u8(keys+16));
data = vaesmcq_u8(data);
data = vaeseq_u8(data, vld1q_u8(keys+32));
data = vaesmcq_u8(data);
data = vaeseq_u8(data, vld1q_u8(keys+48));
data = vaesmcq_u8(data);
data = vaeseq_u8(data, vld1q_u8(keys+64));
data = vaesmcq_u8(data);
data = vaeseq_u8(data, vld1q_u8(keys+80));
data = vaesmcq_u8(data);
data = vaeseq_u8(data, vld1q_u8(keys+96));
data = vaesmcq_u8(data);
data = vaeseq_u8(data, vld1q_u8(keys+112));
data = vaesmcq_u8(data);
data = vaeseq_u8(data, vld1q_u8(keys+128));
data = vaesmcq_u8(data);
unsigned int i=9;
for ( ; i<rounds-1; ++i)
{
// AES single round encryption
data = vaeseq_u8(data, vld1q_u8(keys+i*16));
// AES mix columns
data = vaesmcq_u8(data);
}
// AES single round encryption
data = vaeseq_u8(data, vld1q_u8(keys+i*16));
// Final Add (bitwise Xor)
data = veorq_u8(data, vld1q_u8(keys+(i+1)*16));
if (xorBlock)
vst1q_u8(outBlock, veorq_u8(data, vld1q_u8(xorBlock)));
else
vst1q_u8(outBlock, data);
}
void Rijndael_Dec_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock,
const word32 *subKeys, unsigned int rounds)
{
uint8x16_t data = vld1q_u8(inBlock);
const byte *keys = reinterpret_cast<const byte*>(subKeys);
// Unroll the loop, profit 0.3 to 0.5 cpb.
data = vaesdq_u8(data, vld1q_u8(keys+0));
data = vaesimcq_u8(data);
data = vaesdq_u8(data, vld1q_u8(keys+16));
data = vaesimcq_u8(data);
data = vaesdq_u8(data, vld1q_u8(keys+32));
data = vaesimcq_u8(data);
data = vaesdq_u8(data, vld1q_u8(keys+48));
data = vaesimcq_u8(data);
data = vaesdq_u8(data, vld1q_u8(keys+64));
data = vaesimcq_u8(data);
data = vaesdq_u8(data, vld1q_u8(keys+80));
data = vaesimcq_u8(data);
data = vaesdq_u8(data, vld1q_u8(keys+96));
data = vaesimcq_u8(data);
data = vaesdq_u8(data, vld1q_u8(keys+112));
data = vaesimcq_u8(data);
data = vaesdq_u8(data, vld1q_u8(keys+128));
data = vaesimcq_u8(data);
unsigned int i=9;
for ( ; i<rounds-1; ++i)
{
// AES single round decryption
data = vaesdq_u8(data, vld1q_u8(keys+i*16));
// AES inverse mix columns
data = vaesimcq_u8(data);
}
// AES single round decryption
data = vaesdq_u8(data, vld1q_u8(keys+i*16));
// Final Add (bitwise Xor)
data = veorq_u8(data, vld1q_u8(keys+(i+1)*16));
if (xorBlock)
vst1q_u8(outBlock, veorq_u8(data, vld1q_u8(xorBlock)));
else
vst1q_u8(outBlock, data);
}
#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
#endif // CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS
NAMESPACE_END
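This is the shape of the split throughout the commit: the base file keeps the portable C++ and the runtime dispatch, while the intrinsics move behind extern functions in a matching *-simd.cpp compiled with extra per-file flags. A minimal sketch of the pattern; the SomeCipher_* names are hypothetical, for illustration only:

// some.cpp - compiled with the project-wide CXXFLAGS only
extern void SomeCipher_Transform_SIMD(word32 *state);   // defined in some-simd.cpp
static void SomeCipher_Transform_CXX(word32 *state)
{
    /* portable C++ path */
}

void SomeCipher_Transform(word32 *state)
{
    if (HasSSE41())                         // runtime feature probe from cpu.h
        SomeCipher_Transform_SIMD(state);   // intrinsics built with, e.g., -msse4.1
    else
        SomeCipher_Transform_CXX(state);
}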

rijndael.h

@@ -17,6 +17,10 @@
# define CRYPTOPP_DISABLE_RIJNDAEL_ASM
#endif
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
# define CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS 1
#endif
NAMESPACE_BEGIN(CryptoPP)
//! \brief Rijndael block cipher information
@@ -56,7 +60,7 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS
Enc();
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
private:
@@ -70,7 +74,7 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};
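Gating Enc() and AdvancedProcessBlocks on CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS gives ARM builds the same multi-block entry point the x86 builds already had. A hedged usage sketch; the key and buffers are placeholders:

byte key[16] = {0}, in[64] = {0}, out[64];   // four 16-byte blocks
Rijndael::Encryption enc(key, sizeof(key));

// Returns the number of bytes it could not process; 0 means the whole run
// was handled, possibly four blocks at a time on AES-NI or ARMv8 hardware.
size_t left = enc.AdvancedProcessBlocks(in, NULLPTR, out, sizeof(in),
    BlockTransformation::BT_AllowParallel);
CRYPTOPP_ASSERT(left == 0);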

sha-simd.cpp Normal file

@@ -0,0 +1,961 @@
// sha-simd.cpp - written and placed in the public domain by
// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
// This source file uses intrinsics to gain access to SHA-NI and
// ARMv8a SHA instructions. A separate source file is needed
// because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
#include "pch.h"
#include "config.h"
#include "sha.h"
#include "misc.h"
// Clang and GCC hoops...
#if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER))
# undef CRYPTOPP_ARM_SHA_AVAILABLE
#endif
#if (CRYPTOPP_SHANI_AVAILABLE)
# include "nmmintrin.h"
# include "immintrin.h"
#endif
#if (CRYPTOPP_ARM_SHA_AVAILABLE)
# include "arm_neon.h"
#endif
// Don't include <arm_acle.h> when using Apple Clang. Early Apple compilers
// fail to compile with <arm_acle.h> included. Later Apple compilers compile
// intrinsics without <arm_acle.h> included.
#if (CRYPTOPP_ARM_SHA_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
# include "arm_acle.h"
#endif
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
#endif
#ifndef EXCEPTION_EXECUTE_HANDLER
# define EXCEPTION_EXECUTE_HANDLER 1
#endif
// Clang __m128i casts
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
extern "C" {
typedef void (*SigHandler)(int);
static jmp_buf s_jmpSIGILL;
static void SigIllHandler(int)
{
longjmp(s_jmpSIGILL, 1);
}
};
#endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
bool CPU_ProbeSHA1()
{
#if (CRYPTOPP_ARM_SHA_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpSIGILL))
result = false;
else
{
uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
return false;
#endif // CRYPTOPP_ARM_SHA_AVAILABLE
}
bool CPU_ProbeSHA2()
{
#if (CRYPTOPP_ARM_SHA_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpSIGILL))
result = false;
else
{
uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
#else
return false;
#endif // CRYPTOPP_ARM_SHA_AVAILABLE
}
#endif // ARM32 or ARM64
extern const word32 SHA256_K[64];
///////////////////////////////////
// start of Walton/Gulley's code //
///////////////////////////////////
#if CRYPTOPP_SHANI_AVAILABLE
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
CRYPTOPP_ASSERT(state);
CRYPTOPP_ASSERT(data);
CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
__m128i MASK, MSG0, MSG1, MSG2, MSG3;
// Load initial values
ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
E0 = _mm_set_epi32(state[4], 0, 0, 0);
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
// IA-32 SHA is little endian, SHA::Transform is big endian,
// and SHA::HashMultipleBlocks can be either. ByteOrder
// allows us to avoid extra endian reversals. It saves 1.0 cpb.
MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
_mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
_mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
while (length >= SHA1::BLOCKSIZE)
{
// Save current hash
ABCD_SAVE = ABCD;
E0_SAVE = E0;
// Rounds 0-3
MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
MSG0 = _mm_shuffle_epi8(MSG0, MASK);
E0 = _mm_add_epi32(E0, MSG0);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
// Rounds 4-7
MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
// Rounds 8-11
MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
// Rounds 12-15
MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
// Rounds 16-19
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
// Rounds 20-23
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
// Rounds 24-27
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
// Rounds 28-31
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
// Rounds 32-35
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
// Rounds 36-39
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
// Rounds 40-43
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
// Rounds 44-47
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
// Rounds 48-51
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
// Rounds 52-55
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
// Rounds 56-59
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
// Rounds 60-63
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
// Rounds 64-67
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
// Rounds 68-71
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
MSG3 = _mm_xor_si128(MSG3, MSG1);
// Rounds 72-75
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
// Rounds 76-79
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
// Add values back to state
E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
data += SHA1::BLOCKSIZE/sizeof(word32);
length -= SHA1::BLOCKSIZE;
}
// Save state
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
_mm_storeu_si128(M128_CAST(state), ABCD);
state[4] = _mm_extract_epi32(E0, 3);
}
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
CRYPTOPP_ASSERT(state);
CRYPTOPP_ASSERT(data);
CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_loadu_si128(M128_CAST(&state[0]));
STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
// IA-32 SHA is little endian, SHA::Transform is big endian,
// and SHA::HashMultipleBlocks can be either. ByteOrder
// allows us to avoid extra endian reversals. It saves 1.0 cpb.
MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
_mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
while (length >= SHA256::BLOCKSIZE)
{
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
data += SHA256::BLOCKSIZE/sizeof(word32);
length -= SHA256::BLOCKSIZE;
}
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_storeu_si128(M128_CAST(&state[0]), STATE0);
_mm_storeu_si128(M128_CAST(&state[4]), STATE1);
}
#endif // CRYPTOPP_SHANI_AVAILABLE
/////////////////////////////////
// end of Walton/Gulley's code //
/////////////////////////////////
/////////////////////////////////////////////////////////
// start of Walton/Schneiders/O'Rourke/Hovsmith's code //
/////////////////////////////////////////////////////////
#if CRYPTOPP_ARM_SHA_AVAILABLE
void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
CRYPTOPP_ASSERT(state);
CRYPTOPP_ASSERT(data);
CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
uint32x4_t C0, C1, C2, C3;
uint32x4_t ABCD, ABCD_SAVED;
uint32x4_t MSG0, MSG1, MSG2, MSG3;
uint32x4_t TMP0, TMP1;
uint32_t E0, E0_SAVED, E1;
// Load initial values
C0 = vdupq_n_u32(0x5A827999);
C1 = vdupq_n_u32(0x6ED9EBA1);
C2 = vdupq_n_u32(0x8F1BBCDC);
C3 = vdupq_n_u32(0xCA62C1D6);
ABCD = vld1q_u32(&state[0]);
E0 = state[4];
while (length >= SHA1::BLOCKSIZE)
{
// Save current hash
ABCD_SAVED = ABCD;
E0_SAVED = E0;
MSG0 = vld1q_u32(data + 0);
MSG1 = vld1q_u32(data + 4);
MSG2 = vld1q_u32(data + 8);
MSG3 = vld1q_u32(data + 12);
if (order == BIG_ENDIAN_ORDER) // Data arrangement
{
MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
}
TMP0 = vaddq_u32(MSG0, C0);
TMP1 = vaddq_u32(MSG1, C0);
// Rounds 0-3
E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1cq_u32(ABCD, E0, TMP0);
TMP0 = vaddq_u32(MSG2, C0);
MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
// Rounds 4-7
E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1cq_u32(ABCD, E1, TMP1);
TMP1 = vaddq_u32(MSG3, C0);
MSG0 = vsha1su1q_u32(MSG0, MSG3);
MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
// Rounds 8-11
E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1cq_u32(ABCD, E0, TMP0);
TMP0 = vaddq_u32(MSG0, C0);
MSG1 = vsha1su1q_u32(MSG1, MSG0);
MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
// Rounds 12-15
E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1cq_u32(ABCD, E1, TMP1);
TMP1 = vaddq_u32(MSG1, C1);
MSG2 = vsha1su1q_u32(MSG2, MSG1);
MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
// Rounds 16-19
E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1cq_u32(ABCD, E0, TMP0);
TMP0 = vaddq_u32(MSG2, C1);
MSG3 = vsha1su1q_u32(MSG3, MSG2);
MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
// Rounds 20-23
E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1pq_u32(ABCD, E1, TMP1);
TMP1 = vaddq_u32(MSG3, C1);
MSG0 = vsha1su1q_u32(MSG0, MSG3);
MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
// Rounds 24-27
E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1pq_u32(ABCD, E0, TMP0);
TMP0 = vaddq_u32(MSG0, C1);
MSG1 = vsha1su1q_u32(MSG1, MSG0);
MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
// Rounds 28-31
E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1pq_u32(ABCD, E1, TMP1);
TMP1 = vaddq_u32(MSG1, C1);
MSG2 = vsha1su1q_u32(MSG2, MSG1);
MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
// Rounds 32-35
E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1pq_u32(ABCD, E0, TMP0);
TMP0 = vaddq_u32(MSG2, C2);
MSG3 = vsha1su1q_u32(MSG3, MSG2);
MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
// Rounds 36-39
E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1pq_u32(ABCD, E1, TMP1);
TMP1 = vaddq_u32(MSG3, C2);
MSG0 = vsha1su1q_u32(MSG0, MSG3);
MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
// Rounds 40-43
E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1mq_u32(ABCD, E0, TMP0);
TMP0 = vaddq_u32(MSG0, C2);
MSG1 = vsha1su1q_u32(MSG1, MSG0);
MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
// Rounds 44-47
E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1mq_u32(ABCD, E1, TMP1);
TMP1 = vaddq_u32(MSG1, C2);
MSG2 = vsha1su1q_u32(MSG2, MSG1);
MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
// Rounds 48-51
E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1mq_u32(ABCD, E0, TMP0);
TMP0 = vaddq_u32(MSG2, C2);
MSG3 = vsha1su1q_u32(MSG3, MSG2);
MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
// Rounds 52-55
E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1mq_u32(ABCD, E1, TMP1);
TMP1 = vaddq_u32(MSG3, C3);
MSG0 = vsha1su1q_u32(MSG0, MSG3);
MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
// Rounds 56-59
E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1mq_u32(ABCD, E0, TMP0);
TMP0 = vaddq_u32(MSG0, C3);
MSG1 = vsha1su1q_u32(MSG1, MSG0);
MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
// Rounds 60-63
E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1pq_u32(ABCD, E1, TMP1);
TMP1 = vaddq_u32(MSG1, C3);
MSG2 = vsha1su1q_u32(MSG2, MSG1);
MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
// Rounds 64-67
E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1pq_u32(ABCD, E0, TMP0);
TMP0 = vaddq_u32(MSG2, C3);
MSG3 = vsha1su1q_u32(MSG3, MSG2);
MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
// Rounds 68-71
E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1pq_u32(ABCD, E1, TMP1);
TMP1 = vaddq_u32(MSG3, C3);
MSG0 = vsha1su1q_u32(MSG0, MSG3);
// Rounds 72-75
E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1pq_u32(ABCD, E0, TMP0);
// Rounds 76-79
E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
ABCD = vsha1pq_u32(ABCD, E1, TMP1);
E0 += E0_SAVED;
ABCD = vaddq_u32(ABCD_SAVED, ABCD);
data += SHA1::BLOCKSIZE/sizeof(word32);
length -= SHA1::BLOCKSIZE;
}
// Save state
vst1q_u32(&state[0], ABCD);
state[4] = E0;
}
void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
CRYPTOPP_ASSERT(state);
CRYPTOPP_ASSERT(data);
CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
uint32x4_t MSG0, MSG1, MSG2, MSG3;
uint32x4_t TMP0, TMP1, TMP2;
// Load initial values
STATE0 = vld1q_u32(&state[0]);
STATE1 = vld1q_u32(&state[4]);
while (length >= SHA256::BLOCKSIZE)
{
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Load message
MSG0 = vld1q_u32(data + 0);
MSG1 = vld1q_u32(data + 4);
MSG2 = vld1q_u32(data + 8);
MSG3 = vld1q_u32(data + 12);
if (order == BIG_ENDIAN_ORDER) // Data arrangement
{
MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
}
TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
// Rounds 0-3
MSG0 = vsha256su0q_u32(MSG0, MSG1);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
// Rounds 4-7
MSG1 = vsha256su0q_u32(MSG1, MSG2);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
// Rounds 8-11
MSG2 = vsha256su0q_u32(MSG2, MSG3);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
// Rounds 12-15
MSG3 = vsha256su0q_u32(MSG3, MSG0);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
// Rounds 16-19
MSG0 = vsha256su0q_u32(MSG0, MSG1);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
// Rounds 20-23
MSG1 = vsha256su0q_u32(MSG1, MSG2);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
// Rounds 24-27
MSG2 = vsha256su0q_u32(MSG2, MSG3);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
// Rounds 28-31
MSG3 = vsha256su0q_u32(MSG3, MSG0);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
// Rounds 32-35
MSG0 = vsha256su0q_u32(MSG0, MSG1);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
// Rounds 36-39
MSG1 = vsha256su0q_u32(MSG1, MSG2);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
// Rounds 40-43
MSG2 = vsha256su0q_u32(MSG2, MSG3);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
// Rounds 44-47
MSG3 = vsha256su0q_u32(MSG3, MSG0);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
// Rounds 48-51
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
// Rounds 52-55
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
// Rounds 56-59
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
// Rounds 60-63
TMP2 = STATE0;
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
// Add back to state
STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
data += SHA256::BLOCKSIZE/sizeof(word32);
length -= SHA256::BLOCKSIZE;
}
// Save state
vst1q_u32(&state[0], STATE0);
vst1q_u32(&state[4], STATE1);
}
#endif
///////////////////////////////////////////////////////
// end of Walton/Schneiders/O'Rourke/Hovsmith's code //
///////////////////////////////////////////////////////
NAMESPACE_END
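The sha.cpp side of this change is suppressed below, but the dispatch it performs can be inferred from the routines defined above: probe once, then hand whole blocks to the fastest backend. A sketch, hedged because the actual call sites are not shown here:

// Inside a SHA-256 block-processing routine (illustrative only)
#if CRYPTOPP_SHANI_AVAILABLE
    if (HasSHA())
        return SHA256_HashMultipleBlocks_SHANI(state, data, length, BIG_ENDIAN_ORDER);
#endif
#if CRYPTOPP_ARM_SHA_AVAILABLE
    if (HasSHA2())
        return SHA256_HashMultipleBlocks_ARMV8(state, data, length, BIG_ENDIAN_ORDER);
#endif
    // otherwise fall through to the portable C++ rounds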

sha.cpp

File diff suppressed because it is too large.

shacal2-simd.cpp Normal file

@@ -0,0 +1,111 @@
// shacal2-simd.cpp - written and placed in the public domain by
// Jeffrey Walton and Jack Lloyd
//
// Jack Lloyd and the Botan team allowed Crypto++ to use parts of
// Botan's implementation under the same license as Crypto++
// is released. The code for SHACAL2_Enc_ProcessAndXorBlock_SHANI
// below is Botan's x86_encrypt_blocks with minor tweaks. Many thanks
// to the Botan team. Also see http://github.com/randombit/botan/.
//
// This source file uses intrinsics to gain access to SHA-NI and
// ARMv8a SHA instructions. A separate source file is needed because
// additional CXXFLAGS are required to enable the appropriate instruction
// sets in some build configurations.
#include "pch.h"
#include "config.h"
#include "sha.h"
#include "misc.h"
// Clang and GCC hoops...
#if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER))
# undef CRYPTOPP_ARM_SHA_AVAILABLE
#endif
#if (CRYPTOPP_SHANI_AVAILABLE)
# include "nmmintrin.h"
# include "immintrin.h"
#endif
#if (CRYPTOPP_ARM_SHA_AVAILABLE)
# include "arm_neon.h"
#endif
// Don't include <arm_acle.h> when using Apple Clang. Early Apple compilers
// fail to compile with <arm_acle.h> included. Later Apple compilers compile
// intrinsics without <arm_acle.h> included.
#if (CRYPTOPP_ARM_SHA_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
# include "arm_acle.h"
#endif
// Clang __m128i casts
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
NAMESPACE_BEGIN(CryptoPP)
#if CRYPTOPP_SHANI_AVAILABLE
void SHACAL2_Enc_ProcessAndXorBlock_SHANI(const word32* subKeys, const byte *inBlock, const byte *xorBlock, byte *outBlock)
{
CRYPTOPP_ASSERT(subKeys);
CRYPTOPP_ASSERT(inBlock);
CRYPTOPP_ASSERT(outBlock);
const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
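// The loads and shuffles below byte-reverse the big-endian input words and
// pack the eight state words into the two-register lane order that
// _mm_sha256rnds2_epu32 consumes; the unpack/shuffle pair after the round
// loop restores the normal block layout.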
__m128i B0 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 0)), MASK1);
__m128i B1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 16)), MASK2);
__m128i TMP = _mm_alignr_epi8(B0, B1, 8);
B1 = _mm_blend_epi16(B1, B0, 0xF0);
B0 = TMP;
#if 0
// SSE2 + SSSE3, but 0.2 cpb slower on a Celeron J3455
const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
__m128i B0 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 0));
__m128i B1 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 16));
__m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2);
B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2);
B0 = TMP;
#endif
const byte* keys = reinterpret_cast<const byte*>(subKeys);
for (size_t i = 0; i != 8; ++i)
{
const __m128i RK0 = _mm_load_si128(CONST_M128_CAST(keys + 32*i));
const __m128i RK2 = _mm_load_si128(CONST_M128_CAST(keys + 32*i+16));
const __m128i RK1 = _mm_srli_si128(RK0, 8);
const __m128i RK3 = _mm_srli_si128(RK2, 8);
B1 = _mm_sha256rnds2_epu32(B1, B0, RK0);
B0 = _mm_sha256rnds2_epu32(B0, B1, RK1);
B1 = _mm_sha256rnds2_epu32(B1, B0, RK2);
B0 = _mm_sha256rnds2_epu32(B0, B1, RK3);
}
TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1);
B1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1);
B0 = TMP;
if (xorBlock)
{
_mm_storeu_si128(M128_CAST(outBlock + 0),
_mm_xor_si128(B0, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 0))));
_mm_storeu_si128(M128_CAST(outBlock + 16),
_mm_xor_si128(B1, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 16))));
}
else
{
_mm_storeu_si128(M128_CAST(outBlock + 0), B0);
_mm_storeu_si128(M128_CAST(outBlock + 16), B1);
}
}
#endif
NAMESPACE_END
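Because SHACAL-2 is exposed as an ordinary 32-byte block cipher, the accelerated ProcessAndXorBlock above is picked up transparently by the usual mode templates. A minimal usage sketch; the key and data are placeholders:

#include "shacal2.h"
#include "modes.h"

byte key[16] = {0}, in[32] = {0}, out[32];
ECB_Mode<SHACAL2>::Encryption enc(key, sizeof(key));
enc.ProcessData(out, in, sizeof(in));  // one 256-bit block; SHA-NI path when HasSHA()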

shacal2.cpp

@@ -14,87 +14,12 @@
#include "pch.h"
#include "config.h"
#include "shacal2.h"
#include "cpu.h"
#include "misc.h"
#include "cpu.h"
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
#include <immintrin.h>
#endif
// Clang __m128i casts
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
NAMESPACE_BEGIN(CryptoPP)
ANONYMOUS_NAMESPACE_BEGIN
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
void SHACAL2_Enc_ProcessAndXorBlock_SHANI(const word32* subKeys, const byte *inBlock, const byte *xorBlock, byte *outBlock)
{
CRYPTOPP_ASSERT(subKeys);
CRYPTOPP_ASSERT(inBlock);
CRYPTOPP_ASSERT(outBlock);
const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
__m128i B0 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 0)), MASK1);
__m128i B1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 16)), MASK2);
__m128i TMP = _mm_alignr_epi8(B0, B1, 8);
B1 = _mm_blend_epi16(B1, B0, 0xF0);
B0 = TMP;
#if 0
// SSE2 + SSSE3, but 0.2 cpb slower on a Celeron J3455
const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
__m128i B0 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 0));
__m128i B1 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 16));
__m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2);
B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2);
B0 = TMP;
#endif
const byte* keys = reinterpret_cast<const byte*>(subKeys);
for (size_t i = 0; i != 8; ++i)
{
const __m128i RK0 = _mm_load_si128(CONST_M128_CAST(keys + 32*i));
const __m128i RK2 = _mm_load_si128(CONST_M128_CAST(keys + 32*i+16));
const __m128i RK1 = _mm_srli_si128(RK0, 8);
const __m128i RK3 = _mm_srli_si128(RK2, 8);
B1 = _mm_sha256rnds2_epu32(B1, B0, RK0);
B0 = _mm_sha256rnds2_epu32(B0, B1, RK1);
B1 = _mm_sha256rnds2_epu32(B1, B0, RK2);
B0 = _mm_sha256rnds2_epu32(B0, B1, RK3);
}
TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1);
B1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1);
B0 = TMP;
if (xorBlock)
{
_mm_storeu_si128(M128_CAST(outBlock + 0),
_mm_xor_si128(B0, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 0))));
_mm_storeu_si128(M128_CAST(outBlock + 16),
_mm_xor_si128(B1, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 16))));
}
else
{
_mm_storeu_si128(M128_CAST(outBlock + 0), B0);
_mm_storeu_si128(M128_CAST(outBlock + 16), B1);
}
}
#endif
ANONYMOUS_NAMESPACE_END
// SHACAL-2 function and round definitions
#define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22))
@@ -115,6 +40,11 @@ ANONYMOUS_NAMESPACE_END
#define P(a,b,c,d,e,f,g,h,k) \
h-=S0(a)+Maj(a,b,c);d-=h;h-=S1(e)+Ch(e,f,g)+*--k;
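// P runs one forward round backwards: the forward round computes
//   h += S1(e)+Ch(e,f,g)+*k++;  d += h;  h += S0(a)+Maj(a,b,c);
// so P subtracts the same terms in the opposite order while walking the
// key schedule in reverse with *--k.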
#if CRYPTOPP_SHANI_AVAILABLE
extern void SHACAL2_Enc_ProcessAndXorBlock_SHANI(const word32* subKeys,
const byte *inBlock, const byte *xorBlock, byte *outBlock);
#endif
void SHACAL2::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
AssertValidKeyLength(keylen);
@@ -138,7 +68,7 @@ typedef BlockGetAndPut<word32, BigEndian> Block;
void SHACAL2::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_SHANI_AVAILABLE
if (HasSHA())
{
SHACAL2_Enc_ProcessAndXorBlock_SHANI(m_key, inBlock, xorBlock, outBlock);


@@ -344,7 +344,8 @@ bool TestSettings()
#ifdef CRYPTOPP_CPUID_AVAILABLE
bool hasSSE2 = HasSSE2();
bool hasSSSE3 = HasSSSE3();
bool hasSSE41 = HasSSE41();
bool hasSSE42 = HasSSE42();
bool isP4 = IsP4();
int cacheLineSize = GetCacheLineSize();
@@ -356,21 +357,21 @@ bool TestSettings()
else
std::cout << "passed: ";
std::cout << "hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasSSE4 == " << hasSSE4;
std::cout << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL();
std::cout << ", hasRDRAND == " << HasRDRAND() << ", hasRDSEED == " << HasRDSEED();
std::cout << "hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasSSE4.1 == " << hasSSE41 << ", hasSSE4.2 == " << hasSSE42;
std::cout << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL() << ", hasRDRAND == " << HasRDRAND() << ", hasRDSEED == " << HasRDSEED();
std::cout << ", hasSHA == " << HasSHA() << ", isP4 == " << isP4 << ", cacheLineSize == " << cacheLineSize << std::endl;
#elif (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
bool hasNEON = HasNEON();
bool hasCRC32 = HasCRC32();
bool hasPMULL = HasPMULL();
bool hasAES = HasAES();
bool hasSHA1 = HasSHA1();
bool hasSHA2 = HasSHA2();
std::cout << "passed: ";
std::cout << "hasNEON == " << hasNEON << ", hasPMULL == " << hasPMULL << ", hasCRC32 == " << hasCRC32 << ", hasAES == " << hasAES << ", hasSHA1 == " << hasSHA1 << ", hasSHA2 == " << hasSHA2 << std::endl;
std::cout << "hasNEON == " << hasNEON << ", hasCRC32 == " << hasCRC32 << ", hasPMULL == " << hasPMULL;
std::cout << ", hasAES == " << hasAES << ", hasSHA1 == " << hasSHA1 << ", hasSHA2 == " << hasSHA2 << std::endl;
#endif
if (!pass)

whirlpool.cpp

@@ -409,7 +409,7 @@ static const word64 Whirlpool_C[4*256+R] = {
void Whirlpool::Transform(word64 *digest, const word64 *block)
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2())
{
// MMX version has the same structure as C version below
#ifdef __GNUC__