From 5250ab2bf2b0e140cd3374db73319a87797905d0 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sat, 30 Sep 2023 03:11:15 -0400 Subject: [PATCH] Remove ARIA SIMD code (GH #1235) ARIA SIMD code existed to perform an XOR and the end of encryption and decryption. It was a lot of work to save for the final XOR. Worse, the final XOR seemed to be causing problems as described in GH #1235. Once we unrolled the XOR and used them when building outBlock, the 1235 issue went away. --- Filelist.txt | 1 - GNUmakefile | 4 - GNUmakefile-cross | 4 - TestScripts/cryptest-android.sh | 4 +- TestScripts/cryptest-ios.sh | 4 +- TestScripts/cryptest.sh | 2 +- aria.cpp | 84 ++++---------- aria_simd.cpp | 194 -------------------------------- cryptest.nmake | 4 +- cryptlib.vcxproj | 1 - cryptlib.vcxproj.filters | 3 - 11 files changed, 28 insertions(+), 277 deletions(-) delete mode 100644 aria_simd.cpp diff --git a/Filelist.txt b/Filelist.txt index 410613a8..8cd703c1 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -17,7 +17,6 @@ arc4.cpp arc4.h ariatab.cpp aria.cpp -aria_simd.cpp aria.h argnames.h arm_simd.h diff --git a/GNUmakefile b/GNUmakefile index 84ac9353..fbc69a2a 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -1635,10 +1635,6 @@ NOSTD_CXXFLAGS=$(filter-out -stdlib=%,$(filter-out -std=%,$(CXXFLAGS))) aes_armv4.o : aes_armv4.S $(CXX) $(strip $(CPPFLAGS) $(ASFLAGS) $(NOSTD_CXXFLAGS) $(CRYPTOGAMS_ARM_THUMB_FLAG) -c) $< -# SSSE3 or NEON available -aria_simd.o : aria_simd.cpp - $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(ARIA_FLAG) -c) $< - # SSE, NEON or POWER7 available blake2s_simd.o : blake2s_simd.cpp $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(BLAKE2S_FLAG) -c) $< diff --git a/GNUmakefile-cross b/GNUmakefile-cross index 669dc52c..13d650dd 100644 --- a/GNUmakefile-cross +++ b/GNUmakefile-cross @@ -975,10 +975,6 @@ aes_armv4.o : aes_armv4.S cpu-features.o: cpu-features.h cpu-features.c $(CXX) -x c $(strip $(CPPFLAGS) $(NOSTD_CXXFLAGS) -c) cpu-features.c -# SSSE3 or NEON available -aria_simd.o : aria_simd.cpp - $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(ARIA_FLAG) -c) $< - # SSE, NEON or POWER7 available blake2s_simd.o : blake2s_simd.cpp $(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(BLAKE2S_FLAG) -c) $< diff --git a/TestScripts/cryptest-android.sh b/TestScripts/cryptest-android.sh index ace9c4ba..6f8b976f 100755 --- a/TestScripts/cryptest-android.sh +++ b/TestScripts/cryptest-android.sh @@ -120,7 +120,7 @@ do # In the past we looked for the vector loads, stores and shifts using vld and friends. # It looks like objdump changed its output format on Android after Clang, so we need # to check for statements like eor v0.16b, v2.16b, v0.16b nowadays. - count=$(${OBJDUMP} --disassemble aria_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor|v0\.|v1\.|v2\.|v3\.|v4\.|v5\.|v6\.|v7\.') + count=$(${OBJDUMP} --disassemble chacha_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor|v0\.|v1\.|v2\.|v3\.|v4\.|v5\.|v6\.|v7\.') if [[ "${count}" -gt 64 ]] then echo "${platform} : NEON ==> SUCCESS" >> "${TMPDIR}/build.log" @@ -136,7 +136,7 @@ do # In the past we looked for the vector loads, stores and shifts using vld and friends. # It looks like objdump changed its output format on Android after Clang, so we need # to check for statements like eor v0.16b, v2.16b, v0.16b nowadays. - count=$(${OBJDUMP} --disassemble aria_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor|v0\.|v1\.|v2\.|v3\.|v4\.|v5\.|v6\.|v7\.') + count=$(${OBJDUMP} --disassemble chacha_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor|v0\.|v1\.|v2\.|v3\.|v4\.|v5\.|v6\.|v7\.') if [[ "${count}" -gt 64 ]] then echo "${platform} : ASIMD ==> SUCCESS" >> "${TMPDIR}/build.log" diff --git a/TestScripts/cryptest-ios.sh b/TestScripts/cryptest-ios.sh index 3076111d..9299b84e 100755 --- a/TestScripts/cryptest-ios.sh +++ b/TestScripts/cryptest-ios.sh @@ -109,7 +109,7 @@ do then # Test NEON code generation - count=$(otool -tV aria_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor') + count=$(otool -tV chacha_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor') if [[ "${count}" -gt 64 ]] then echo "${platform} : NEON ==> SUCCESS" >> "${TMPDIR}/build.log" @@ -122,7 +122,7 @@ do then # Test ASIMD code generation - count=$(otool -tV aria_simd.o 2>&1 | grep -c -E 'ldr[[:space:]]*q|str[[:space:]]*q|shl.4|shr.4|eor.16') + count=$(otool -tV chacha_simd.o 2>&1 | grep -c -E 'ldr[[:space:]]*q|str[[:space:]]*q|shl.4|shr.4|eor.16') if [[ "${count}" -gt 64 ]] then echo "${platform} : ASIMD ==> SUCCESS" >> "${TMPDIR}/build.log" diff --git a/TestScripts/cryptest.sh b/TestScripts/cryptest.sh index 8b40f57b..8e364afa 100755 --- a/TestScripts/cryptest.sh +++ b/TestScripts/cryptest.sh @@ -1645,7 +1645,7 @@ if [[ ("$HAVE_DISASS" -ne 0 && ("$IS_ARM32" -ne 0 || "$IS_ARM64" -ne 0)) ]]; the TEST_LIST+=("ARM NEON code generation") - OBJFILE=aria_simd.o; rm -f "$OBJFILE" 2>/dev/null + OBJFILE=chacha_simd.o; rm -f "$OBJFILE" 2>/dev/null CXX="${CXX}" CXXFLAGS="$RELEASE_CXXFLAGS" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 diff --git a/aria.cpp b/aria.cpp index a5b17433..d5a84fc9 100644 --- a/aria.cpp +++ b/aria.cpp @@ -7,14 +7,6 @@ #include "misc.h" #include "cpu.h" -#if CRYPTOPP_SSE2_INTRIN_AVAILABLE -# define CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS 1 -#endif - -#if CRYPTOPP_SSSE3_AVAILABLE -# define CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS 1 -#endif - NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(ARIATab) @@ -97,15 +89,6 @@ inline void ARIA_FE(word32 t[4]) { ARIA_MM(t[0],t[1],t[2],t[3]); } -#if (CRYPTOPP_ARM_NEON_AVAILABLE) -extern void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen); -extern void ARIA_ProcessAndXorBlock_NEON(const byte* xorBlock, byte* outblock, const byte *rk, word32 *t); -#endif - -#if (CRYPTOPP_SSSE3_AVAILABLE) -extern void ARIA_ProcessAndXorBlock_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t); -#endif - // n-bit right shift of Y XORed to X template inline void ARIA_GSRK(const word32 X[4], const word32 Y[4], byte RK[16]) @@ -190,38 +173,29 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3]; -#if CRYPTOPP_ARM_NEON_AVAILABLE - if (HasNEON()) - { - ARIA_UncheckedSetKey_Schedule_NEON(rk, m_w, keylen); - } - else -#endif // CRYPTOPP_ARM_NEON_AVAILABLE - { - ARIA_GSRK<19>(w0, w1, rk + 0); - ARIA_GSRK<19>(w1, w2, rk + 16); - ARIA_GSRK<19>(w2, w3, rk + 32); - ARIA_GSRK<19>(w3, w0, rk + 48); - ARIA_GSRK<31>(w0, w1, rk + 64); - ARIA_GSRK<31>(w1, w2, rk + 80); - ARIA_GSRK<31>(w2, w3, rk + 96); - ARIA_GSRK<31>(w3, w0, rk + 112); - ARIA_GSRK<67>(w0, w1, rk + 128); - ARIA_GSRK<67>(w1, w2, rk + 144); - ARIA_GSRK<67>(w2, w3, rk + 160); - ARIA_GSRK<67>(w3, w0, rk + 176); - ARIA_GSRK<97>(w0, w1, rk + 192); + ARIA_GSRK<19>(w0, w1, rk + 0); + ARIA_GSRK<19>(w1, w2, rk + 16); + ARIA_GSRK<19>(w2, w3, rk + 32); + ARIA_GSRK<19>(w3, w0, rk + 48); + ARIA_GSRK<31>(w0, w1, rk + 64); + ARIA_GSRK<31>(w1, w2, rk + 80); + ARIA_GSRK<31>(w2, w3, rk + 96); + ARIA_GSRK<31>(w3, w0, rk + 112); + ARIA_GSRK<67>(w0, w1, rk + 128); + ARIA_GSRK<67>(w1, w2, rk + 144); + ARIA_GSRK<67>(w2, w3, rk + 160); + ARIA_GSRK<67>(w3, w0, rk + 176); + ARIA_GSRK<97>(w0, w1, rk + 192); - if (keylen > 16) + if (keylen > 16) + { + ARIA_GSRK<97>(w1, w2, rk + 208); + ARIA_GSRK<97>(w2, w3, rk + 224); + + if (keylen > 24) { - ARIA_GSRK<97>(w1, w2, rk + 208); - ARIA_GSRK<97>(w2, w3, rk + 224); - - if (keylen > 24) - { - ARIA_GSRK< 97>(w3, w0, rk + 240); - ARIA_GSRK<109>(w0, w1, rk + 256); - } + ARIA_GSRK< 97>(w3, w0, rk + 240); + ARIA_GSRK<109>(w0, w1, rk + 256); } } @@ -293,22 +267,6 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b rk = ARIA_KXL(rk, t); ARIA_FO(t); rk = ARIA_KXL(rk, t); ARIA_FE(t); rk = ARIA_KXL(rk, t); ARIA_FO(t); rk = ARIA_KXL(rk, t); -#if CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS - if (HasSSSE3()) - { - ARIA_ProcessAndXorBlock_SSSE3(xorBlock, outBlock, rk, t); - return; - } - else -#endif // CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS -#if (CRYPTOPP_ARM_NEON_AVAILABLE) - if (HasNEON()) - { - ARIA_ProcessAndXorBlock_NEON(xorBlock, outBlock, rk, t); - return; - } - else -#endif // CRYPTOPP_ARM_NEON_AVAILABLE #if (CRYPTOPP_LITTLE_ENDIAN) { outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3]; diff --git a/aria_simd.cpp b/aria_simd.cpp deleted file mode 100644 index 56265e0e..00000000 --- a/aria_simd.cpp +++ /dev/null @@ -1,194 +0,0 @@ -// aria_simd.cpp - written and placed in the public domain by -// Jeffrey Walton, Uri Blumenthal and Marcel Raad. -// -// This source file uses intrinsics to gain access to ARMv7a and -// ARMv8a NEON instructions. A separate source file is needed -// because additional CXXFLAGS are required to enable the -// appropriate instructions sets in some build configurations. - -#include "pch.h" -#include "config.h" -#include "misc.h" - -#if (CRYPTOPP_SSSE3_AVAILABLE) -# include -#endif - -#if (CRYPTOPP_ARM_NEON_HEADER) -# include -#endif - -#if (CRYPTOPP_ARM_ACLE_HEADER) -# include -# include -#endif - -// Squash MS LNK4221 and libtool warnings -extern const char ARIA_SIMD_FNAME[] = __FILE__; - -NAMESPACE_BEGIN(CryptoPP) -NAMESPACE_BEGIN(ARIATab) - -extern const word32 S1[256]; -extern const word32 S2[256]; -extern const word32 X1[256]; -extern const word32 X2[256]; -extern const word32 KRK[3][4]; - -NAMESPACE_END -NAMESPACE_END - -ANONYMOUS_NAMESPACE_BEGIN - -using CryptoPP::byte; -using CryptoPP::word32; - -inline byte ARIA_BRF(const word32 x, const int y) { - return static_cast(GETBYTE(x, y)); -} - -ANONYMOUS_NAMESPACE_END - -NAMESPACE_BEGIN(CryptoPP) - -using CryptoPP::ARIATab::S1; -using CryptoPP::ARIATab::S2; -using CryptoPP::ARIATab::X1; -using CryptoPP::ARIATab::X2; -using CryptoPP::ARIATab::KRK; - -#if (CRYPTOPP_ARM_NEON_AVAILABLE) - -template -inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16]) -{ - enum { Q1 = (4-(N/32)) % 4, - Q2 = (3-(N/32)) % 4, - R = N % 32 - }; - - vst1q_u8(RK, vreinterpretq_u8_u32( - veorq_u32(X, veorq_u32( - vshrq_n_u32(vextq_u32(Y, Y, Q1), R), - vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R))))); -} - -void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen) -{ - const uint32x4_t w0 = vld1q_u32(ws+ 0); - const uint32x4_t w1 = vld1q_u32(ws+ 8); - const uint32x4_t w2 = vld1q_u32(ws+12); - const uint32x4_t w3 = vld1q_u32(ws+16); - - ARIA_GSRK_NEON<19>(w0, w1, rk + 0); - ARIA_GSRK_NEON<19>(w1, w2, rk + 16); - ARIA_GSRK_NEON<19>(w2, w3, rk + 32); - ARIA_GSRK_NEON<19>(w3, w0, rk + 48); - ARIA_GSRK_NEON<31>(w0, w1, rk + 64); - ARIA_GSRK_NEON<31>(w1, w2, rk + 80); - ARIA_GSRK_NEON<31>(w2, w3, rk + 96); - ARIA_GSRK_NEON<31>(w3, w0, rk + 112); - ARIA_GSRK_NEON<67>(w0, w1, rk + 128); - ARIA_GSRK_NEON<67>(w1, w2, rk + 144); - ARIA_GSRK_NEON<67>(w2, w3, rk + 160); - ARIA_GSRK_NEON<67>(w3, w0, rk + 176); - ARIA_GSRK_NEON<97>(w0, w1, rk + 192); - - if (keylen > 16) - { - ARIA_GSRK_NEON<97>(w1, w2, rk + 208); - ARIA_GSRK_NEON<97>(w2, w3, rk + 224); - - if (keylen > 24) - { - ARIA_GSRK_NEON< 97>(w3, w0, rk + 240); - ARIA_GSRK_NEON<109>(w0, w1, rk + 256); - } - } -} - -void ARIA_ProcessAndXorBlock_NEON(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t) -{ - outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ); - outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8); - outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ); - outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ); - outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ); - outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8); - outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ); - outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ); - outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ); - outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8); - outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] ); - outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] ); - outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] ); - outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8); - outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ); - outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ); - - // 'outBlock' and 'xorBlock' may be unaligned. - if (xorBlock != NULLPTR) - { - vst1q_u8(outBlock, - veorq_u8( - vld1q_u8(xorBlock), - veorq_u8( - vld1q_u8(outBlock), - vrev32q_u8(vld1q_u8((rk)))))); - } - else - { - vst1q_u8(outBlock, - veorq_u8( - vld1q_u8(outBlock), - vrev32q_u8(vld1q_u8(rk)))); - } -} - -#endif // CRYPTOPP_ARM_NEON_AVAILABLE - -#if (CRYPTOPP_SSSE3_AVAILABLE) - -void ARIA_ProcessAndXorBlock_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t) -{ - const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); - - outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ); - outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8); - outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ); - outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ); - outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ); - outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8); - outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ); - outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ); - outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ); - outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8); - outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] ); - outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] ); - outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] ); - outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8); - outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ); - outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ); - - // 'outBlock' and 'xorBlock' may be unaligned. - if (xorBlock != NULLPTR) - { - _mm_storeu_si128(M128_CAST(outBlock), - _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(xorBlock)), - _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(outBlock)), - _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK))) - ); - } - else - { - _mm_storeu_si128(M128_CAST(outBlock), - _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(outBlock)), - _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK))); - } -} - -#endif // CRYPTOPP_SSSE3_AVAILABLE - -NAMESPACE_END diff --git a/cryptest.nmake b/cryptest.nmake index b321bf76..d04a2479 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -57,7 +57,7 @@ LIB_SRCS = \ cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp \ - algparam.cpp allocate.cpp arc4.cpp aria.cpp aria_simd.cpp ariatab.cpp \ + algparam.cpp allocate.cpp arc4.cpp aria.cpp ariatab.cpp \ asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp \ blake2.cpp blake2b_simd.cpp blake2s_simd.cpp blowfish.cpp blumshub.cpp \ camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp \ @@ -88,7 +88,7 @@ LIB_SRCS = \ LIB_OBJS = \ cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj \ - algparam.obj allocate.obj arc4.obj aria.obj aria_simd.obj ariatab.obj \ + algparam.obj allocate.obj arc4.obj aria.obj ariatab.obj \ asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj \ blake2.obj blake2b_simd.obj blake2s_simd.obj blowfish.obj blumshub.obj \ camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj \ diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index e219cff3..f99661d0 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -174,7 +174,6 @@ - diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index 46007144..ef6f3c53 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -35,9 +35,6 @@ Source Files - - Source Files - Source Files