Remove ARIA SIMD code (GH #1235)

ARIA SIMD code existed to perform an XOR and the end of encryption and decryption. It was a lot of work to save for the final XOR.
Worse, the final XOR seemed to be causing problems as described in GH #1235. Once we unrolled the XOR and used them when building outBlock, the 1235 issue went away.
This commit is contained in:
Jeffrey Walton 2023-09-30 03:11:15 -04:00
parent 8d3e357c9c
commit 5250ab2bf2
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
11 changed files with 28 additions and 277 deletions

View File

@ -17,7 +17,6 @@ arc4.cpp
arc4.h
ariatab.cpp
aria.cpp
aria_simd.cpp
aria.h
argnames.h
arm_simd.h

View File

@ -1635,10 +1635,6 @@ NOSTD_CXXFLAGS=$(filter-out -stdlib=%,$(filter-out -std=%,$(CXXFLAGS)))
aes_armv4.o : aes_armv4.S
$(CXX) $(strip $(CPPFLAGS) $(ASFLAGS) $(NOSTD_CXXFLAGS) $(CRYPTOGAMS_ARM_THUMB_FLAG) -c) $<
# SSSE3 or NEON available
aria_simd.o : aria_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(ARIA_FLAG) -c) $<
# SSE, NEON or POWER7 available
blake2s_simd.o : blake2s_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(BLAKE2S_FLAG) -c) $<

View File

@ -975,10 +975,6 @@ aes_armv4.o : aes_armv4.S
cpu-features.o: cpu-features.h cpu-features.c
$(CXX) -x c $(strip $(CPPFLAGS) $(NOSTD_CXXFLAGS) -c) cpu-features.c
# SSSE3 or NEON available
aria_simd.o : aria_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(ARIA_FLAG) -c) $<
# SSE, NEON or POWER7 available
blake2s_simd.o : blake2s_simd.cpp
$(CXX) $(strip $(CPPFLAGS) $(CXXFLAGS) $(BLAKE2S_FLAG) -c) $<

View File

@ -120,7 +120,7 @@ do
# In the past we looked for the vector loads, stores and shifts using vld and friends.
# It looks like objdump changed its output format on Android after Clang, so we need
# to check for statements like eor v0.16b, v2.16b, v0.16b nowadays.
count=$(${OBJDUMP} --disassemble aria_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor|v0\.|v1\.|v2\.|v3\.|v4\.|v5\.|v6\.|v7\.')
count=$(${OBJDUMP} --disassemble chacha_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor|v0\.|v1\.|v2\.|v3\.|v4\.|v5\.|v6\.|v7\.')
if [[ "${count}" -gt 64 ]]
then
echo "${platform} : NEON ==> SUCCESS" >> "${TMPDIR}/build.log"
@ -136,7 +136,7 @@ do
# In the past we looked for the vector loads, stores and shifts using vld and friends.
# It looks like objdump changed its output format on Android after Clang, so we need
# to check for statements like eor v0.16b, v2.16b, v0.16b nowadays.
count=$(${OBJDUMP} --disassemble aria_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor|v0\.|v1\.|v2\.|v3\.|v4\.|v5\.|v6\.|v7\.')
count=$(${OBJDUMP} --disassemble chacha_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor|v0\.|v1\.|v2\.|v3\.|v4\.|v5\.|v6\.|v7\.')
if [[ "${count}" -gt 64 ]]
then
echo "${platform} : ASIMD ==> SUCCESS" >> "${TMPDIR}/build.log"

View File

@ -109,7 +109,7 @@ do
then
# Test NEON code generation
count=$(otool -tV aria_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor')
count=$(otool -tV chacha_simd.o 2>&1 | grep -c -E 'vld|vst|vshl|vshr|veor')
if [[ "${count}" -gt 64 ]]
then
echo "${platform} : NEON ==> SUCCESS" >> "${TMPDIR}/build.log"
@ -122,7 +122,7 @@ do
then
# Test ASIMD code generation
count=$(otool -tV aria_simd.o 2>&1 | grep -c -E 'ldr[[:space:]]*q|str[[:space:]]*q|shl.4|shr.4|eor.16')
count=$(otool -tV chacha_simd.o 2>&1 | grep -c -E 'ldr[[:space:]]*q|str[[:space:]]*q|shl.4|shr.4|eor.16')
if [[ "${count}" -gt 64 ]]
then
echo "${platform} : ASIMD ==> SUCCESS" >> "${TMPDIR}/build.log"

View File

@ -1645,7 +1645,7 @@ if [[ ("$HAVE_DISASS" -ne 0 && ("$IS_ARM32" -ne 0 || "$IS_ARM64" -ne 0)) ]]; the
TEST_LIST+=("ARM NEON code generation")
OBJFILE=aria_simd.o; rm -f "$OBJFILE" 2>/dev/null
OBJFILE=chacha_simd.o; rm -f "$OBJFILE" 2>/dev/null
CXX="${CXX}" CXXFLAGS="$RELEASE_CXXFLAGS" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
COUNT=0

View File

@ -7,14 +7,6 @@
#include "misc.h"
#include "cpu.h"
#if CRYPTOPP_SSE2_INTRIN_AVAILABLE
# define CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS 1
#endif
#if CRYPTOPP_SSSE3_AVAILABLE
# define CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS 1
#endif
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(ARIATab)
@ -97,15 +89,6 @@ inline void ARIA_FE(word32 t[4]) {
ARIA_MM(t[0],t[1],t[2],t[3]);
}
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
extern void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen);
extern void ARIA_ProcessAndXorBlock_NEON(const byte* xorBlock, byte* outblock, const byte *rk, word32 *t);
#endif
#if (CRYPTOPP_SSSE3_AVAILABLE)
extern void ARIA_ProcessAndXorBlock_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t);
#endif
// n-bit right shift of Y XORed to X
template <unsigned int N>
inline void ARIA_GSRK(const word32 X[4], const word32 Y[4], byte RK[16])
@ -190,38 +173,29 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
#if CRYPTOPP_ARM_NEON_AVAILABLE
if (HasNEON())
{
ARIA_UncheckedSetKey_Schedule_NEON(rk, m_w, keylen);
}
else
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
{
ARIA_GSRK<19>(w0, w1, rk + 0);
ARIA_GSRK<19>(w1, w2, rk + 16);
ARIA_GSRK<19>(w2, w3, rk + 32);
ARIA_GSRK<19>(w3, w0, rk + 48);
ARIA_GSRK<31>(w0, w1, rk + 64);
ARIA_GSRK<31>(w1, w2, rk + 80);
ARIA_GSRK<31>(w2, w3, rk + 96);
ARIA_GSRK<31>(w3, w0, rk + 112);
ARIA_GSRK<67>(w0, w1, rk + 128);
ARIA_GSRK<67>(w1, w2, rk + 144);
ARIA_GSRK<67>(w2, w3, rk + 160);
ARIA_GSRK<67>(w3, w0, rk + 176);
ARIA_GSRK<97>(w0, w1, rk + 192);
ARIA_GSRK<19>(w0, w1, rk + 0);
ARIA_GSRK<19>(w1, w2, rk + 16);
ARIA_GSRK<19>(w2, w3, rk + 32);
ARIA_GSRK<19>(w3, w0, rk + 48);
ARIA_GSRK<31>(w0, w1, rk + 64);
ARIA_GSRK<31>(w1, w2, rk + 80);
ARIA_GSRK<31>(w2, w3, rk + 96);
ARIA_GSRK<31>(w3, w0, rk + 112);
ARIA_GSRK<67>(w0, w1, rk + 128);
ARIA_GSRK<67>(w1, w2, rk + 144);
ARIA_GSRK<67>(w2, w3, rk + 160);
ARIA_GSRK<67>(w3, w0, rk + 176);
ARIA_GSRK<97>(w0, w1, rk + 192);
if (keylen > 16)
if (keylen > 16)
{
ARIA_GSRK<97>(w1, w2, rk + 208);
ARIA_GSRK<97>(w2, w3, rk + 224);
if (keylen > 24)
{
ARIA_GSRK<97>(w1, w2, rk + 208);
ARIA_GSRK<97>(w2, w3, rk + 224);
if (keylen > 24)
{
ARIA_GSRK< 97>(w3, w0, rk + 240);
ARIA_GSRK<109>(w0, w1, rk + 256);
}
ARIA_GSRK< 97>(w3, w0, rk + 240);
ARIA_GSRK<109>(w0, w1, rk + 256);
}
}
@ -293,22 +267,6 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b
rk = ARIA_KXL(rk, t); ARIA_FO(t); rk = ARIA_KXL(rk, t); ARIA_FE(t);
rk = ARIA_KXL(rk, t); ARIA_FO(t); rk = ARIA_KXL(rk, t);
#if CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS
if (HasSSSE3())
{
ARIA_ProcessAndXorBlock_SSSE3(xorBlock, outBlock, rk, t);
return;
}
else
#endif // CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
{
ARIA_ProcessAndXorBlock_NEON(xorBlock, outBlock, rk, t);
return;
}
else
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_LITTLE_ENDIAN)
{
outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3];

View File

@ -1,194 +0,0 @@
// aria_simd.cpp - written and placed in the public domain by
// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
// This source file uses intrinsics to gain access to ARMv7a and
// ARMv8a NEON instructions. A separate source file is needed
// because additional CXXFLAGS are required to enable the
// appropriate instructions sets in some build configurations.
#include "pch.h"
#include "config.h"
#include "misc.h"
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <tmmintrin.h>
#endif
#if (CRYPTOPP_ARM_NEON_HEADER)
# include <arm_neon.h>
#endif
#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif
// Squash MS LNK4221 and libtool warnings
extern const char ARIA_SIMD_FNAME[] = __FILE__;
NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(ARIATab)
extern const word32 S1[256];
extern const word32 S2[256];
extern const word32 X1[256];
extern const word32 X2[256];
extern const word32 KRK[3][4];
NAMESPACE_END
NAMESPACE_END
ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::byte;
using CryptoPP::word32;
inline byte ARIA_BRF(const word32 x, const int y) {
return static_cast<byte>(GETBYTE(x, y));
}
ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
using CryptoPP::ARIATab::S1;
using CryptoPP::ARIATab::S2;
using CryptoPP::ARIATab::X1;
using CryptoPP::ARIATab::X2;
using CryptoPP::ARIATab::KRK;
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
template <unsigned int N>
inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16])
{
enum { Q1 = (4-(N/32)) % 4,
Q2 = (3-(N/32)) % 4,
R = N % 32
};
vst1q_u8(RK, vreinterpretq_u8_u32(
veorq_u32(X, veorq_u32(
vshrq_n_u32(vextq_u32(Y, Y, Q1), R),
vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R)))));
}
void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen)
{
const uint32x4_t w0 = vld1q_u32(ws+ 0);
const uint32x4_t w1 = vld1q_u32(ws+ 8);
const uint32x4_t w2 = vld1q_u32(ws+12);
const uint32x4_t w3 = vld1q_u32(ws+16);
ARIA_GSRK_NEON<19>(w0, w1, rk + 0);
ARIA_GSRK_NEON<19>(w1, w2, rk + 16);
ARIA_GSRK_NEON<19>(w2, w3, rk + 32);
ARIA_GSRK_NEON<19>(w3, w0, rk + 48);
ARIA_GSRK_NEON<31>(w0, w1, rk + 64);
ARIA_GSRK_NEON<31>(w1, w2, rk + 80);
ARIA_GSRK_NEON<31>(w2, w3, rk + 96);
ARIA_GSRK_NEON<31>(w3, w0, rk + 112);
ARIA_GSRK_NEON<67>(w0, w1, rk + 128);
ARIA_GSRK_NEON<67>(w1, w2, rk + 144);
ARIA_GSRK_NEON<67>(w2, w3, rk + 160);
ARIA_GSRK_NEON<67>(w3, w0, rk + 176);
ARIA_GSRK_NEON<97>(w0, w1, rk + 192);
if (keylen > 16)
{
ARIA_GSRK_NEON<97>(w1, w2, rk + 208);
ARIA_GSRK_NEON<97>(w2, w3, rk + 224);
if (keylen > 24)
{
ARIA_GSRK_NEON< 97>(w3, w0, rk + 240);
ARIA_GSRK_NEON<109>(w0, w1, rk + 256);
}
}
}
void ARIA_ProcessAndXorBlock_NEON(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t)
{
outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
// 'outBlock' and 'xorBlock' may be unaligned.
if (xorBlock != NULLPTR)
{
vst1q_u8(outBlock,
veorq_u8(
vld1q_u8(xorBlock),
veorq_u8(
vld1q_u8(outBlock),
vrev32q_u8(vld1q_u8((rk))))));
}
else
{
vst1q_u8(outBlock,
veorq_u8(
vld1q_u8(outBlock),
vrev32q_u8(vld1q_u8(rk))));
}
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_SSSE3_AVAILABLE)
void ARIA_ProcessAndXorBlock_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t)
{
const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] );
outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] );
outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] );
outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] );
outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] );
outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] );
outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] );
outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] );
outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] );
outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] );
outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] );
outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] );
// 'outBlock' and 'xorBlock' may be unaligned.
if (xorBlock != NULLPTR)
{
_mm_storeu_si128(M128_CAST(outBlock),
_mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(xorBlock)),
_mm_xor_si128(
_mm_loadu_si128(CONST_M128_CAST(outBlock)),
_mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK)))
);
}
else
{
_mm_storeu_si128(M128_CAST(outBlock),
_mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(outBlock)),
_mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK)));
}
}
#endif // CRYPTOPP_SSSE3_AVAILABLE
NAMESPACE_END

View File

@ -57,7 +57,7 @@
LIB_SRCS = \
cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp \
algparam.cpp allocate.cpp arc4.cpp aria.cpp aria_simd.cpp ariatab.cpp \
algparam.cpp allocate.cpp arc4.cpp aria.cpp ariatab.cpp \
asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp \
blake2.cpp blake2b_simd.cpp blake2s_simd.cpp blowfish.cpp blumshub.cpp \
camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp \
@ -88,7 +88,7 @@ LIB_SRCS = \
LIB_OBJS = \
cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj \
algparam.obj allocate.obj arc4.obj aria.obj aria_simd.obj ariatab.obj \
algparam.obj allocate.obj arc4.obj aria.obj ariatab.obj \
asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj \
blake2.obj blake2b_simd.obj blake2s_simd.obj blowfish.obj blumshub.obj \
camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj \

View File

@ -174,7 +174,6 @@
<ClCompile Include="allocate.cpp" />
<ClCompile Include="arc4.cpp" />
<ClCompile Include="aria.cpp" />
<ClCompile Include="aria_simd.cpp" />
<ClCompile Include="ariatab.cpp" />
<ClCompile Include="asn.cpp" />
<ClCompile Include="authenc.cpp" />

View File

@ -35,9 +35,6 @@
<ClCompile Include="aria.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="aria_simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="ariatab.cpp">
<Filter>Source Files</Filter>
</ClCompile>