From 2876371cea69139379dea4ec54f3dbd6f6d8699e Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Thu, 16 Aug 2018 18:24:22 -0400 Subject: [PATCH] Add GNUmakefile-cross flags SIMON and SPECK --- GNUmakefile-cross | 46 ++++++++++++++++++++++++++++++---------------- blake2-simd.cpp | 4 ++++ 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/GNUmakefile-cross b/GNUmakefile-cross index 7692d443..b688f192 100755 --- a/GNUmakefile-cross +++ b/GNUmakefile-cross @@ -264,8 +264,10 @@ ifeq ($(IS_NEON),1) BLAKE2_FLAG += -mfpu=neon LEA_FLAG += -mfpu=neon SIMECK_FLAG += -mfpu=neon - SIMON_FLAG += -mfpu=neon - SPECK_FLAG += -mfpu=neon + SIMON64_FLAG += -mfpu=neon + SIMON128_FLAG += -mfpu=neon + SPECK64_FLAG += -mfpu=neon + SPECK128_FLAG += -mfpu=neon ifeq ($(IS_ANDROID),1) ifeq ($(findstring -mfloat-abi=softfp,$(CXXFLAGS)),) NEON_FLAG += -mfloat-abi=softfp @@ -274,8 +276,10 @@ ifeq ($(IS_NEON),1) BLAKE2_FLAG += -mfloat-abi=softfp LEA_FLAG += -mfloat-abi=softfp SIMECK_FLAG += -mfloat-abi=softfp - SIMON_FLAG += -mfloat-abi=softfp - SPECK_FLAG += -mfloat-abi=softfp + SIMON64_FLAG += -mfloat-abi=softfp + SIMON128_FLAG += -mfloat-abi=softfp + SPECK64_FLAG += -mfloat-abi=softfp + SPECK128_FLAG += -mfloat-abi=softfp endif endif endif @@ -290,8 +294,10 @@ ifneq ($(IS_ARMv8),0) LEA_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a SIMECK_FLAG = -march=armv8-a - SIMON_FLAG = -march=armv8-a - SPECK_FLAG = -march=armv8-a + SIMON64_FLAG = -march=armv8-a + SIMON128_FLAG = -march=armv8-a + SPECK64_FLAG = -march=armv8-a + SPECK128_FLAG = -march=armv8-a endif HAVE_CRC := $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -march=armv8-a+crc -dM -E adhoc.cpp 2>&1 | $(EGREP) -i -c __ARM_FEATURE_CRC32) ifeq ($(HAVE_CRC),1) @@ -318,13 +324,13 @@ ifneq ($(IS_i686)$(IS_x86_64),00) LEA_FLAG = -mssse3 SSSE3_FLAG = -mssse3 SIMECK_FLAG = -mssse3 - SIMON_FLAG = -mssse3 - SPECK_FLAG = -mssse3 + SIMON128_FLAG = -mssse3 + SPECK128_FLAG = -mssse3 endif HAVE_SSE4 = $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -msse4.1 -dM -E adhoc.cpp 2>&1 | $(EGREP) -i -c __SSE4_1__) ifeq ($(HAVE_SSE4),1) - SIMON_FLAG = -msse4.1 - SPECK_FLAG = -msse4.1 + SIMON64_FLAG = -msse4.1 + SPECK64_FLAG = -msse4.1 endif HAVE_SSE4 = $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -msse4.2 -dM -E adhoc.cpp 2>&1 | $(EGREP) -i -c __SSE4_2__) ifeq ($(HAVE_SSE4),1) @@ -604,13 +610,21 @@ shacal2-simd.o : shacal2-simd.cpp simeck-simd.o : simeck-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SIMECK_FLAG) -c) $< -# SSSE3 or NEON available -simon-simd.o : simon-simd.cpp - $(CXX) $(strip $(CXXFLAGS) $(SIMON_FLAG) -c) $< +# SSE4.1, NEON or POWER7 available +simon64-simd.o : simon64-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SIMON64_FLAG) -c) $< -# SSSE3 or NEON available -speck-simd.o : speck-simd.cpp - $(CXX) $(strip $(CXXFLAGS) $(SPECK_FLAG) -c) $< +# SSSE3, NEON or POWER8 available +simon128-simd.o : simon128-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SIMON128_FLAG) -c) $< + +# SSE4.1, NEON or POWER7 available +speck64-simd.o : speck64-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SPECK64_FLAG) -c) $< + +# SSSE3, NEON or POWER8 available +speck128-simd.o : speck128-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SPECK128_FLAG) -c) $< # AESNI available sm4-simd.o : sm4-simd.cpp diff --git a/blake2-simd.cpp b/blake2-simd.cpp index 1aea7bdc..6646b166 100644 --- a/blake2-simd.cpp +++ b/blake2-simd.cpp @@ -342,6 +342,7 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State& stat row2 = ff1 = LOADU( &state.h[4] ); row3 = LOADU( &BLAKE2S_IV[0] ); row4 = _mm_xor_si128( LOADU( &BLAKE2S_IV[4] ), LOADU( &state.t[0] ) ); + BLAKE2S_ROUND( 0 ); BLAKE2S_ROUND( 1 ); BLAKE2S_ROUND( 2 ); @@ -352,6 +353,7 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State& stat BLAKE2S_ROUND( 7 ); BLAKE2S_ROUND( 8 ); BLAKE2S_ROUND( 9 ); + STOREU( &state.h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) ); STOREU( &state.h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) ); } @@ -752,6 +754,7 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State& state row3h = LOADU( &BLAKE2B_IV[2] ); row4l = _mm_xor_si128( LOADU( &BLAKE2B_IV[4] ), LOADU( &state.t[0] ) ); row4h = _mm_xor_si128( LOADU( &BLAKE2B_IV[6] ), LOADU( &state.f[0] ) ); + BLAKE2B_ROUND( 0 ); BLAKE2B_ROUND( 1 ); BLAKE2B_ROUND( 2 ); @@ -764,6 +767,7 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State& state BLAKE2B_ROUND( 9 ); BLAKE2B_ROUND( 10 ); BLAKE2B_ROUND( 11 ); + row1l = _mm_xor_si128( row3l, row1l ); row1h = _mm_xor_si128( row3h, row1h ); STOREU( &state.h[0], _mm_xor_si128( LOADU( &state.h[0] ), row1l ) );