Add BLAKE2b Power8 implementation (GH #729)

This commit is contained in:
Jeffrey Walton 2018-10-30 06:20:31 -04:00
parent 81db4ea5e3
commit 659c0c113c
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
4 changed files with 491 additions and 22 deletions

View File

@ -432,6 +432,7 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00)
ifneq ($(HAVE_POWER8),0)
POWER8_FLAG = -mcpu=power8 -maltivec
AES_FLAG = $(POWER8_FLAG)
BLAKE2_FLAG = $(POWER8_FLAG)
CHACHA_FLAG = $(POWER8_FLAG)
GCM_FLAG = $(POWER8_FLAG)
SHA_FLAG = $(POWER8_FLAG)
@ -447,7 +448,6 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00)
ifneq ($(HAVE_POWER7),0)
POWER7_FLAG = -mcpu=power7 -maltivec
ARIA_FLAG = $(POWER7_FLAG)
BLAKE2_FLAG = $(POWER7_FLAG)
CHAM_FLAG = $(POWER7_FLAG)
LEA_FLAG = $(POWER7_FLAG)
SIMECK_FLAG = $(POWER7_FLAG)
@ -466,6 +466,7 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00)
ifneq ($(HAVE_POWER8),0)
POWER8_FLAG = -qarch=pwr8 -qaltivec
AES_FLAG = $(POWER8_FLAG)
BLAKE2_FLAG = $(POWER8_FLAG)
CHACHA_FLAG = $(POWER8_FLAG)
GCM_FLAG = $(POWER8_FLAG)
SHA_FLAG = $(POWER8_FLAG)
@ -481,7 +482,6 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00)
ifneq ($(HAVE_POWER7),0)
POWER7_FLAG = -qarch=pwr7 -qaltivec
ARIA_FLAG = $(POWER7_FLAG)
BLAKE2_FLAG = $(POWER7_FLAG)
CHAM_FLAG = $(POWER7_FLAG)
LEA_FLAG = $(POWER7_FLAG)
SIMECK_FLAG = $(POWER7_FLAG)
@ -1274,7 +1274,7 @@ test.o : test.cpp
endif
endif
validat%.o : validat%.cpp
validat1.o : validat1.cpp
$(CXX) $(strip $(CXXFLAGS) $(ALTIVEC_FLAG) -c) $<
%.dllonly.o : %.cpp

View File

@ -16,6 +16,7 @@
// Do so in both blake2.cpp and blake2-simd.cpp.
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
// #undef CRYPTOPP_POWER8_AVAILABLE
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
@ -40,12 +41,16 @@
# include <arm_acle.h>
#endif
#if defined(CRYPTOPP_POWER8_AVAILABLE)
# include "ppc-simd.h"
#endif
ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::word32;
using CryptoPP::word64;
#if (CRYPTOPP_SSE41_AVAILABLE || CRYPTOPP_ARM_NEON_AVAILABLE)
#if (CRYPTOPP_SSE41_AVAILABLE || CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_POWER8_AVAILABLE)
CRYPTOPP_ALIGN_DATA(16)
const word32 BLAKE2S_IV[8] = {
@ -1273,7 +1278,7 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state
#define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
do { \
row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
@ -1284,7 +1289,7 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state
row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \
} while(0)
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
do { \
row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
@ -1317,21 +1322,17 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state
do { \
uint64x2_t b0, b1; \
BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
} while(0)
CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf<uint64x2_t>()));
CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf<uint64x2_t>()));
CRYPTOPP_ASSERT(IsAlignedOn(&state.f[0],GetAlignmentOf<uint64x2_t>()));
const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00));
const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16));
const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32));
@ -1374,4 +1375,461 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_POWER8_AVAILABLE)
// Load 16 bytes from a possibly unaligned address into a 2x64-bit
// vector in native element order (no byte swap). IBM xlC and Clang
// use the vec_xl builtin; GCC uses the VSX vec_vsx_ld builtin.
// NOTE(review): the (uint8_t*) cast drops the pointer's const
// qualifier -- presumably to satisfy older builtin signatures;
// confirm a const-correct cast compiles on all supported compilers.
inline uint64x2_p VectorLoad64(const void* p)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
return (uint64x2_p)vec_xl(0, (uint8_t*)p);
#else
return (uint64x2_p)vec_vsx_ld(0, (uint8_t*)p);
#endif
}
// Load 16 bytes as two little-endian 64-bit words. On big-endian
// targets the perm mask byte-reverses each 8-byte half after the
// native load; on little-endian targets the native load already has
// the required byte order.
inline uint64x2_p VectorLoad64LE(const void* p)
{
#if __BIG_ENDIAN__
const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
const uint64x2_p v = VectorLoad64(p);
return vec_perm(v, v, m);
#else
return VectorLoad64(p);
#endif
}
// Store a 2x64-bit vector to a possibly unaligned address in native
// element order (no byte swap). Mirrors VectorLoad64: vec_xst for
// xlC/Clang, the VSX vec_vsx_st builtin for GCC.
inline void VectorStore64(void* p, const uint64x2_p x)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
vec_xst((uint8x16_p)x,0,(uint8_t*)p);
#else
vec_vsx_st((uint8x16_p)x,0,(uint8_t*)p);
#endif
}
// Store two 64-bit words in little-endian byte order. On big-endian
// targets each 8-byte half is byte-reversed before the native store
// (same perm mask as VectorLoad64LE); on little-endian targets the
// native store is already correct.
inline void VectorStore64LE(void* p, const uint64x2_p x)
{
#if __BIG_ENDIAN__
const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
VectorStore64(p, vec_perm(x, x, m));
#else
VectorStore64(p, x);
#endif
}
// Concatenate a:b and extract 16 bytes starting C bytes into a,
// i.e. the SSE _mm_alignr_epi8-style extraction, expressed for both
// endians with vec_sld. C is a byte count and must be in [1,15]:
// vec_sld requires a 0..15 shift, and the little-endian form uses
// 16-C, which would be invalid for C == 0.
template <unsigned int C>
inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
{
#if __BIG_ENDIAN__
return (uint64x2_p)vec_sld((uint8x16_p)a, (uint8x16_p)b, C);
#else
return (uint64x2_p)vec_sld((uint8x16_p)b, (uint8x16_p)a, 16-C);
#endif
}

// Extract 64-bit elements across the a:b boundary; c counts 64-bit
// elements (hence c*8 bytes). Arguments are parenthesized so that a
// compound count such as vec_ext(x, y, 1+1) expands as (1+1)*8 and
// not 1+1*8.
#define vec_ext(a,b,c) VectorShiftLeftOctet<(c)*8>((a), (b))
// BLAKE2b compression function for POWER8 using Altivec/VSX vector
// intrinsics. Consumes one 128-byte message block and updates the
// 64-bit hash state in place. The structure parallels the SSE4.1 and
// NEON versions earlier in this file: the 16-word working state is
// held in eight 2x64-bit registers, row{1..4}{l,h}.
void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2_State<word64, true>& state)
{
// Permute masks
// vec_perm selectors that pick the first (L) or second (H) 64-bit
// element of each of the two source vectors; e.g. LH_MASK builds
// the vector { a[0], b[1] }.
const uint8x16_p LL_MASK = { 0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23 };
const uint8x16_p LH_MASK = { 0,1,2,3,4,5,6,7, 24,25,26,27,28,29,30,31 };
const uint8x16_p HL_MASK = { 8,9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23 };
const uint8x16_p HH_MASK = { 8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31 };
// Message loads for the 12 rounds. Each BLAKE2B_LOAD_MSG_r_i builds
// the two message vectors fed to a G1/G2 step according to the
// BLAKE2b sigma permutation schedule (RFC 7693, Table 2); rounds 10
// and 11 repeat the schedules of rounds 0 and 1.
#define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
do { \
b0 = vec_perm(m0, m1, LL_MASK); \
b1 = vec_perm(m2, m3, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
do { \
b0 = vec_perm(m0, m1, HH_MASK); \
b1 = vec_perm(m2, m3, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
do { \
b0 = vec_perm(m4, m5, LL_MASK); \
b1 = vec_perm(m6, m7, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
do { \
b0 = vec_perm(m4, m5, HH_MASK); \
b1 = vec_perm(m6, m7, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
do { \
b0 = vec_perm(m7, m2, LL_MASK); \
b1 = vec_perm(m4, m6, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
do { \
b0 = vec_perm(m5, m4, LL_MASK); \
b1 = vec_ext(m7, m3, 1); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
do { \
b0 = vec_ext(m0, m0, 1); \
b1 = vec_perm(m5, m2, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
do { \
b0 = vec_perm(m6, m1, LL_MASK); \
b1 = vec_perm(m3, m1, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
do { \
b0 = vec_ext(m5, m6, 1); \
b1 = vec_perm(m2, m7, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
do { \
b0 = vec_perm(m4, m0, LL_MASK); \
b1 = vec_perm(m1, m6, LH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
do { \
b0 = vec_perm(m5, m1, LH_MASK); \
b1 = vec_perm(m3, m4, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
do { \
b0 = vec_perm(m7, m3, LL_MASK); \
b1 = vec_ext(m0, m2, 1); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
do { \
b0 = vec_perm(m3, m1, HH_MASK); \
b1 = vec_perm(m6, m5, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
do { \
b0 = vec_perm(m4, m0, HH_MASK); \
b1 = vec_perm(m6, m7, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
do { \
b0 = vec_perm(m1, m2, LH_MASK); \
b1 = vec_perm(m2, m7, LH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
do { \
b0 = vec_perm(m3, m5, LL_MASK); \
b1 = vec_perm(m0, m4, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
do { \
b0 = vec_perm(m4, m2, HH_MASK); \
b1 = vec_perm(m1, m5, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
do { \
b0 = vec_perm(m0, m3, LH_MASK); \
b1 = vec_perm(m2, m7, LH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
do { \
b0 = vec_perm(m7, m5, LH_MASK); \
b1 = vec_perm(m3, m1, LH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
do { \
b0 = vec_ext(m0, m6, 1); \
b1 = vec_perm(m4, m6, LH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
do { \
b0 = vec_perm(m1, m3, LL_MASK); \
b1 = vec_perm(m0, m4, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
do { \
b0 = vec_perm(m6, m5, LL_MASK); \
b1 = vec_perm(m5, m1, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
do { \
b0 = vec_perm(m2, m3, LH_MASK); \
b1 = vec_perm(m7, m0, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
do { \
b0 = vec_perm(m6, m2, HH_MASK); \
b1 = vec_perm(m7, m4, LH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
do { \
b0 = vec_perm(m6, m0, LH_MASK); \
b1 = vec_perm(m7, m2, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
do { \
b0 = vec_perm(m2, m7, HH_MASK); \
b1 = vec_ext(m6, m5, 1); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
do { \
b0 = vec_perm(m0, m3, LL_MASK); \
b1 = vec_ext(m4, m4, 1); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
do { \
b0 = vec_perm(m3, m1, HH_MASK); \
b1 = vec_perm(m1, m5, LH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
do { \
b0 = vec_perm(m6, m3, HH_MASK); \
b1 = vec_perm(m6, m1, LH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
do { \
b0 = vec_ext(m5, m7, 1); \
b1 = vec_perm(m0, m4, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
do { \
b0 = vec_perm(m2, m7, HH_MASK); \
b1 = vec_perm(m4, m1, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
do { \
b0 = vec_perm(m0, m2, LL_MASK); \
b1 = vec_perm(m3, m5, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
do { \
b0 = vec_perm(m3, m7, LL_MASK); \
b1 = vec_ext(m5, m0, 1); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
do { \
b0 = vec_perm(m7, m4, HH_MASK); \
b1 = vec_ext(m1, m4, 1); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
do { \
b0 = m6; \
b1 = vec_ext(m0, m5, 1); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
do { \
b0 = vec_perm(m1, m3, LH_MASK); \
b1 = m2; \
} while(0)
#define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
do { \
b0 = vec_perm(m5, m4, LL_MASK); \
b1 = vec_perm(m3, m0, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
do { \
b0 = vec_perm(m1, m2, LL_MASK); \
b1 = vec_perm(m3, m2, LH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
do { \
b0 = vec_perm(m7, m4, HH_MASK); \
b1 = vec_perm(m1, m6, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
do { \
b0 = vec_ext(m5, m7, 1); \
b1 = vec_perm(m6, m0, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
do { \
b0 = vec_perm(m0, m1, LL_MASK); \
b1 = vec_perm(m2, m3, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
do { \
b0 = vec_perm(m0, m1, HH_MASK); \
b1 = vec_perm(m2, m3, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
do { \
b0 = vec_perm(m4, m5, LL_MASK); \
b1 = vec_perm(m6, m7, LL_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
do { \
b0 = vec_perm(m4, m5, HH_MASK); \
b1 = vec_perm(m6, m7, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
do { \
b0 = vec_perm(m7, m2, LL_MASK); \
b1 = vec_perm(m4, m6, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
do { \
b0 = vec_perm(m5, m4, LL_MASK); \
b1 = vec_ext(m7, m3, 1); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
do { \
b0 = vec_ext(m0, m0, 1); \
b1 = vec_perm(m5, m2, HH_MASK); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
do { \
b0 = vec_perm(m6, m1, LL_MASK); \
b1 = vec_perm(m3, m1, HH_MASK); \
} while(0)
// Power8 has packed 64-bit rotate, but in terms of left rotate
// (vec_rl), so a right-rotate by r is a left-rotate by 64-r.
const uint64x2_p ROR16_MASK = { 64-16, 64-16 };
const uint64x2_p ROR24_MASK = { 64-24, 64-24 };
const uint64x2_p ROR32_MASK = { 64-32, 64-32 };
const uint64x2_p ROR63_MASK = { 64-63, 64-63 };
#define vec_ror_32(x) vec_rl(x, ROR32_MASK)
#define vec_ror_24(x) vec_rl(x, ROR24_MASK)
#define vec_ror_16(x) vec_rl(x, ROR16_MASK)
#define vec_ror_63(x) vec_rl(x, ROR63_MASK)
// Halves of the BLAKE2b G function, two columns at a time: G1 is
// the first add/xor/rotate sequence (rotates by 32 and 24), G2 the
// second (rotates by 16 and 63). See RFC 7693, Section 3.1.
#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
do { \
row1l = vec_add(vec_add(row1l, b0), row2l); \
row1h = vec_add(vec_add(row1h, b1), row2h); \
row4l = vec_xor(row4l, row1l); row4h = vec_xor(row4h, row1h); \
row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \
row3l = vec_add(row3l, row4l); row3h = vec_add(row3h, row4h); \
row2l = vec_xor(row2l, row3l); row2h = vec_xor(row2h, row3h); \
row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \
} while(0)
#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
do { \
row1l = vec_add(vec_add(row1l, b0), row2l); \
row1h = vec_add(vec_add(row1h, b1), row2h); \
row4l = vec_xor(row4l, row1l); row4h = vec_xor(row4h, row1h); \
row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \
row3l = vec_add(row3l, row4l); row3h = vec_add(row3h, row4h); \
row2l = vec_xor(row2l, row3l); row2h = vec_xor(row2h, row3h); \
row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \
} while(0)
// Rotate rows 2-4 across the register pairs so the next G step
// operates on diagonals of the 4x4 state; UNDIAGONALIZE reverses
// the rotation afterwards.
#define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
do { \
uint64x2_p t0 = vec_ext(row2l, row2h, 1); \
uint64x2_p t1 = vec_ext(row2h, row2l, 1); \
row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
t0 = vec_ext(row4h, row4l, 1); t1 = vec_ext(row4l, row4h, 1); \
row4l = t0; row4h = t1; \
} while(0)
#define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
do { \
uint64x2_p t0 = vec_ext(row2h, row2l, 1); \
uint64x2_p t1 = vec_ext(row2l, row2h, 1); \
row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
t0 = vec_ext(row4l, row4h, 1); t1 = vec_ext(row4h, row4l, 1); \
row4l = t0; row4h = t1; \
} while(0)
// One full BLAKE2b round: column step (G1+G2), diagonalize,
// diagonal step (G1+G2), undiagonalize.
#define BLAKE2B_ROUND(r) \
do { \
uint64x2_p b0, b1; \
BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
} while(0)
// Load the 128-byte message block as 16 little-endian 64-bit words.
const uint64x2_p m0 = VectorLoad64LE(input + 00);
const uint64x2_p m1 = VectorLoad64LE(input + 16);
const uint64x2_p m2 = VectorLoad64LE(input + 32);
const uint64x2_p m3 = VectorLoad64LE(input + 48);
const uint64x2_p m4 = VectorLoad64LE(input + 64);
const uint64x2_p m5 = VectorLoad64LE(input + 80);
const uint64x2_p m6 = VectorLoad64LE(input + 96);
const uint64x2_p m7 = VectorLoad64LE(input + 112);
uint64x2_p row1l, row1h, row2l, row2h;
uint64x2_p row3l, row3h, row4l, row4h;
// Initialize the working rows: rows 1-2 from the chaining value,
// rows 3-4 from the IV xored with the block counter (t) and the
// finalization flags (f). NOTE(review): the state words use the
// byte-swapping LE loads while BLAKE2B_IV, t and f use native
// loads -- presumably all are kept in a compatible layout; verify
// against the scalar implementation on big-endian targets.
const uint64x2_p h0 = row1l = VectorLoad64LE(&state.h[0]);
const uint64x2_p h1 = row1h = VectorLoad64LE(&state.h[2]);
const uint64x2_p h2 = row2l = VectorLoad64LE(&state.h[4]);
const uint64x2_p h3 = row2h = VectorLoad64LE(&state.h[6]);
row3l = VectorLoad64(&BLAKE2B_IV[0]);
row3h = VectorLoad64(&BLAKE2B_IV[2]);
row4l = vec_xor(VectorLoad64(&BLAKE2B_IV[4]), VectorLoad64(&state.t[0]));
row4h = vec_xor(VectorLoad64(&BLAKE2B_IV[6]), VectorLoad64(&state.f[0]));
BLAKE2B_ROUND(0);
BLAKE2B_ROUND(1);
BLAKE2B_ROUND(2);
BLAKE2B_ROUND(3);
BLAKE2B_ROUND(4);
BLAKE2B_ROUND(5);
BLAKE2B_ROUND(6);
BLAKE2B_ROUND(7);
BLAKE2B_ROUND(8);
BLAKE2B_ROUND(9);
BLAKE2B_ROUND(10);
BLAKE2B_ROUND(11);
// Feed-forward: xor the two halves of the working state into the
// saved chaining value and write it back.
VectorStore64LE(&state.h[0], vec_xor(h0, vec_xor(row1l, row3l)));
VectorStore64LE(&state.h[2], vec_xor(h1, vec_xor(row1h, row3h)));
VectorStore64LE(&state.h[4], vec_xor(h2, vec_xor(row2l, row4l)));
VectorStore64LE(&state.h[6], vec_xor(h3, vec_xor(row2h, row4h)));
}
#endif // POWER8
NAMESPACE_END

View File

@ -14,6 +14,7 @@
// Do so in both blake2.cpp and blake2-simd.cpp.
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
// #undef CRYPTOPP_POWER8_AVAILABLE
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
@ -148,6 +149,10 @@ extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false
extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state);
#endif
#if CRYPTOPP_POWER8_AVAILABLE
extern void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2_State<word64, true>& state);
#endif
BLAKE2_ParameterBlock<false>::BLAKE2_ParameterBlock(size_t digestLen, size_t keyLen,
const byte* saltStr, size_t saltLen,
const byte* personalizationStr, size_t personalizationLen)
@ -340,7 +345,8 @@ void BLAKE2_Base<word64, true>::UncheckedSetKey(const byte *key, unsigned int le
}
}
std::string BLAKE2_Base_AlgorithmProvider()
template <class W, bool T_64bit>
std::string BLAKE2_Base<W, T_64bit>::AlgorithmProvider() const
{
#if defined(CRYPTOPP_SSE41_AVAILABLE)
if (HasSSE41())
@ -349,16 +355,14 @@ std::string BLAKE2_Base_AlgorithmProvider()
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return "NEON";
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8() && T_64bit == true)
return "Power8";
#endif
return "C++";
}
template <class W, bool T_64bit>
std::string BLAKE2_Base<W, T_64bit>::AlgorithmProvider() const
{
return BLAKE2_Base_AlgorithmProvider();
}
template <class W, bool T_64bit>
BLAKE2_Base<W, T_64bit>::BLAKE2_Base() : m_state(1), m_block(1), m_digestSize(DIGESTSIZE), m_treeMode(false)
{
@ -513,6 +517,12 @@ void BLAKE2_Base<word64, true>::Compress(const byte *input)
{
return BLAKE2_Compress64_NEON(input, *m_state.data());
}
#endif
#if CRYPTOPP_POWER8_AVAILABLE
if(HasPower8())
{
return BLAKE2_Compress64_POWER8(input, *m_state.data());
}
#endif
return BLAKE2_Compress64_CXX(input, *m_state.data());
}

View File

@ -12,7 +12,8 @@
/// \details The library provides specialized SSE2, SSE4 and NEON version of the BLAKE2 compression
/// function. For best results under ARM NEON, specify both an architecture and cpu. For example:
/// <pre>CXXFLAGS="-DNDEBUG -march=armv8-a+crc -mcpu=cortex-a53 ..."</pre>
/// \since Crypto++ 5.6.4
/// \since C++ since Crypto++ 5.6.4, SSE since Crypto++ 5.6.4, NEON since Crypto++ 6.0,
/// BLAKE2b Power8 since Crypto++ 8.0
#ifndef CRYPTOPP_BLAKE2_H
#define CRYPTOPP_BLAKE2_H