mirror of https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-26 19:30:21 +00:00
Speedup BLAKE2s message loading on PowerPC
This commit is contained in:
parent a3aefbb1dc
commit 827f2ebcad
@@ -1158,6 +1158,7 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
    /* Possibly unaligned user messages */
    uint64x2_p m0, m1, m2, m3, m4, m5, m6, m7;
    /* Endian conversion mask */
    const uint8x16_p le_mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};

#if defined(_ARCH_PWR9)
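The uint64x2_p declarations and the le_mask above feed the vec_perm calls later in the function: on a big-endian target the mask byte-reverses each 8-byte lane so the message is read as the little-endian 64-bit words BLAKE2b expects. The standalone sketch below, which is not part of the patch, only illustrates the effect of that mask. It assumes a compiler and target where vec_xl accepts byte pointers (the same assumption the _ARCH_PWR9 branch makes) and tests endianness with the compiler's __BYTE_ORDER__ macro instead of Crypto++'s CRYPTOPP_BIG_ENDIAN.

    #include <altivec.h>
    #include <cstdio>
    #include <cstring>

    /* Local stand-in for the typedef Crypto++ defines in ppc_simd.h */
    typedef __vector unsigned char uint8x16_p;

    int main()
    {
        /* 16 message bytes 0..15 */
        unsigned char msg[16] = {0,1,2,3,4,5,6,7, 8,9,10,11,12,13,14,15};

        /* Same mask as the patch: byte-reverse each 8-byte lane */
        const uint8x16_p le_mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};

        /* vec_xl keeps element order equal to memory order on either endian */
        uint8x16_p v = vec_xl(0, msg);
    #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        /* Big-endian: swap each 8-byte lane into little-endian word order */
        v = vec_perm(v, v, le_mask);
    #endif

        unsigned long long w[2];
        std::memcpy(w, &v, 16);
        /* Prints 0706050403020100 0f0e0d0c0b0a0908 on both BE and LE */
        std::printf("%016llx %016llx\n", w[0], w[1]);
        return 0;
    }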
@@ -1183,7 +1184,7 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
# endif
#else
    /* Altivec only provides 16-byte aligned loads */
-   /* http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf */
+   /* http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf, Section 3.16 */
    m0 = (uint64x2_p) vec_ld( 0, CONST_V8_CAST( input ));
    m1 = (uint64x2_p) vec_ld( 16, CONST_V8_CAST( input ));
    m2 = (uint64x2_p) vec_ld( 32, CONST_V8_CAST( input ));
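The "Altivec only provides 16-byte aligned loads" comment is the constraint the BLAKE2s hunk below has to work around: vec_ld maps to lvx, which clears the low four bits of the effective address, so a misaligned pointer silently yields the enclosing aligned block rather than the requested bytes. A minimal sketch of that behaviour, not taken from the patch:

    #include <altivec.h>
    #include <cstdio>
    #include <cstring>

    typedef __vector unsigned char uint8x16_p;

    int main()
    {
        __attribute__((aligned(16))) unsigned char buf[32];
        for (int i = 0; i < 32; ++i)
            buf[i] = (unsigned char)i;

        /* lvx drops the low four address bits, so the "unaligned" load
           from buf+3 returns the same 16-byte block as the load from buf */
        uint8x16_p a = vec_ld(0, buf);
        uint8x16_p b = vec_ld(0, buf + 3);

        std::printf("same block: %s\n",
                    std::memcmp(&a, &b, 16) == 0 ? "yes" : "no"); /* "yes" */
        return 0;
    }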
@@ -992,12 +992,64 @@ void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
        BLAKE2S_G2(row1,row2,row3,row4,buf4); \
        BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4);

+   /* Possibly unaligned user messages */
+   uint32x4_p m0, m4, m8, m12;
+   /* Endian conversion mask */
    const uint8x16_p le_mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};

-   const uint32x4_p m0 = VecLoad32LE(input + 0, le_mask);
-   const uint32x4_p m4 = VecLoad32LE(input + 16, le_mask);
-   const uint32x4_p m8 = VecLoad32LE(input + 32, le_mask);
-   const uint32x4_p m12 = VecLoad32LE(input + 48, le_mask);
+#if defined(_ARCH_PWR9)
+   /* POWER9 provides loads for char's and short's */
+   m0 = (uint32x4_p) vec_xl( 0, CONST_V8_CAST( input ));
+   m4 = (uint32x4_p) vec_xl( 16, CONST_V8_CAST( input ));
+   m8 = (uint32x4_p) vec_xl( 32, CONST_V8_CAST( input ));
+   m12 = (uint32x4_p) vec_xl( 48, CONST_V8_CAST( input ));

+# if defined(CRYPTOPP_BIG_ENDIAN)
+   m0 = vec_perm(m0, m0, le_mask);
+   m4 = vec_perm(m4, m4, le_mask);
+   m8 = vec_perm(m8, m8, le_mask);
+   m12 = vec_perm(m12, m12, le_mask);
+# endif
+#else
+   /* Altivec only provides 16-byte aligned loads */
+   /* http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf, Section 3.16 */
+   m0 = (uint32x4_p) vec_ld( 0, CONST_V8_CAST( input ));
+   m4 = (uint32x4_p) vec_ld( 16, CONST_V8_CAST( input ));
+   m8 = (uint32x4_p) vec_ld( 32, CONST_V8_CAST( input ));
+   m12 = (uint32x4_p) vec_ld( 48, CONST_V8_CAST( input ));

+   /* Alignment check for load of the message buffer */
+   const uintptr_t addr = (uintptr_t)input;
+   if (addr%16 == 0)
+   {
+       /* Already aligned. Perform a little-endian swap as required */
+# if defined(CRYPTOPP_BIG_ENDIAN)
+       m0 = vec_perm(m0, m0, le_mask);
+       m4 = vec_perm(m4, m4, le_mask);
+       m8 = vec_perm(m8, m8, le_mask);
+       m12 = vec_perm(m12, m12, le_mask);
+# endif
+   }
+   else
+   {
+       /* Not aligned. Fix vectors and perform a little-endian swap as required */
+       // http://mirror.informatimago.com/next/developer.apple.com/
+       // hardwaredrivers/ve/code_optimization.html
+       uint32x4_p ex; uint8x16_p perm;
+       ex = (uint32x4_p) vec_ld(48+15, CONST_V8_CAST( input ));
+       perm = vec_lvsl(0, CONST_V8_CAST( addr ));

+# if defined(CRYPTOPP_BIG_ENDIAN)
+       /* Combine the vector permute with the little-endian swap */
+       perm = vec_perm(perm, perm, le_mask);
+# endif

+       m0 = vec_perm(m0, m4, perm);
+       m4 = vec_perm(m4, m8, perm);
+       m8 = vec_perm(m8, m12, perm);
+       m12 = vec_perm(m12, ex, perm);
+   }
+#endif

    uint32x4_p row1, row2, row3, row4;
    uint32x4_p buf1, buf2, buf3, buf4;
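Taken together, the new BLAKE2s path is the classic Altivec unaligned-load idiom from the Apple code-optimization note cited in the diff: perform the aligned vec_ld loads unconditionally, and only when the message pointer is misaligned fetch one extra block and realign everything with a vec_lvsl-derived vec_perm, folding the big-endian byte swap into that same permute. The condensed sketch below is not the library code; it omits the CRYPTOPP_BIG_ENDIAN swap for brevity, uses a hypothetical load_message helper, and assumes a big-endian Altivec target, which is the configuration the #else branch is written for.

    #include <altivec.h>
    #include <cstdio>
    #include <cstring>
    #include <cstdint>

    /* Local stand-ins for the typedefs Crypto++ defines in ppc_simd.h */
    typedef __vector unsigned char uint8x16_p;
    typedef __vector unsigned int  uint32x4_p;

    /* Hypothetical helper: load 64 possibly unaligned message bytes into four
       vectors using only 16-byte aligned vec_ld, the way the patched BLAKE2s
       path does (big-endian swap omitted). */
    static void load_message(const unsigned char* input,
                             uint32x4_p& m0, uint32x4_p& m4,
                             uint32x4_p& m8, uint32x4_p& m12)
    {
        m0  = (uint32x4_p) vec_ld( 0, input);
        m4  = (uint32x4_p) vec_ld(16, input);
        m8  = (uint32x4_p) vec_ld(32, input);
        m12 = (uint32x4_p) vec_ld(48, input);

        const uintptr_t addr = (uintptr_t)input;
        if (addr % 16 != 0)
        {
            /* Not aligned: fetch the block holding the last message byte and
               shift everything into place with a vec_lvsl-derived permute */
            uint32x4_p ex = (uint32x4_p) vec_ld(48+15, input);
            uint8x16_p perm = vec_lvsl(0, input);
            m0  = vec_perm(m0,  m4,  perm);
            m4  = vec_perm(m4,  m8,  perm);
            m8  = vec_perm(m8,  m12, perm);
            m12 = vec_perm(m12, ex,  perm);
        }
    }

    int main()
    {
        /* 80 bytes so the extra aligned block read past byte 63 stays in bounds */
        __attribute__((aligned(16))) unsigned char buf[80];
        for (int i = 0; i < 80; ++i)
            buf[i] = (unsigned char)i;

        uint32x4_p m0, m4, m8, m12;
        load_message(buf + 3, m0, m4, m8, m12);   /* deliberately misaligned */

        unsigned char out[64];
        std::memcpy(out +  0, &m0,  16);
        std::memcpy(out + 16, &m4,  16);
        std::memcpy(out + 32, &m8,  16);
        std::memcpy(out + 48, &m12, 16);
        std::printf("matches memcpy: %s\n",
                    std::memcmp(out, buf + 3, 64) == 0 ? "yes" : "no");
        return 0;
    }

When the message is already 16-byte aligned, the extra load and the four fix-up permutes are skipped, so aligned callers pay only the plain vec_ld loads.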