Use little-endian mask during BLAKE2 loads

This commit is contained in:
Jeffrey Walton 2020-06-28 02:34:52 -04:00
parent dc2b336ace
commit 25cdab6d32
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 44 additions and 46 deletions

View File

@ -763,13 +763,13 @@ inline uint64x2_p VecLoad64(const void* p)
return (uint64x2_p)VecLoad((const byte*)p);
}
inline uint64x2_p VecLoad64LE(const void* p)
inline uint64x2_p VecLoad64LE(const void* p, const uint8x16_p le_mask)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
const uint64x2_p v = VecLoad64(p);
return (uint64x2_p)VecPermute(v, v, m);
return (uint64x2_p)VecPermute(v, v, le_mask);
#else
CRYPTOPP_UNUSED(le_mask);
return (uint64x2_p)VecLoad64(p);
#endif
}
@ -779,12 +779,12 @@ inline void VecStore64(void* p, const uint64x2_p x)
VecStore((uint8x16_p)x, (byte*)p);
}
inline void VecStore64LE(void* p, const uint64x2_p x)
inline void VecStore64LE(void* p, const uint64x2_p x, const uint8x16_p le_mask)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
VecStore64(p, VecPermute(x, x, m));
VecStore64(p, VecPermute(x, x, le_mask));
#else
CRYPTOPP_UNUSED(le_mask);
VecStore64(p, x);
#endif
}
@ -1155,22 +1155,24 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
} while(0)
const uint64x2_p m0 = VecLoad64LE(input + 00);
const uint64x2_p m1 = VecLoad64LE(input + 16);
const uint64x2_p m2 = VecLoad64LE(input + 32);
const uint64x2_p m3 = VecLoad64LE(input + 48);
const uint64x2_p m4 = VecLoad64LE(input + 64);
const uint64x2_p m5 = VecLoad64LE(input + 80);
const uint64x2_p m6 = VecLoad64LE(input + 96);
const uint64x2_p m7 = VecLoad64LE(input + 112);
const uint8x16_p le_mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
const uint64x2_p m0 = VecLoad64LE(input + 00, le_mask);
const uint64x2_p m1 = VecLoad64LE(input + 16, le_mask);
const uint64x2_p m2 = VecLoad64LE(input + 32, le_mask);
const uint64x2_p m3 = VecLoad64LE(input + 48, le_mask);
const uint64x2_p m4 = VecLoad64LE(input + 64, le_mask);
const uint64x2_p m5 = VecLoad64LE(input + 80, le_mask);
const uint64x2_p m6 = VecLoad64LE(input + 96, le_mask);
const uint64x2_p m7 = VecLoad64LE(input + 112, le_mask);
uint64x2_p row1l, row1h, row2l, row2h;
uint64x2_p row3l, row3h, row4l, row4h;
const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0);
const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2);
const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4);
const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6);
const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0, le_mask);
const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2, le_mask);
const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4, le_mask);
const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6, le_mask);
row3l = VecLoad64(BLAKE2B_IV+0);
row3h = VecLoad64(BLAKE2B_IV+2);
@ -1190,10 +1192,10 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
BLAKE2B_ROUND(10);
BLAKE2B_ROUND(11);
VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)));
VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)));
VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)));
VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)));
VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)), le_mask);
VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)), le_mask);
VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)), le_mask);
VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)), le_mask);
}
#endif // CRYPTOPP_POWER8_AVAILABLE

View File

@ -706,13 +706,13 @@ inline uint32x4_p VecLoad32(const T* p)
}
template <class T>
inline uint32x4_p VecLoad32LE(const T* p)
inline uint32x4_p VecLoad32LE(const T* p, const uint8x16_p le_mask)
{
#if __BIG_ENDIAN__
const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
#if defined(CRYPTOPP_BIG_ENDIAN)
const uint32x4_p v = VecLoad(p);
return VecPermute(v, v, m);
return VecPermute(v, v, le_mask);
#else
CRYPTOPP_UNUSED(le_mask);
return VecLoad(p);
#endif
}
@ -724,12 +724,13 @@ inline void VecStore32(T* p, const uint32x4_p x)
}
template <class T>
inline void VecStore32LE(T* p, const uint32x4_p x)
inline void VecStore32LE(T* p, const uint32x4_p x, const uint8x16_p le_mask)
{
#if __BIG_ENDIAN__
const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
VecStore(VecPermute(x, x, m), p);
#if defined(CRYPTOPP_BIG_ENDIAN)
const uint32x4_p v = VecPermute(x, x, le_mask);
VecStore(v, p);
#else
CRYPTOPP_UNUSED(le_mask);
VecStore(x, p);
#endif
}
@ -991,17 +992,19 @@ void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
BLAKE2S_G2(row1,row2,row3,row4,buf4); \
BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4);
// Permute mask for VecLoad32LE/VecStore32LE on big-endian targets: reverses
// the four bytes inside each 32-bit word while keeping the words in place.
// BLAKE2s works on 32-bit words, so the 64-bit lane mask used by BLAKE2b
// ({7,6,5,4, 3,2,1,0, ...}) must not be used here; it would also exchange
// neighbouring 32-bit words.
const uint8x16_p le_mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
const uint32x4_p m0 = VecLoad32LE(input + 0, le_mask);
const uint32x4_p m4 = VecLoad32LE(input + 16, le_mask);
const uint32x4_p m8 = VecLoad32LE(input + 32, le_mask);
const uint32x4_p m12 = VecLoad32LE(input + 48, le_mask);
uint32x4_p row1, row2, row3, row4;
uint32x4_p buf1, buf2, buf3, buf4;
uint32x4_p ff0, ff1;
const uint32x4_p m0 = VecLoad32LE(input + 0);
const uint32x4_p m4 = VecLoad32LE(input + 16);
const uint32x4_p m8 = VecLoad32LE(input + 32);
const uint32x4_p m12 = VecLoad32LE(input + 48);
row1 = ff0 = VecLoad32LE(state.h()+0);
row2 = ff1 = VecLoad32LE(state.h()+4);
row1 = ff0 = VecLoad32LE(state.h()+0, le_mask);
row2 = ff1 = VecLoad32LE(state.h()+4, le_mask);
row3 = VecLoad32(BLAKE2S_IV+0);
row4 = VecXor(VecLoad32(BLAKE2S_IV+4), VecLoad32(state.t()+0));
@ -1016,8 +1019,8 @@ void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)));
VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)));
VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)), le_mask);
VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)), le_mask);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE

View File

@ -2754,11 +2754,4 @@ NAMESPACE_END
# pragma GCC diagnostic pop
#endif
#undef CONST_V8_CAST
#undef CONST_V32_CAST
#undef CONST_V64_CAST
#undef NCONST_V8_CAST
#undef NCONST_V32_CAST
#undef NCONST_V64_CAST
#endif // CRYPTOPP_PPC_CRYPTO_H