mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2025-02-11 15:55:19 +00:00
Use little-endian mask during BLAKE2 loads
This commit is contained in:
parent
dc2b336ace
commit
25cdab6d32
@ -763,13 +763,13 @@ inline uint64x2_p VecLoad64(const void* p)
|
||||
return (uint64x2_p)VecLoad((const byte*)p);
|
||||
}
|
||||
|
||||
inline uint64x2_p VecLoad64LE(const void* p)
|
||||
inline uint64x2_p VecLoad64LE(const void* p, const uint8x16_p le_mask)
|
||||
{
|
||||
#if defined(CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
|
||||
const uint64x2_p v = VecLoad64(p);
|
||||
return (uint64x2_p)VecPermute(v, v, m);
|
||||
return (uint64x2_p)VecPermute(v, v, le_mask);
|
||||
#else
|
||||
CRYPTOPP_UNUSED(le_mask);
|
||||
return (uint64x2_p)VecLoad64(p);
|
||||
#endif
|
||||
}
|
||||
@ -779,12 +779,12 @@ inline void VecStore64(void* p, const uint64x2_p x)
|
||||
VecStore((uint8x16_p)x, (byte*)p);
|
||||
}
|
||||
|
||||
inline void VecStore64LE(void* p, const uint64x2_p x)
|
||||
inline void VecStore64LE(void* p, const uint64x2_p x, const uint8x16_p le_mask)
|
||||
{
|
||||
#if defined(CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
|
||||
VecStore64(p, VecPermute(x, x, m));
|
||||
VecStore64(p, VecPermute(x, x, le_mask));
|
||||
#else
|
||||
CRYPTOPP_UNUSED(le_mask);
|
||||
VecStore64(p, x);
|
||||
#endif
|
||||
}
|
||||
@ -1155,22 +1155,24 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
|
||||
BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
} while(0)
|
||||
|
||||
const uint64x2_p m0 = VecLoad64LE(input + 00);
|
||||
const uint64x2_p m1 = VecLoad64LE(input + 16);
|
||||
const uint64x2_p m2 = VecLoad64LE(input + 32);
|
||||
const uint64x2_p m3 = VecLoad64LE(input + 48);
|
||||
const uint64x2_p m4 = VecLoad64LE(input + 64);
|
||||
const uint64x2_p m5 = VecLoad64LE(input + 80);
|
||||
const uint64x2_p m6 = VecLoad64LE(input + 96);
|
||||
const uint64x2_p m7 = VecLoad64LE(input + 112);
|
||||
const uint8x16_p le_mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
|
||||
|
||||
const uint64x2_p m0 = VecLoad64LE(input + 00, le_mask);
|
||||
const uint64x2_p m1 = VecLoad64LE(input + 16, le_mask);
|
||||
const uint64x2_p m2 = VecLoad64LE(input + 32, le_mask);
|
||||
const uint64x2_p m3 = VecLoad64LE(input + 48, le_mask);
|
||||
const uint64x2_p m4 = VecLoad64LE(input + 64, le_mask);
|
||||
const uint64x2_p m5 = VecLoad64LE(input + 80, le_mask);
|
||||
const uint64x2_p m6 = VecLoad64LE(input + 96, le_mask);
|
||||
const uint64x2_p m7 = VecLoad64LE(input + 112, le_mask);
|
||||
|
||||
uint64x2_p row1l, row1h, row2l, row2h;
|
||||
uint64x2_p row3l, row3h, row4l, row4h;
|
||||
|
||||
const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0);
|
||||
const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2);
|
||||
const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4);
|
||||
const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6);
|
||||
const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0, le_mask);
|
||||
const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2, le_mask);
|
||||
const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4, le_mask);
|
||||
const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6, le_mask);
|
||||
|
||||
row3l = VecLoad64(BLAKE2B_IV+0);
|
||||
row3h = VecLoad64(BLAKE2B_IV+2);
|
||||
@ -1190,10 +1192,10 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
|
||||
BLAKE2B_ROUND(10);
|
||||
BLAKE2B_ROUND(11);
|
||||
|
||||
VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)));
|
||||
VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)));
|
||||
VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)));
|
||||
VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)));
|
||||
VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)), le_mask);
|
||||
VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)), le_mask);
|
||||
VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)), le_mask);
|
||||
VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)), le_mask);
|
||||
}
|
||||
#endif // CRYPTOPP_POWER8_AVAILABLE
|
||||
|
||||
|
@ -706,13 +706,13 @@ inline uint32x4_p VecLoad32(const T* p)
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline uint32x4_p VecLoad32LE(const T* p)
|
||||
inline uint32x4_p VecLoad32LE(const T* p, const uint8x16_p le_mask)
|
||||
{
|
||||
#if __BIG_ENDIAN__
|
||||
const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
|
||||
#if defined(CRYPTOPP_BIG_ENDIAN)
|
||||
const uint32x4_p v = VecLoad(p);
|
||||
return VecPermute(v, v, m);
|
||||
return VecPermute(v, v, le_mask);
|
||||
#else
|
||||
CRYPTOPP_UNUSED(le_mask);
|
||||
return VecLoad(p);
|
||||
#endif
|
||||
}
|
||||
@ -724,12 +724,13 @@ inline void VecStore32(T* p, const uint32x4_p x)
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline void VecStore32LE(T* p, const uint32x4_p x)
|
||||
inline void VecStore32LE(T* p, const uint32x4_p x, const uint8x16_p le_mask)
|
||||
{
|
||||
#if __BIG_ENDIAN__
|
||||
const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
|
||||
VecStore(VecPermute(x, x, m), p);
|
||||
#if defined(CRYPTOPP_BIG_ENDIAN)
|
||||
const uint32x4_p v = VecPermute(x, x, le_mask);
|
||||
VecStore(v, p);
|
||||
#else
|
||||
CRYPTOPP_UNUSED(le_mask);
|
||||
VecStore(x, p);
|
||||
#endif
|
||||
}
|
||||
@ -991,17 +992,19 @@ void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
|
||||
BLAKE2S_G2(row1,row2,row3,row4,buf4); \
|
||||
BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4);
|
||||
|
||||
// Permute mask that reverses the bytes inside each 32-bit lane. BLAKE2s
// operates on 32-bit words, so the 64-bit lane mask used by the BLAKE2b
// path ({7,6,5,4, 3,2,1,0, ...}) must not be used here: it would also swap
// neighbouring words. The mask is only consulted by VecLoad32LE and
// VecStore32LE when CRYPTOPP_BIG_ENDIAN is defined.
const uint8x16_p le_mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
|
||||
|
||||
const uint32x4_p m0 = VecLoad32LE(input + 0, le_mask);
|
||||
const uint32x4_p m4 = VecLoad32LE(input + 16, le_mask);
|
||||
const uint32x4_p m8 = VecLoad32LE(input + 32, le_mask);
|
||||
const uint32x4_p m12 = VecLoad32LE(input + 48, le_mask);
|
||||
|
||||
uint32x4_p row1, row2, row3, row4;
|
||||
uint32x4_p buf1, buf2, buf3, buf4;
|
||||
uint32x4_p ff0, ff1;
|
||||
|
||||
const uint32x4_p m0 = VecLoad32LE(input + 0);
|
||||
const uint32x4_p m4 = VecLoad32LE(input + 16);
|
||||
const uint32x4_p m8 = VecLoad32LE(input + 32);
|
||||
const uint32x4_p m12 = VecLoad32LE(input + 48);
|
||||
|
||||
row1 = ff0 = VecLoad32LE(state.h()+0);
|
||||
row2 = ff1 = VecLoad32LE(state.h()+4);
|
||||
row1 = ff0 = VecLoad32LE(state.h()+0, le_mask);
|
||||
row2 = ff1 = VecLoad32LE(state.h()+4, le_mask);
|
||||
row3 = VecLoad32(BLAKE2S_IV+0);
|
||||
row4 = VecXor(VecLoad32(BLAKE2S_IV+4), VecLoad32(state.t()+0));
|
||||
|
||||
@ -1016,8 +1019,8 @@ void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
|
||||
BLAKE2S_ROUND(8);
|
||||
BLAKE2S_ROUND(9);
|
||||
|
||||
VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)));
|
||||
VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)));
|
||||
VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)), le_mask);
|
||||
VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)), le_mask);
|
||||
}
|
||||
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
|
||||
|
@ -2754,11 +2754,4 @@ NAMESPACE_END
|
||||
# pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
#undef CONST_V8_CAST
|
||||
#undef CONST_V32_CAST
|
||||
#undef CONST_V64_CAST
|
||||
#undef NCONST_V8_CAST
|
||||
#undef NCONST_V32_CAST
|
||||
#undef NCONST_V64_CAST
|
||||
|
||||
#endif // CRYPTOPP_PPC_CRYPTO_H
|
||||
|
Loading…
x
Reference in New Issue
Block a user