mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2024-11-26 19:30:21 +00:00
Add XOP aware CHAM and LEA
This commit is contained in:
parent
210995b867
commit
babdf8b38b
@ -63,7 +63,7 @@ template <>
|
||||
inline __m128i RotateLeft<8>(const __m128i val)
|
||||
{
|
||||
#ifdef __XOP__
|
||||
return _mm_roti_epi32(val, R);
|
||||
return _mm_roti_epi32(val, 8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
@ -74,7 +74,7 @@ template <>
|
||||
inline __m128i RotateLeft<16>(const __m128i val)
|
||||
{
|
||||
#ifdef __XOP__
|
||||
return _mm_roti_epi32(val, R);
|
||||
return _mm_roti_epi32(val, 16);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
|
@ -22,6 +22,10 @@
|
||||
# include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__XOP__)
|
||||
# include <ammintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__)
|
||||
# define CRYPTOPP_AVX512_ROTATE 1
|
||||
# include <immintrin.h>
|
||||
@ -44,31 +48,47 @@ NAMESPACE_BEGIN(W16) // CHAM64, 16-bit word size
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateLeft16(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateRight16(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, 16-R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline __m128i RotateLeft16<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, 8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline __m128i RotateRight16<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi16(val, 16-8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
@ -626,6 +646,8 @@ inline __m128i RotateLeft32(const __m128i& val)
|
||||
{
|
||||
#if defined(CRYPTOPP_AVX512_ROTATE)
|
||||
return _mm_rol_epi32(val, R);
|
||||
#elif defined(__XOP__)
|
||||
return _mm_roti_epi32(val, R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
|
||||
@ -637,6 +659,8 @@ inline __m128i RotateRight32(const __m128i& val)
|
||||
{
|
||||
#if defined(CRYPTOPP_AVX512_ROTATE)
|
||||
return _mm_ror_epi32(val, R);
|
||||
#elif defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
|
||||
@ -647,16 +671,24 @@ inline __m128i RotateRight32(const __m128i& val)
|
||||
template <>
|
||||
inline __m128i RotateLeft32<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
|
||||
template <>
|
||||
inline __m128i RotateRight32<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
|
20
lea-simd.cpp
20
lea-simd.cpp
@ -22,6 +22,10 @@
|
||||
# include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__XOP__)
|
||||
# include <ammintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__)
|
||||
# define CRYPTOPP_AVX512_ROTATE 1
|
||||
# include <immintrin.h>
|
||||
@ -279,31 +283,47 @@ inline __m128i Sub(const __m128i& a, const __m128i& b)
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateLeft(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline __m128i RotateRight(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-R);
|
||||
#else
|
||||
return _mm_or_si128(
|
||||
_mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or.
|
||||
template <>
|
||||
inline __m128i RotateLeft<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Faster than two Shifts and an Or.
|
||||
template <>
|
||||
inline __m128i RotateRight<8>(const __m128i& val)
|
||||
{
|
||||
#if defined(__XOP__)
|
||||
return _mm_roti_epi32(val, 32-8);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
|
Loading…
Reference in New Issue
Block a user