Add XOP aware CHAM and LEA

2024-11-26 19:30:21 +00:00 · 2018-10-24 17:12:03 -04:00 · 2018-10-24 17:12:03 -04:00 · babdf8b38b
commit babdf8b38b
parent 210995b867
3 changed files with 54 additions and 2 deletions
--- a/chacha-simd.cpp
+++ b/chacha-simd.cpp
@@ -63,7 +63,7 @@ template <>
 inline __m128i RotateLeft<8>(const __m128i val)
 {
 #ifdef __XOP__
-	return _mm_roti_epi32(val, R);
+	return _mm_roti_epi32(val, 8);
 #else
 	const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
 	return _mm_shuffle_epi8(val, mask);
@@ -74,7 +74,7 @@ template <>
 inline __m128i RotateLeft<16>(const __m128i val)
 {
 #ifdef __XOP__
-	return _mm_roti_epi32(val, R);
+	return _mm_roti_epi32(val, 16);
 #else
 	const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
 	return _mm_shuffle_epi8(val, mask);
--- a/cham-simd.cpp
+++ b/cham-simd.cpp
@@ -22,6 +22,10 @@
 # include <tmmintrin.h>
 #endif

+#if defined(__XOP__)
+# include <ammintrin.h>
+#endif
+
 #if defined(__AVX512F__) && defined(__AVX512VL__)
 # define CRYPTOPP_AVX512_ROTATE 1
 # include <immintrin.h>
@@ -44,31 +48,47 @@ NAMESPACE_BEGIN(W16)  // CHAM64, 16-bit word size
 template <unsigned int R>
 inline __m128i RotateLeft16(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi16(val, R);
+#else
    return _mm_or_si128(
        _mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
+#endif
 }

 template <unsigned int R>
 inline __m128i RotateRight16(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi16(val, 16-R);
+#else
    return _mm_or_si128(
        _mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
+#endif
 }

 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateLeft16<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi16(val, 8);
+#else
    const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
    return _mm_shuffle_epi8(val, mask);
+#endif
 }

 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateRight16<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi16(val, 16-8);
+#else
    const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
    return _mm_shuffle_epi8(val, mask);
+#endif
 }

 template <unsigned int IDX>
@@ -626,6 +646,8 @@ inline __m128i RotateLeft32(const __m128i& val)
 {
 #if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_rol_epi32(val, R);
+#elif defined(__XOP__)
+    return _mm_roti_epi32(val, R);
 #else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
@@ -637,6 +659,8 @@ inline __m128i RotateRight32(const __m128i& val)
 {
 #if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_ror_epi32(val, R);
+#elif defined(__XOP__)
+    return _mm_roti_epi32(val, 32-R);
 #else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
@@ -647,16 +671,24 @@ inline __m128i RotateRight32(const __m128i& val)
 template <>
 inline __m128i RotateLeft32<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 8);
+#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
+#endif
 }

 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateRight32<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 32-8);
+#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
+#endif
 }

 template <unsigned int IDX>
--- a/lea-simd.cpp
+++ b/lea-simd.cpp
@@ -22,6 +22,10 @@
 # include <tmmintrin.h>
 #endif

+#if defined(__XOP__)
+# include <ammintrin.h>
+#endif
+
 #if defined(__AVX512F__) && defined(__AVX512VL__)
 # define CRYPTOPP_AVX512_ROTATE 1
 # include <immintrin.h>
@@ -279,31 +283,47 @@ inline __m128i Sub(const __m128i& a, const __m128i& b)
 template <unsigned int R>
 inline __m128i RotateLeft(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, R);
+#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+#endif
 }

 template <unsigned int R>
 inline __m128i RotateRight(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 32-R);
+#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
+#endif
 }

 // Faster than two Shifts and an Or.
 template <>
 inline __m128i RotateLeft<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 8);
+#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
+#endif
 }

 // Faster than two Shifts and an Or.
 template <>
 inline __m128i RotateRight<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 32-8);
+#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
+#endif
 }

 template <unsigned int IDX>