Fix Altivec VectorStore on little-endian

Remove unneeded VectorShiftLeft(a,b) and VectorShiftRight(a,b)
2024-11-23 18:09:48 +00:00 · 2018-08-08 19:59:14 -04:00 · 2018-08-08 19:59:14 -04:00 · 00e7d02a8a
commit 00e7d02a8a
parent 96405e14ec
1 changed files with 128 additions and 109 deletions
--- a/ppc-simd.h
+++ b/ppc-simd.h
@ -71,7 +71,7 @@ template <class T>
 inline T Reverse(const T& src)
 {
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
-    return vec_perm(src, src, mask);
+    return (T)vec_perm(src, src, mask);
 }
 /// \brief Permutes two vectors
@ -147,40 +147,6 @@ inline T1 VectorAdd(const T1& vec1, const T2& vec2)
    return (T1)vec_add(vec1, (T1)vec2);
 }
 /// \brief Shift two vectors left
 /// \tparam C shift byte count
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \details VectorShiftLeft() concatenates vec1 and vec2 and returns a
 ///   new vector after shifting the concatenation by the specified number
 ///   of bytes. Both vec1 and vec2 are cast to uint8x16_p. The return
 ///   vector is the same type as vec1.
 /// \details On big endian machines VectorShiftLeft() is <tt>vec_sld(a, b,
 ///   c)</tt>. On little endian machines VectorShiftLeft() is translated to
 ///   <tt>vec_sld(b, a, 16-c)</tt>. You should always call the function as
 ///   if on a big endian machine as shown below.
 /// <pre>
 ///    uint8x16_p r0 = {0};
 ///    uint8x16_p r1 = VectorLoad(ptr);
 ///    uint8x16_p r5 = VectorShiftLeft<12>(r0, r1);
 /// </pre>
 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
 ///   endian sensitive?</A> on Stack Overflow
 /// \since Crypto++ 6.0
 template <unsigned int C, class T1, class T2>
 inline T1 VectorShiftLeft(const T1& vec1, const T2& vec2)
 {
 #if CRYPTOPP_BIG_ENDIAN
    enum { R=(C)&0xf };
    return (T1)vec_sld((uint8x16_p)vec1, (uint8x16_p)vec2, R);
 #else
    enum { R=(16-C)&0xf };
    return (T1)vec_sld((uint8x16_p)vec2, (uint8x16_p)vec1, R);
 #endif
 }
 /// \brief Shift a vector left
 /// \tparam C shift byte count
 /// \tparam T vector type
@ -203,16 +169,40 @@ template <unsigned int C, class T>
 inline T VectorShiftLeft(const T& vec)
 {
 #if CRYPTOPP_BIG_ENDIAN
-    enum { R=(C)&0xf };
+    enum { R=(C)&0xf, S=R };
-    const T zero = VectorXor(vec, vec);
+    const T zero = {0};
-    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
+    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, S);
 #else
-    enum { R=(16-C)&0xf };
+    enum { R=(16-C)&0xf, S=R };
-    const T zero = VectorXor(vec, vec);
+    const T zero = {0};
-    return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
+    return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, S);
 #endif
 }
 // Full specializations for 0 over uint8x16_p to uint64x2_p
 template<>
 inline uint8x16_p VectorShiftLeft<0, uint8x16_p>(const uint8x16_p& vec)
 {
    return vec;
 }
 template<>
 inline uint16x8_p VectorShiftLeft<0, uint16x8_p>(const uint16x8_p& vec)
 {
    return vec;
 }
 template<>
 inline uint32x4_p VectorShiftLeft<0, uint32x4_p>(const uint32x4_p& vec)
 {
    return vec;
 }
 #if defined(CRYPTOPP_POWER8_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 template<>
 inline uint64x2_p VectorShiftLeft<0, uint64x2_p>(const uint64x2_p& vec)
 {
    return vec;
 }
 #endif
 /// \brief Shift a vector right
 /// \tparam C shift byte count
 /// \tparam T vector type
@ -235,48 +225,86 @@ template <unsigned int C, class T>
 inline T VectorShiftRight(const T& vec)
 {
 #if CRYPTOPP_BIG_ENDIAN
-    enum { R=(C)&0xf };
+    enum { R=(16-C)&0xf, S=R };
-    const T zero = VectorXor(vec, vec);
+    const T zero = {0};
-    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
+    return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, S);
 #else
-    enum { R=(16-C)&0xf };
+    enum { R=(C)&0xf, S=R };
-    const T zero = VectorXor(vec, vec);
+    const T zero = {0};
-    return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
+    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, S);
 #endif
 }
-/// \brief Shift two vectors right
+// Full specializations for 0 over uint8x16_p to uint64x2_p
-/// \tparam C shift byte count
+template<>
 inline uint8x16_p VectorShiftRight<0, uint8x16_p>(const uint8x16_p& vec)
 {
    return vec;
 }
 template<>
 inline uint16x8_p VectorShiftRight<0, uint16x8_p>(const uint16x8_p& vec)
 {
    return vec;
 }
 template<>
 inline uint32x4_p VectorShiftRight<0, uint32x4_p>(const uint32x4_p& vec)
 {
    return vec;
 }
 #if defined(CRYPTOPP_POWER8_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 template<>
 inline uint64x2_p VectorShiftRight<0, uint64x2_p>(const uint64x2_p& vec)
 {
    return vec;
 }
 #endif
 // Full specializations for 16 over uint8x16_p to uint64x2_p
 template<>
 inline uint8x16_p VectorShiftRight<16, uint8x16_p>(const uint8x16_p& vec)
 {
    return vec;
 }
 template<>
 inline uint16x8_p VectorShiftRight<16, uint16x8_p>(const uint16x8_p& vec)
 {
    return vec;
 }
 template<>
 inline uint32x4_p VectorShiftRight<16, uint32x4_p>(const uint32x4_p& vec)
 {
    return vec;
 }
 #if defined(CRYPTOPP_POWER8_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 template<>
 inline uint64x2_p VectorShiftRight<16, uint64x2_p>(const uint64x2_p& vec)
 {
    return vec;
 }
 #endif
 /// \brief Compare two vectors
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
-/// \details VectorShiftRight() concatenates vec1 and vec2 and returns a
+/// \returns true if vec1 equals vec2, false otherwise
-///   new vector after shifting the concatenation by the specified number
+template <class T1, class T2>
-///   of bytes. Both vec1 and vec2 are cast to uint8x16_p. The return
+inline bool VectorEqual(const T1& vec1, const T2& vec2)
 ///   vector is the same type as vec1.
 /// \details On big endian machines VectorShiftRight() is <tt>vec_sld(b, a,
 ///   16-c)</tt>. On little endian machines VectorShiftRight() is translated to
 ///   <tt>vec_sld(a, b, c)</tt>. You should always call the function as
 ///   if on a big endian machine as shown below.
 /// <pre>
 ///    uint8x16_p r0 = {0};
 ///    uint8x16_p r1 = VectorLoad(ptr);
 ///    uint8x16_p r5 = VectorShiftRight<12>(r0, r1);
 /// </pre>
 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
 ///   endian sensitive?</A> on Stack Overflow
 /// \since Crypto++ 6.0
 template <unsigned int C, class T1, class T2>
 inline T1 VectorShiftRight(const T1& vec1, const T2& vec2)
 {
-#if CRYPTOPP_BIG_ENDIAN
+    return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
-    enum { R=(C)&0xf };
+}
-    return (T1)vec_sld((uint8x16_p)vec1, (uint8x16_p)vec2, R);
+
-#else
+/// \brief Compare two vectors
-    enum { R=(16-C)&0xf };
+/// \tparam T1 vector type
-    return (T1)vec_sld((uint8x16_p)vec2, (uint8x16_p)vec1, R);
+/// \tparam T2 vector type
-#endif
+/// \param vec1 the first vector
 /// \param vec2 the second vector
 /// \returns true if vec1 does not equal vec2, false otherwise
 template <class T1, class T2>
 inline bool VectorNotEqual(const T1& vec1, const T2& vec2)
 {
    return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
 }
 #endif  // POWER4 and above
@ -296,10 +324,10 @@ inline uint32x4_p VectorLoadBE(const uint8_t src[16])
 #if defined(CRYPTOPP_XLC_VERSION)
    return (uint32x4_p)vec_xl_be(0, (byte*)src);
 #else
-# if defined(CRYPTOPP_LITTLE_ENDIAN)
+# if defined(CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)Reverse(vec_vsx_ld(0, src));
 # else
    return (uint32x4_p)vec_vsx_ld(0, src);
 # else
    return (uint32x4_p)Reverse(vec_vsx_ld(0, src));
 # endif
 #endif
 }
@ -317,10 +345,10 @@ inline uint32x4_p VectorLoadBE(int off, const uint8_t src[16])
 #if defined(CRYPTOPP_XLC_VERSION)
    return (uint32x4_p)vec_xl_be(off, (byte*)src);
 #else
-# if defined(CRYPTOPP_LITTLE_ENDIAN)
+# if defined(CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)Reverse(vec_vsx_ld(off, src));
 # else
    return (uint32x4_p)vec_vsx_ld(off, src);
 # else
    return (uint32x4_p)Reverse(vec_vsx_ld(off, src));
 # endif
 #endif
 }
@ -371,10 +399,10 @@ inline void VectorStoreBE(const T& src, uint8_t dest[16])
 #if defined(CRYPTOPP_XLC_VERSION)
    vec_xst_be((uint8x16_p)src, 0, dest);
 #else
-# if defined(CRYPTOPP_LITTLE_ENDIAN)
+# if defined(CRYPTOPP_BIG_ENDIAN)
    vec_vsx_st(Reverse((uint8x16_p)src), 0, dest);
 # else
    vec_vsx_st((uint8x16_p)src, 0, dest);
 # else
    vec_vsx_st((uint8x16_p)Reverse(src), 0, dest);
 # endif
 #endif
 }
@ -395,10 +423,10 @@ inline void VectorStoreBE(const T& src, int off, uint8_t dest[16])
 #if defined(CRYPTOPP_XLC_VERSION)
    vec_xst_be((uint8x16_p)src, off, dest);
 #else
-# if defined(CRYPTOPP_LITTLE_ENDIAN)
+# if defined(CRYPTOPP_BIG_ENDIAN)
    vec_vsx_st(Reverse((uint8x16_p)src), off, dest);
 # else
    vec_vsx_st((uint8x16_p)src, off, dest);
 # else
    vec_vsx_st((uint8x16_p)Reverse(src), off, dest);
 # endif
 #endif
 }
@ -472,37 +500,29 @@ inline uint32x4_p VectorLoadBE(const uint8_t src[16])
 #if defined(CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)VectorLoad(src);
 #else
-    const uint8x16_p data = (uint8x16_p)VectorLoad(src);
+	return (uint32x4_p)Reverse(VectorLoad(src));
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (uint32x4_p)vec_perm(data, data, mask);
 #endif
 }
-inline void VectorStore(const uint32x4_p data, byte dest[16])
+template<class T>
 inline void VectorStore(const T& data, byte dest[16])
 {
 #if defined(CRYPTOPP_LITTLE_ENDIAN)
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    const uint8x16_p t1 = (uint8x16_p)vec_perm(data, data, mask);
 #else
    const uint8x16_p t1 = (uint8x16_p)data;
 #endif
    if (IsAlignedOn(dest, 16))
    {
-        vec_st(t1, 0,  dest);
+        vec_st((uint8x16_p)data, 0,  dest);
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
-        const uint8x16_p t2 = vec_perm(t1, t1, vec_lvsr(0, dest));
+        uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, dest));
-        vec_ste((uint8x16_p) t2,  0, (unsigned char*) dest);
+        vec_ste((uint8x16_p) perm,  0, (unsigned char*) dest);
-        vec_ste((uint16x8_p) t2,  1, (unsigned short*)dest);
+        vec_ste((uint16x8_p) perm,  1, (unsigned short*)dest);
-        vec_ste((uint32x4_p) t2,  3, (unsigned int*)  dest);
+        vec_ste((uint32x4_p) perm,  3, (unsigned int*)  dest);
-        vec_ste((uint32x4_p) t2,  4, (unsigned int*)  dest);
+        vec_ste((uint32x4_p) perm,  4, (unsigned int*)  dest);
-        vec_ste((uint32x4_p) t2,  8, (unsigned int*)  dest);
+        vec_ste((uint32x4_p) perm,  8, (unsigned int*)  dest);
-        vec_ste((uint32x4_p) t2, 12, (unsigned int*)  dest);
+        vec_ste((uint32x4_p) perm, 12, (unsigned int*)  dest);
-        vec_ste((uint16x8_p) t2, 14, (unsigned short*)dest);
+        vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest);
-        vec_ste((uint8x16_p) t2, 15, (unsigned char*) dest);
+        vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest);
    }
 }
@ -521,8 +541,7 @@ inline void VectorStoreBE(const T& src, uint8_t dest[16])
 #if defined(CRYPTOPP_BIG_ENDIAN)
    VectorStore(src, dest);
 #else
-    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+	VectorStore(Reverse(src), dest);
    VectorStore(vec_perm(src, src, mask), dest);
 #endif
 }