Add XLC 12 loads and stores for AIX (PR #907)

Add XLC 12 loads and stores for AIX
2025-02-11 07:45:17 +00:00 · 2019-10-26 22:11:49 -04:00 · 2019-10-26 22:11:49 -04:00 · fa39314b7a
commit fa39314b7a
parent 1bfb8760bb
8 changed files with 102 additions and 45 deletions
--- a/21
+++ b/21
@ -641,8 +641,6 @@ ifeq ($(DETECT_FEATURES),1)
    AES_FLAG = $(POWER8_FLAG)
    ARIA_FLAG = $(POWER8_FLAG)
    BLAKE2B_FLAG = $(POWER8_FLAG)
-    BLAKE2S_FLAG = $(POWER8_FLAG)
-    CHACHA_FLAG = $(POWER8_FLAG)
    CHAM_FLAG = $(POWER8_FLAG)
    CRC_FLAG = $(POWER8_FLAG)
    GCM_FLAG = $(POWER8_FLAG)
@ -688,6 +686,11 @@ ifeq ($(DETECT_FEATURES),1)
    endif
  endif

+  ifneq ($(POWER7_FLAG),)
+    BLAKE2S_FLAG = $(POWER7_FLAG)
+    CHACHA_FLAG = $(POWER7_FLAG)
+  endif
+
  #####################################################################
  # Looking for an Altivec option

@ -727,13 +730,21 @@ ifeq ($(DETECT_FEATURES),1)
  #####################################################################
  # Fixups for algorithms that can drop to a lower ISA, if needed

-  # Drop to Power4 if Power8 not available
-  ifeq ($(POWER8_FLAG),)
-    ifneq ($(ALTIVEC_FLAG),)
+  # Drop to Altivec if higher Power is not available
+  ifneq ($(ALTIVEC_FLAG),)
+    ifeq ($(BLAKE2S_FLAG),)
      BLAKE2S_FLAG = $(ALTIVEC_FLAG)
+    endif
+    ifeq ($(CHACHA_FLAG),)
      CHACHA_FLAG = $(ALTIVEC_FLAG)
+    endif
+    ifeq ($(GCM_FLAG),)
      GCM_FLAG = $(ALTIVEC_FLAG)
+    endif
+    ifeq ($(SIMON64_FLAG),)
      SIMON64_FLAG = $(ALTIVEC_FLAG)
+    endif
+    ifeq ($(SPECK64_FLAG),)
      SPECK64_FLAG = $(ALTIVEC_FLAG)
    endif
  endif
--- a/TestPrograms/test_ppc_power7.cxx
+++ b/TestPrograms/test_ppc_power7.cxx
@ -6,15 +6,36 @@
 // XL C++ on AIX does not define VSX and does not
 // provide an option to set it. We have to set it
 // for the code below. This define must stay in
-// sync with the define in ppc_simd.h
+// sync with the define in ppc_simd.h.
 #if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
 # define __VSX__ 1
 #endif

+// XL C++ v12 on AIX uses vec_xlw4 and vec_xstw4,
+// http://www.ibm.com/support/docview.wss?uid=swg27024210.
+// This define must stay in sync with the define
+// in ppc_simd.h.
+#if defined(_AIX) && defined(_ARCH_PWR7) && ((__xlC__ & 0xff00) == 0x0c00)
+# define XLC_VEC_XLW4 1
+# define XLC_VEC_XSTW4 1
+#endif
+
 #include <altivec.h>
 int main(int argc, char* argv[])
 {
-#if defined(_ARCH_PWR7) && defined(__VSX__)
+#if defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
+    // PWR7
+    __vector unsigned int a = {1,2,3,4};
+    __vector unsigned int b = vec_ld(0, (unsigned int*)argv[0]);
+    __vector unsigned int c = vec_xor(a, b);
+
+    // VSX
+    __vector unsigned int x = {5,6,7,8};
+    __vector unsigned int y = vec_xlw4(0, (unsigned int*)argv[0]);
+    __vector unsigned int z = vec_xor(x, y);
+    __vector unsigned long long xx = {1,2};
+    __vector unsigned long long yy = (__vector unsigned long long)y;
+#elif defined(_ARCH_PWR7) && defined(__VSX__)
    // PWR7
    __vector unsigned int a = {1,2,3,4};
    __vector unsigned int b = vec_ld(0, (unsigned int*)argv[0]);
--- a/blake2.cpp
+++ b/blake2.cpp
@ -5,7 +5,7 @@
 //
 // The BLAKE2b and BLAKE2s numbers are consistent with the BLAKE2 team's
 // numbers. However, we have an Altivec/POWER7 implementation of BLAKE2s,
-// and a POWER8 implementation of BLAKE2b (BLAKE2 is missing them). The
+// and a POWER8 implementation of BLAKE2b (BLAKE2 team is missing them).
 // Altivec/POWER7 code is about 2x faster than C++ when using GCC 5.0 or
 // above. The POWER8 code is about 2.5x faster than C++ when using GCC 5.0
 // or above. If you use GCC 4.0 (PowerMac) or GCC 4.8 (GCC Compile Farm)
@ -181,8 +181,8 @@ extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state);
 extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state);
 #endif

-#if CRYPTOPP_POWER8_AVAILABLE
-extern void BLAKE2_Compress32_POWER8(const byte* input, BLAKE2s_State& state);
+#if CRYPTOPP_POWER7_AVAILABLE
+extern void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state);
 #elif CRYPTOPP_ALTIVEC_AVAILABLE
 extern void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state);
 #endif
@ -243,9 +243,9 @@ unsigned int BLAKE2s::OptimalDataAlignment() const
        return 4;
    else
 #endif
-#if (CRYPTOPP_POWER8_AVAILABLE)
-    if (HasPower8())
-        return 16;
+#if (CRYPTOPP_POWER7_AVAILABLE)
+    if (HasPower7())
+        return 4;
    else
 #elif (CRYPTOPP_ALTIVEC_AVAILABLE)
    if (HasAltivec())
@ -267,9 +267,9 @@ std::string BLAKE2s::AlgorithmProvider() const
        return "NEON";
    else
 #endif
-#if (CRYPTOPP_POWER8_AVAILABLE)
-    if (HasPower8())
-        return "Power8";
+#if (CRYPTOPP_POWER7_AVAILABLE)
+    if (HasPower7())
+        return "Power7";
    else
 #elif (CRYPTOPP_ALTIVEC_AVAILABLE)
    if (HasAltivec())
@ -700,10 +700,10 @@ void BLAKE2s::Compress(const byte *input)
        return BLAKE2_Compress32_NEON(input, m_state);
    }
 #endif
-#if CRYPTOPP_POWER8_AVAILABLE
-    if(HasPower8())
+#if CRYPTOPP_POWER7_AVAILABLE
+    if(HasPower7())
    {
-        return BLAKE2_Compress32_POWER8(input, m_state);
+        return BLAKE2_Compress32_POWER7(input, m_state);
    }
 #elif CRYPTOPP_ALTIVEC_AVAILABLE
    if(HasAltivec())
--- a/blake2s_simd.cpp
+++ b/blake2s_simd.cpp
@ -8,10 +8,10 @@
 //    appropriate instructions sets in some build configurations.

 // The BLAKE2b and BLAKE2s numbers are consistent with the BLAKE2 team's
-// numbers. However, we have an Altivec/POWER8 implementation of BLAKE2s,
-// and a POWER8 implementation of BLAKE2b (BLAKE2 is missing them). The
-// Altivec/POWER8 code is about 2x faster than C++ when using GCC 5.0 or
-// above. The POWER8 code is about 2.5x faster than C++ when using GCC 5.0
+// numbers. However, we have an Altivec/POWER7 implementation of BLAKE2s,
+// and a POWER8 implementation of BLAKE2b (BLAKE2 is missing them). The
+// Altivec/POWER7 code is about 2x faster than C++ when using GCC 5.0 or
+// above. The POWER8 code is about 2.5x faster than C++ when using GCC 5.0
 // or above. If you use GCC 4.0 (PowerMac) or GCC 4.8 (GCC Compile Farm)
 // then the PowerPC code will be slower than C++. Be sure to use GCC 5.0
 // or above for PowerPC builds or disable Altivec for BLAKE2b and BLAKE2s
@ -38,7 +38,7 @@
 // https://github.com/weidai11/cryptopp/issues/743
 #if defined(__xlC__) && (__xlC__ < 0x0d01)
 # define CRYPTOPP_DISABLE_ALTIVEC 1
-# undef CRYPTOPP_POWER8_AVAILABLE
+# undef CRYPTOPP_POWER7_AVAILABLE
 # undef CRYPTOPP_ALTIVEC_AVAILABLE
 #endif

@ -697,7 +697,7 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)
 }
 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE

-#if (CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)

 inline uint32x4_p VecLoad32(const void* p)
 {
@ -868,10 +868,10 @@ uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b,
    return VecPermute(a, c, mask);
 }

-// BLAKE2_Compress32_CORE will use either POWER8 or ALTIVEC,
+// BLAKE2_Compress32_CORE will use either POWER7 or ALTIVEC,
 // depending on the flags used to compile this source file. The
 // abstractions are handled in VecLoad, VecStore and friends. In
-// the future we may provide both POWER8 or ALTIVEC at the same
+// the future we may provide both POWER7 and ALTIVEC at the same
 // time to better support distros.
 void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state)
 {
@ -1020,11 +1020,11 @@ void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state)
    VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)));
    VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)));
 }
-#endif  // CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
+#endif  // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE

-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)

-void BLAKE2_Compress32_POWER8(const byte* input, BLAKE2s_State& state)
+void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
 {
    BLAKE2_Compress32_CORE(input, state);
 }
--- a/chacha.cpp
+++ b/chacha.cpp
@ -28,8 +28,8 @@ extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input,
 extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
 #endif

-#if (CRYPTOPP_POWER8_AVAILABLE)
-extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds);
+#if (CRYPTOPP_POWER7_AVAILABLE)
+extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
 #elif (CRYPTOPP_ALTIVEC_AVAILABLE)
 extern void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds);
 #endif
@ -153,13 +153,13 @@ void ChaCha_OperateKeystream(KeystreamOperation operation,
        }
 #endif

-#if (CRYPTOPP_POWER8_AVAILABLE)
-        if (HasPower8())
+#if (CRYPTOPP_POWER7_AVAILABLE)
+        if (HasPower7())
        {
            while (iterationCount >= 4 && MultiBlockSafe(state[12], 4))
            {
                const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
-                ChaCha_OperateKeystream_POWER8(state, xorInput ? input : NULLPTR, output, rounds);
+                ChaCha_OperateKeystream_POWER7(state, xorInput ? input : NULLPTR, output, rounds);

                // MultiBlockSafe avoids overflow on the counter words
                state[12] += 4;
@ -267,9 +267,9 @@ std::string ChaCha_AlgorithmProvider()
        return "NEON";
    else
 #endif
-#if (CRYPTOPP_POWER8_AVAILABLE)
-    if (HasPower8())
-        return "Power8";
+#if (CRYPTOPP_POWER7_AVAILABLE)
+    if (HasPower7())
+        return "Power7";
    else
 #elif (CRYPTOPP_ALTIVEC_AVAILABLE)
    if (HasAltivec())
--- a/chacha_simd.cpp
+++ b/chacha_simd.cpp
@ -211,7 +211,7 @@ inline __m128i RotateLeft<16>(const __m128i val)

 #if (CRYPTOPP_ALTIVEC_AVAILABLE)

-// ChaCha_OperateKeystream_POWER8 is optimized for POWER7. However, Altivec
+// ChaCha_OperateKeystream_POWER7 is optimized for POWER7. However, Altivec
 // is supported by using vec_ld and vec_st, and using a composite VecAdd
 // that supports 64-bit element adds. vec_ld and vec_st add significant
 // overhead when memory is not aligned. Despite the drawbacks Altivec
@ -825,7 +825,7 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *

 #endif  // CRYPTOPP_SSE2_INTRIN_AVAILABLE

-#if (CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)

 // ChaCha_OperateKeystream_CORE will use either POWER7 or ALTIVEC,
 // depending on the flags used to compile this source file. The
@ -1094,11 +1094,11 @@ inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input,
    VecStore32LE(output + 15*16, r3_3);
 }

-#endif  // CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
+#endif  // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE

-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)

-void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds)
+void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
 {
    ChaCha_OperateKeystream_CORE(state, input, output, rounds);
 }
--- a/ppc_power7.cpp
+++ b/ppc_power7.cpp
@ -64,7 +64,11 @@ bool CPU_ProbePower7()
        // POWER7 added unaligned loads and store operations
        byte b1[19] = {255, 255, 255, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, b2[17];

-        #if defined(_ARCH_PWR7) && defined(__VSX__)
+        // See comments in ppc_simd.h for some of these defines.
+        #if defined(_AIX) && defined(_ARCH_PWR7) && ((__xlC__ & 0xff00) == 0x0c00)
+            vec_xstw4(vec_xlw4(0, (unsigned int*)(b1+3)), 0, (unsigned int*)(b2+1));
+            result = (0 == std::memcmp(b1+3, b2+1, 16));
+        #elif defined(_ARCH_PWR7) && defined(__VSX__)
            vec_xst(vec_xl(0, (unsigned int*)(b1+3)), 0, (unsigned int*)(b2+1));
            result = (0 == std::memcmp(b1+3, b2+1, 16));
        #else
--- a/ppc_simd.h
+++ b/ppc_simd.h
@ -87,11 +87,20 @@
 // XL C++ on AIX does not define VSX and does not
 // provide an option to set it. We have to set it
 // for the code below. This define must stay in
-// sync with the define in test_ppc_power7.cxx
+// sync with the define in test_ppc_power7.cxx.
 #if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
 # define __VSX__ 1
 #endif

+// XL C++ v12 on AIX uses vec_xlw4 and vec_xstw4,
+// http://www.ibm.com/support/docview.wss?uid=swg27024210.
+// This define must stay in sync with the define
+// in test_ppc_power7.cxx.
+#if defined(_AIX) && defined(_ARCH_PWR7) && ((__xlC__ & 0xff00) == 0x0c00)
+# define XLC_VEC_XLW4 1
+# define XLC_VEC_XSTW4 1
+#endif
+
 // XL C++ on AIX does not define CRYPTO and does not
 // provide an option to set it. We have to set it
 // for the code below. This define must stay in
@ -280,6 +289,8 @@ inline uint32x4_p VecLoad(const byte src[16])
 #if defined(_ARCH_PWR9)
    // ISA 3.0 provides vec_xl for short* and char*
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
+#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
+    return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
 #elif defined(_ARCH_PWR7) && defined(__VSX__)
    // ISA 2.06 provides vec_xl, but it lacks short* and char*
    return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
@ -305,6 +316,8 @@ inline uint32x4_p VecLoad(int off, const byte src[16])
 #if defined(_ARCH_PWR9)
    // ISA 3.0 provides vec_xl for short* and char*
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
+#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
+    return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
 #elif defined(_ARCH_PWR7) && defined(__VSX__)
    // ISA 2.06 provides vec_xl, but it lacks short* and char*
    return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
@ -400,6 +413,8 @@ inline uint32x4_p VecLoadAligned(const byte src[16])
 #if defined(_ARCH_PWR9)
    // ISA 3.0 provides vec_xl for short* and char*
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
+#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
+    return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
 #elif defined(_ARCH_PWR7) && defined(__VSX__)
    // ISA 2.06 provides vec_xl, but it lacks short* and char*
    return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
@ -423,6 +438,8 @@ inline uint32x4_p VecLoadAligned(int off, const byte src[16])
 #if defined(_ARCH_PWR9)
    // ISA 3.0 provides vec_xl for short* and char*
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
+#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
+    return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
 #elif defined(_ARCH_PWR7) && defined(__VSX__)
    // ISA 2.06 provides vec_xl, but it lacks short* and char*
    return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
@ -580,6 +597,8 @@ inline void VecStore(const T data, byte dest[16])
 #if defined(_ARCH_PWR9)
    // ISA 3.0 provides vec_xl for short* and char*
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
+#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XSTW4)
+    vec_xstw4((uint32x4_p)data, off, NCONST_V32_CAST(dest));
 #elif defined(_ARCH_PWR7) && defined(__VSX__)
    // ISA 2.06 provides vec_xl, but it lacks short* and char*
    vec_xst((uint32x4_p)data, off, NCONST_V32_CAST(dest));
@ -608,6 +627,8 @@ inline void VecStore(const T data, int off, byte dest[16])
 #if defined(_ARCH_PWR9)
    // ISA 3.0 provides vec_xl for short* and char*
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
+#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XSTW4)
+    vec_xstw4((uint32x4_p)data, off, NCONST_V32_CAST(dest));
 #elif defined(_ARCH_PWR7) && defined(__VSX__)
    // ISA 2.06 provides vec_xl, but it lacks short* and char*
    vec_xst((uint32x4_p)data, off, NCONST_V32_CAST(dest));