Add XLC 12 loads and stores for AIX (PR #907)

Add XLC 12 loads and stores for AIX
This commit is contained in:
Jeffrey Walton 2019-10-26 22:11:49 -04:00 committed by GitHub
parent 1bfb8760bb
commit fa39314b7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 102 additions and 45 deletions

View File

@ -641,8 +641,6 @@ ifeq ($(DETECT_FEATURES),1)
AES_FLAG = $(POWER8_FLAG)
ARIA_FLAG = $(POWER8_FLAG)
BLAKE2B_FLAG = $(POWER8_FLAG)
BLAKE2S_FLAG = $(POWER8_FLAG)
CHACHA_FLAG = $(POWER8_FLAG)
CHAM_FLAG = $(POWER8_FLAG)
CRC_FLAG = $(POWER8_FLAG)
GCM_FLAG = $(POWER8_FLAG)
@ -688,6 +686,11 @@ ifeq ($(DETECT_FEATURES),1)
endif
endif
ifneq ($(POWER7_FLAG),)
BLAKE2S_FLAG = $(POWER7_FLAG)
CHACHA_FLAG = $(POWER7_FLAG)
endif
#####################################################################
# Looking for an Altivec option
@ -727,13 +730,21 @@ ifeq ($(DETECT_FEATURES),1)
#####################################################################
# Fixups for algorithms that can drop to a lower ISA, if needed
# Drop to Power4 if Power8 not available
ifeq ($(POWER8_FLAG),)
ifneq ($(ALTIVEC_FLAG),)
# Drop to Altivec if higher Power is not available
ifneq ($(ALTIVEC_FLAG),)
ifeq ($(BLAKE2S_FLAG),)
BLAKE2S_FLAG = $(ALTIVEC_FLAG)
endif
ifeq ($(CHACHA_FLAG),)
CHACHA_FLAG = $(ALTIVEC_FLAG)
endif
ifeq ($(GCM_FLAG),)
GCM_FLAG = $(ALTIVEC_FLAG)
endif
ifeq ($(SIMON64_FLAG),)
SIMON64_FLAG = $(ALTIVEC_FLAG)
endif
ifeq ($(SPECK64_FLAG),)
SPECK64_FLAG = $(ALTIVEC_FLAG)
endif
endif

View File

@ -6,15 +6,36 @@
// XL C++ on AIX does not define VSX and does not
// provide an option to set it. We have to set it
// for the code below. This define must stay in
// sync with the define in ppc_simd.h
// sync with the define in ppc_simd.h.
#if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
# define __VSX__ 1
#endif
// XL C++ v12 on AIX uses vec_xlw4 and vec_xstw4,
// http://www.ibm.com/support/docview.wss?uid=swg27024210.
// This define must stay in sync with the define
// in ppc_simd.h.
#if defined(_AIX) && defined(_ARCH_PWR7) && ((__xlC__ & 0xff00) == 0x0c00)
# define XLC_VEC_XLW4 1
# define XLC_VEC_XSTW4 1
#endif
#include <altivec.h>
int main(int argc, char* argv[])
{
#if defined(_ARCH_PWR7) && defined(__VSX__)
#if defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
// PWR7
__vector unsigned int a = {1,2,3,4};
__vector unsigned int b = vec_ld(0, (unsigned int*)argv[0]);
__vector unsigned int c = vec_xor(a, b);
// VSX
__vector unsigned int x = {5,6,7,8};
__vector unsigned int y = vec_xlw4(0, (unsigned int*)argv[0]);
__vector unsigned int z = vec_xor(x, y);
__vector unsigned long long xx = {1,2};
__vector unsigned long long yy = (__vector unsigned long long)y;
#elif defined(_ARCH_PWR7) && defined(__VSX__)
// PWR7
__vector unsigned int a = {1,2,3,4};
__vector unsigned int b = vec_ld(0, (unsigned int*)argv[0]);

View File

@ -5,7 +5,7 @@
//
// The BLAKE2b and BLAKE2s numbers are consistent with the BLAKE2 team's
// numbers. However, we have an Altivec/POWER7 implementation of BLAKE2s,
// and a POWER8 implementation of BLAKE2b (BLAKE2 is missing them). The
// and a POWER8 implementation of BLAKE2b (BLAKE2 team is missing them). The
// Altivec/POWER7 code is about 2x faster than C++ when using GCC 5.0 or
// above. The POWER8 code is about 2.5x faster than C++ when using GCC 5.0
// or above. If you use GCC 4.0 (PowerMac) or GCC 4.8 (GCC Compile Farm)
@ -181,8 +181,8 @@ extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state);
extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state);
#endif
#if CRYPTOPP_POWER8_AVAILABLE
extern void BLAKE2_Compress32_POWER8(const byte* input, BLAKE2s_State& state);
#if CRYPTOPP_POWER7_AVAILABLE
extern void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state);
#elif CRYPTOPP_ALTIVEC_AVAILABLE
extern void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state);
#endif
@ -243,9 +243,9 @@ unsigned int BLAKE2s::OptimalDataAlignment() const
return 4;
else
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
return 16;
#if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower7())
return 4;
else
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
@ -267,9 +267,9 @@ std::string BLAKE2s::AlgorithmProvider() const
return "NEON";
else
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
return "Power8";
#if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower7())
return "Power7";
else
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
@ -700,10 +700,10 @@ void BLAKE2s::Compress(const byte *input)
return BLAKE2_Compress32_NEON(input, m_state);
}
#endif
#if CRYPTOPP_POWER8_AVAILABLE
if(HasPower8())
#if CRYPTOPP_POWER7_AVAILABLE
if(HasPower7())
{
return BLAKE2_Compress32_POWER8(input, m_state);
return BLAKE2_Compress32_POWER7(input, m_state);
}
#elif CRYPTOPP_ALTIVEC_AVAILABLE
if(HasAltivec())

View File

@ -8,10 +8,10 @@
// appropriate instructions sets in some build configurations.
// The BLAKE2b and BLAKE2s numbers are consistent with the BLAKE2 team's
// numbers. However, we have an Altivec/POWER8 implementation of BLAKE2s,
// and a POWER8 implementation of BLAKE2b (BLAKE2 is missing them). The
// Altivec/POWER8 code is about 2x faster than C++ when using GCC 5.0 or
// above. The POWER8 code is about 2.5x faster than C++ when using GCC 5.0
// numbers. However, we have an Altivec/POWER7 implementation of BLAKE2s,
// and a POWER8 implementation of BLAKE2b (BLAKE2 is missing them). The
// Altivec/POWER7 code is about 2x faster than C++ when using GCC 5.0 or
// above. The POWER8 code is about 2.5x faster than C++ when using GCC 5.0
// or above. If you use GCC 4.0 (PowerMac) or GCC 4.8 (GCC Compile Farm)
// then the PowerPC code will be slower than C++. Be sure to use GCC 5.0
// or above for PowerPC builds or disable Altivec for BLAKE2b and BLAKE2s
@ -38,7 +38,7 @@
// https://github.com/weidai11/cryptopp/issues/743
#if defined(__xlC__) && (__xlC__ < 0x0d01)
# define CRYPTOPP_DISABLE_ALTIVEC 1
# undef CRYPTOPP_POWER8_AVAILABLE
# undef CRYPTOPP_POWER7_AVAILABLE
# undef CRYPTOPP_ALTIVEC_AVAILABLE
#endif
@ -697,7 +697,7 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
inline uint32x4_p VecLoad32(const void* p)
{
@ -868,10 +868,10 @@ uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b,
return VecPermute(a, c, mask);
}
// BLAKE2_Compress32_CORE will use either POWER8 or ALTIVEC,
// BLAKE2_Compress32_CORE will use either POWER7 or ALTIVEC,
// depending on the flags used to compile this source file. The
// abstractions are handled in VecLoad, VecStore and friends. In
// the future we may provide both POWER8 or ALTIVEC at the same
// the future we may provide both POWER7 or ALTIVEC at the same
// time to better support distros.
void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state)
{
@ -1020,11 +1020,11 @@ void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state)
VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)));
VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)));
}
#endif // CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
#endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
#if (CRYPTOPP_POWER8_AVAILABLE)
#if (CRYPTOPP_POWER7_AVAILABLE)
void BLAKE2_Compress32_POWER8(const byte* input, BLAKE2s_State& state)
void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
{
BLAKE2_Compress32_CORE(input, state);
}

View File

@ -28,8 +28,8 @@ extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input,
extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#if (CRYPTOPP_POWER7_AVAILABLE)
extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
extern void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif
@ -153,13 +153,13 @@ void ChaCha_OperateKeystream(KeystreamOperation operation,
}
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
#if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower7())
{
while (iterationCount >= 4 && MultiBlockSafe(state[12], 4))
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_POWER8(state, xorInput ? input : NULLPTR, output, rounds);
ChaCha_OperateKeystream_POWER7(state, xorInput ? input : NULLPTR, output, rounds);
// MultiBlockSafe avoids overflow on the counter words
state[12] += 4;
@ -267,9 +267,9 @@ std::string ChaCha_AlgorithmProvider()
return "NEON";
else
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
return "Power8";
#if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower7())
return "Power7";
else
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())

View File

@ -211,7 +211,7 @@ inline __m128i RotateLeft<16>(const __m128i val)
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
// ChaCha_OperateKeystream_POWER8 is optimized for POWER7. However, Altivec
// ChaCha_OperateKeystream_POWER7 is optimized for POWER7. However, Altivec
// is supported by using vec_ld and vec_st, and using a composite VecAdd
// that supports 64-bit element adds. vec_ld and vec_st add significant
// overhead when memory is not aligned. Despite the drawbacks Altivec
@ -825,7 +825,7 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE
#if (CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
// ChaCha_OperateKeystream_CORE will use either POWER7 or ALTIVEC,
// depending on the flags used to compile this source file. The
@ -1094,11 +1094,11 @@ inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input,
VecStore32LE(output + 15*16, r3_3);
}
#endif // CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
#endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
#if (CRYPTOPP_POWER8_AVAILABLE)
#if (CRYPTOPP_POWER7_AVAILABLE)
void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds)
void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
ChaCha_OperateKeystream_CORE(state, input, output, rounds);
}

View File

@ -64,7 +64,11 @@ bool CPU_ProbePower7()
// POWER7 added unaligned loads and store operations
byte b1[19] = {255, 255, 255, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, b2[17];
#if defined(_ARCH_PWR7) && defined(__VSX__)
// See comments in ppc_simd.h for some of these defines.
#if defined(_AIX) && defined(_ARCH_PWR7) && ((__xlC__ & 0xff00) == 0x0c00)
vec_xstw4(vec_xlw4(0, (unsigned int*)(b1+3)), 0, (unsigned int*)(b2+1));
result = (0 == std::memcmp(b1+3, b2+1, 16));
#elif defined(_ARCH_PWR7) && defined(__VSX__)
vec_xst(vec_xl(0, (unsigned int*)(b1+3)), 0, (unsigned int*)(b2+1));
result = (0 == std::memcmp(b1+3, b2+1, 16));
#else

View File

@ -87,11 +87,20 @@
// XL C++ on AIX does not define VSX and does not
// provide an option to set it. We have to set it
// for the code below. This define must stay in
// sync with the define in test_ppc_power7.cxx
// sync with the define in test_ppc_power7.cxx.
#if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
# define __VSX__ 1
#endif
// XL C++ v12 on AIX uses vec_xlw4 and vec_xstw4,
// http://www.ibm.com/support/docview.wss?uid=swg27024210.
// This define must stay in sync with the define
// in test_ppc_power7.cxx.
#if defined(_AIX) && defined(_ARCH_PWR7) && ((__xlC__ & 0xff00) == 0x0c00)
# define XLC_VEC_XLW4 1
# define XLC_VEC_XSTW4 1
#endif
// XL C++ on AIX does not define CRYPTO and does not
// provide an option to set it. We have to set it
// for the code below. This define must stay in
@ -280,6 +289,8 @@ inline uint32x4_p VecLoad(const byte src[16])
#if defined(_ARCH_PWR9)
// ISA 3.0 provides vec_xl for short* and char*
return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
#elif defined(_ARCH_PWR7) && defined(__VSX__)
// ISA 2.06 provides vec_xl, but it lacks short* and char*
return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
@ -305,6 +316,8 @@ inline uint32x4_p VecLoad(int off, const byte src[16])
#if defined(_ARCH_PWR9)
// ISA 3.0 provides vec_xl for short* and char*
return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
#elif defined(_ARCH_PWR7) && defined(__VSX__)
// ISA 2.06 provides vec_xl, but it lacks short* and char*
return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
@ -400,6 +413,8 @@ inline uint32x4_p VecLoadAligned(const byte src[16])
#if defined(_ARCH_PWR9)
// ISA 3.0 provides vec_xl for short* and char*
return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
#elif defined(_ARCH_PWR7) && defined(__VSX__)
// ISA 2.06 provides vec_xl, but it lacks short* and char*
return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
@ -423,6 +438,8 @@ inline uint32x4_p VecLoadAligned(int off, const byte src[16])
#if defined(_ARCH_PWR9)
// ISA 3.0 provides vec_xl for short* and char*
return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
#elif defined(_ARCH_PWR7) && defined(__VSX__)
// ISA 2.06 provides vec_xl, but it lacks short* and char*
return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
@ -580,6 +597,8 @@ inline void VecStore(const T data, byte dest[16])
#if defined(_ARCH_PWR9)
// ISA 3.0 provides vec_xst for short* and char*
vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XSTW4)
vec_xstw4((uint32x4_p)data, off, NCONST_V32_CAST(dest));
#elif defined(_ARCH_PWR7) && defined(__VSX__)
// ISA 2.06 provides vec_xst, but it lacks short* and char*
vec_xst((uint32x4_p)data, off, NCONST_V32_CAST(dest));
@ -608,6 +627,8 @@ inline void VecStore(const T data, int off, byte dest[16])
#if defined(_ARCH_PWR9)
// ISA 3.0 provides vec_xst for short* and char*
vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XSTW4)
vec_xstw4((uint32x4_p)data, off, NCONST_V32_CAST(dest));
#elif defined(_ARCH_PWR7) && defined(__VSX__)
// ISA 2.06 provides vec_xst, but it lacks short* and char*
vec_xst((uint32x4_p)data, off, NCONST_V32_CAST(dest));