mirror of
https://github.com/shadps4-emu/ext-cryptopp.git
synced 2025-02-11 07:45:17 +00:00
Add XLC 12 loads and stores for AIX (PR #907)
Add XLC 12 loads and stores for AIX
This commit is contained in:
parent
1bfb8760bb
commit
fa39314b7a
21
GNUmakefile
21
GNUmakefile
@ -641,8 +641,6 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
AES_FLAG = $(POWER8_FLAG)
|
||||
ARIA_FLAG = $(POWER8_FLAG)
|
||||
BLAKE2B_FLAG = $(POWER8_FLAG)
|
||||
BLAKE2S_FLAG = $(POWER8_FLAG)
|
||||
CHACHA_FLAG = $(POWER8_FLAG)
|
||||
CHAM_FLAG = $(POWER8_FLAG)
|
||||
CRC_FLAG = $(POWER8_FLAG)
|
||||
GCM_FLAG = $(POWER8_FLAG)
|
||||
@ -688,6 +686,11 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(POWER7_FLAG),)
|
||||
BLAKE2S_FLAG = $(POWER7_FLAG)
|
||||
CHACHA_FLAG = $(POWER7_FLAG)
|
||||
endif
|
||||
|
||||
#####################################################################
|
||||
# Looking for an Altivec option
|
||||
|
||||
@ -727,13 +730,21 @@ ifeq ($(DETECT_FEATURES),1)
|
||||
#####################################################################
|
||||
# Fixups for algorithms that can drop to a lower ISA, if needed
|
||||
|
||||
# Drop to Power4 if Power8 not available
|
||||
ifeq ($(POWER8_FLAG),)
|
||||
ifneq ($(ALTIVEC_FLAG),)
|
||||
# Drop to Altivec if higher Power is not available
|
||||
ifneq ($(ALTIVEC_FLAG),)
|
||||
ifeq ($(BLAKE2S_FLAG),)
|
||||
BLAKE2S_FLAG = $(ALTIVEC_FLAG)
|
||||
endif
|
||||
ifeq ($(CHACHA_FLAG),)
|
||||
CHACHA_FLAG = $(ALTIVEC_FLAG)
|
||||
endif
|
||||
ifeq ($(GCM_FLAG),)
|
||||
GCM_FLAG = $(ALTIVEC_FLAG)
|
||||
endif
|
||||
ifeq ($(SIMON64_FLAG),)
|
||||
SIMON64_FLAG = $(ALTIVEC_FLAG)
|
||||
endif
|
||||
ifeq ($(SPECK64_FLAG),)
|
||||
SPECK64_FLAG = $(ALTIVEC_FLAG)
|
||||
endif
|
||||
endif
|
||||
|
@ -6,15 +6,36 @@
|
||||
// XL C++ on AIX does not define VSX and does not
|
||||
// provide an option to set it. We have to set it
|
||||
// for the code below. This define must stay in
|
||||
// sync with the define in ppc_simd.h
|
||||
// sync with the define in ppc_simd.h.
|
||||
#if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
|
||||
# define __VSX__ 1
|
||||
#endif
|
||||
|
||||
// XL C++ v12 on AIX uses vec_xlw4 and vec_xstw4,
|
||||
// http://www.ibm.com/support/docview.wss?uid=swg27024210.
|
||||
// This define must stay in sync with the define
|
||||
// in ppc_simd.h.
|
||||
#if defined(_AIX) && defined(_ARCH_PWR7) && ((__xlC__ & 0xff00) == 0x0c00)
|
||||
# define XLC_VEC_XLW4 1
|
||||
# define XLC_VEC_XSTW4 1
|
||||
#endif
|
||||
|
||||
#include <altivec.h>
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
#if defined(_ARCH_PWR7) && defined(__VSX__)
|
||||
#if defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
|
||||
// PWR7
|
||||
__vector unsigned int a = {1,2,3,4};
|
||||
__vector unsigned int b = vec_ld(0, (unsigned int*)argv[0]);
|
||||
__vector unsigned int c = vec_xor(a, b);
|
||||
|
||||
// VSX
|
||||
__vector unsigned int x = {5,6,7,8};
|
||||
__vector unsigned int y = vec_xlw4(0, (unsigned int*)argv[0]);
|
||||
__vector unsigned int z = vec_xor(x, y);
|
||||
__vector unsigned long long xx = {1,2};
|
||||
__vector unsigned long long yy = (__vector unsigned long long)y;
|
||||
#elif defined(_ARCH_PWR7) && defined(__VSX__)
|
||||
// PWR7
|
||||
__vector unsigned int a = {1,2,3,4};
|
||||
__vector unsigned int b = vec_ld(0, (unsigned int*)argv[0]);
|
||||
|
24
blake2.cpp
24
blake2.cpp
@ -5,7 +5,7 @@
|
||||
//
|
||||
// The BLAKE2b and BLAKE2s numbers are consistent with the BLAKE2 team's
|
||||
// numbers. However, we have an Altivec/POWER7 implementation of BLAKE2s,
|
||||
// and a POWER8 implementation of BLAKE2b (BLAKE2 is missing them). The
|
||||
// and a POWER8 implementation of BLAKE2b (BLAKE2 team is missing them).
|
||||
// Altivec/POWER7 code is about 2x faster than C++ when using GCC 5.0 or
|
||||
// above. The POWER8 code is about 2.5x faster than C++ when using GCC 5.0
|
||||
// or above. If you use GCC 4.0 (PowerMac) or GCC 4.8 (GCC Compile Farm)
|
||||
@ -181,8 +181,8 @@ extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state);
|
||||
extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state);
|
||||
#endif
|
||||
|
||||
#if CRYPTOPP_POWER8_AVAILABLE
|
||||
extern void BLAKE2_Compress32_POWER8(const byte* input, BLAKE2s_State& state);
|
||||
#if CRYPTOPP_POWER7_AVAILABLE
|
||||
extern void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state);
|
||||
#elif CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
extern void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state);
|
||||
#endif
|
||||
@ -243,9 +243,9 @@ unsigned int BLAKE2s::OptimalDataAlignment() const
|
||||
return 4;
|
||||
else
|
||||
#endif
|
||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||
if (HasPower8())
|
||||
return 16;
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
if (HasPower7())
|
||||
return 4;
|
||||
else
|
||||
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
@ -267,9 +267,9 @@ std::string BLAKE2s::AlgorithmProvider() const
|
||||
return "NEON";
|
||||
else
|
||||
#endif
|
||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||
if (HasPower8())
|
||||
return "Power8";
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
if (HasPower7())
|
||||
return "Power7";
|
||||
else
|
||||
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
@ -700,10 +700,10 @@ void BLAKE2s::Compress(const byte *input)
|
||||
return BLAKE2_Compress32_NEON(input, m_state);
|
||||
}
|
||||
#endif
|
||||
#if CRYPTOPP_POWER8_AVAILABLE
|
||||
if(HasPower8())
|
||||
#if CRYPTOPP_POWER7_AVAILABLE
|
||||
if(HasPower7())
|
||||
{
|
||||
return BLAKE2_Compress32_POWER8(input, m_state);
|
||||
return BLAKE2_Compress32_POWER7(input, m_state);
|
||||
}
|
||||
#elif CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
if(HasAltivec())
|
||||
|
@ -8,10 +8,10 @@
|
||||
// appropriate instruction sets in some build configurations.
|
||||
|
||||
// The BLAKE2b and BLAKE2s numbers are consistent with the BLAKE2 team's
|
||||
// numbers. However, we have an Altivec/POWER8 implementation of BLAKE2s,
|
||||
// and a POWER8 implementation of BLAKE2b (BLAKE2 is missing them). The
|
||||
// Altivec/POWER8 code is about 2x faster than C++ when using GCC 5.0 or
|
||||
// above. The POWER8 code is about 2.5x faster than C++ when using GCC 5.0
|
||||
// numbers. However, we have an Altivec/POWER7 implementation of BLAKE2s,
|
||||
// and a POWER8 implementation of BLAKE2b (BLAKE2 is missing them). The
|
||||
// Altivec/POWER7 code is about 2x faster than C++ when using GCC 5.0 or
|
||||
// above. The POWER8 code is about 2.5x faster than C++ when using GCC 5.0
|
||||
// or above. If you use GCC 4.0 (PowerMac) or GCC 4.8 (GCC Compile Farm)
|
||||
// then the PowerPC code will be slower than C++. Be sure to use GCC 5.0
|
||||
// or above for PowerPC builds or disable Altivec for BLAKE2b and BLAKE2s
|
||||
@ -38,7 +38,7 @@
|
||||
// https://github.com/weidai11/cryptopp/issues/743
|
||||
#if defined(__xlC__) && (__xlC__ < 0x0d01)
|
||||
# define CRYPTOPP_DISABLE_ALTIVEC 1
|
||||
# undef CRYPTOPP_POWER8_AVAILABLE
|
||||
# undef CRYPTOPP_POWER7_AVAILABLE
|
||||
# undef CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
#endif
|
||||
|
||||
@ -697,7 +697,7 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)
|
||||
}
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
|
||||
inline uint32x4_p VecLoad32(const void* p)
|
||||
{
|
||||
@ -868,10 +868,10 @@ uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b,
|
||||
return VecPermute(a, c, mask);
|
||||
}
|
||||
|
||||
// BLAKE2_Compress32_CORE will use either POWER8 or ALTIVEC,
|
||||
// BLAKE2_Compress32_CORE will use either POWER7 or ALTIVEC,
|
||||
// depending on the flags used to compile this source file. The
|
||||
// abstractions are handled in VecLoad, VecStore and friends. In
|
||||
// the future we may provide both POWER8 or ALTIVEC at the same
|
||||
// the future we may provide both POWER7 and ALTIVEC at the same
|
||||
// time to better support distros.
|
||||
void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state)
|
||||
{
|
||||
@ -1020,11 +1020,11 @@ void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state)
|
||||
VecStore32LE(state.h()+0, VecXor(ff0, VecXor(row1, row3)));
|
||||
VecStore32LE(state.h()+4, VecXor(ff1, VecXor(row2, row4)));
|
||||
}
|
||||
#endif // CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
#endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
|
||||
void BLAKE2_Compress32_POWER8(const byte* input, BLAKE2s_State& state)
|
||||
void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
|
||||
{
|
||||
BLAKE2_Compress32_CORE(input, state);
|
||||
}
|
||||
|
16
chacha.cpp
16
chacha.cpp
@ -28,8 +28,8 @@ extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input,
|
||||
extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||
extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
extern void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||
#endif
|
||||
@ -153,13 +153,13 @@ void ChaCha_OperateKeystream(KeystreamOperation operation,
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||
if (HasPower8())
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
if (HasPower7())
|
||||
{
|
||||
while (iterationCount >= 4 && MultiBlockSafe(state[12], 4))
|
||||
{
|
||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||
ChaCha_OperateKeystream_POWER8(state, xorInput ? input : NULLPTR, output, rounds);
|
||||
ChaCha_OperateKeystream_POWER7(state, xorInput ? input : NULLPTR, output, rounds);
|
||||
|
||||
// MultiBlockSafe avoids overflow on the counter words
|
||||
state[12] += 4;
|
||||
@ -267,9 +267,9 @@ std::string ChaCha_AlgorithmProvider()
|
||||
return "NEON";
|
||||
else
|
||||
#endif
|
||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||
if (HasPower8())
|
||||
return "Power8";
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
if (HasPower7())
|
||||
return "Power7";
|
||||
else
|
||||
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
|
@ -211,7 +211,7 @@ inline __m128i RotateLeft<16>(const __m128i val)
|
||||
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
|
||||
// ChaCha_OperateKeystream_POWER8 is optimized for POWER7. However, Altivec
|
||||
// ChaCha_OperateKeystream_POWER7 is optimized for POWER7. However, Altivec
|
||||
// is supported by using vec_ld and vec_st, and using a composite VecAdd
|
||||
// that supports 64-bit element adds. vec_ld and vec_st add significant
|
||||
// overhead when memory is not aligned. Despite the drawbacks Altivec
|
||||
@ -825,7 +825,7 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
|
||||
|
||||
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
|
||||
// ChaCha_OperateKeystream_CORE will use either POWER7 or ALTIVEC,
|
||||
// depending on the flags used to compile this source file. The
|
||||
@ -1094,11 +1094,11 @@ inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input,
|
||||
VecStore32LE(output + 15*16, r3_3);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
#endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
|
||||
void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds)
|
||||
void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
|
||||
{
|
||||
ChaCha_OperateKeystream_CORE(state, input, output, rounds);
|
||||
}
|
||||
|
@ -64,7 +64,11 @@ bool CPU_ProbePower7()
|
||||
// POWER7 added unaligned load and store operations
|
||||
byte b1[19] = {255, 255, 255, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, b2[17];
|
||||
|
||||
#if defined(_ARCH_PWR7) && defined(__VSX__)
|
||||
// See comments in ppc_simd.h for some of these defines.
|
||||
#if defined(_AIX) && defined(_ARCH_PWR7) && ((__xlC__ & 0xff00) == 0x0c00)
|
||||
vec_xstw4(vec_xlw4(0, (unsigned int*)(b1+3)), 0, (unsigned int*)(b2+1));
|
||||
result = (0 == std::memcmp(b1+3, b2+1, 16));
|
||||
#elif defined(_ARCH_PWR7) && defined(__VSX__)
|
||||
vec_xst(vec_xl(0, (unsigned int*)(b1+3)), 0, (unsigned int*)(b2+1));
|
||||
result = (0 == std::memcmp(b1+3, b2+1, 16));
|
||||
#else
|
||||
|
23
ppc_simd.h
23
ppc_simd.h
@ -87,11 +87,20 @@
|
||||
// XL C++ on AIX does not define VSX and does not
|
||||
// provide an option to set it. We have to set it
|
||||
// for the code below. This define must stay in
|
||||
// sync with the define in test_ppc_power7.cxx
|
||||
// sync with the define in test_ppc_power7.cxx.
|
||||
#if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
|
||||
# define __VSX__ 1
|
||||
#endif
|
||||
|
||||
// XL C++ v12 on AIX uses vec_xlw4 and vec_xstw4,
|
||||
// http://www.ibm.com/support/docview.wss?uid=swg27024210.
|
||||
// This define must stay in sync with the define
|
||||
// in test_ppc_power7.cxx.
|
||||
#if defined(_AIX) && defined(_ARCH_PWR7) && ((__xlC__ & 0xff00) == 0x0c00)
|
||||
# define XLC_VEC_XLW4 1
|
||||
# define XLC_VEC_XSTW4 1
|
||||
#endif
|
||||
|
||||
// XL C++ on AIX does not define CRYPTO and does not
|
||||
// provide an option to set it. We have to set it
|
||||
// for the code below. This define must stay in
|
||||
@ -280,6 +289,8 @@ inline uint32x4_p VecLoad(const byte src[16])
|
||||
#if defined(_ARCH_PWR9)
|
||||
// ISA 3.0 provides vec_xl for short* and char*
|
||||
return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
|
||||
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
|
||||
return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
|
||||
#elif defined(_ARCH_PWR7) && defined(__VSX__)
|
||||
// ISA 2.06 provides vec_xl, but it lacks short* and char*
|
||||
return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
|
||||
@ -305,6 +316,8 @@ inline uint32x4_p VecLoad(int off, const byte src[16])
|
||||
#if defined(_ARCH_PWR9)
|
||||
// ISA 3.0 provides vec_xl for short* and char*
|
||||
return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
|
||||
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
|
||||
return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
|
||||
#elif defined(_ARCH_PWR7) && defined(__VSX__)
|
||||
// ISA 2.06 provides vec_xl, but it lacks short* and char*
|
||||
return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
|
||||
@ -400,6 +413,8 @@ inline uint32x4_p VecLoadAligned(const byte src[16])
|
||||
#if defined(_ARCH_PWR9)
|
||||
// ISA 3.0 provides vec_xl for short* and char*
|
||||
return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
|
||||
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
|
||||
return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
|
||||
#elif defined(_ARCH_PWR7) && defined(__VSX__)
|
||||
// ISA 2.06 provides vec_xl, but it lacks short* and char*
|
||||
return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
|
||||
@ -423,6 +438,8 @@ inline uint32x4_p VecLoadAligned(int off, const byte src[16])
|
||||
#if defined(_ARCH_PWR9)
|
||||
// ISA 3.0 provides vec_xl for short* and char*
|
||||
return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
|
||||
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XLW4)
|
||||
return (uint32x4_p)vec_xlw4(off, CONST_V32_CAST(src));
|
||||
#elif defined(_ARCH_PWR7) && defined(__VSX__)
|
||||
// ISA 2.06 provides vec_xl, but it lacks short* and char*
|
||||
return (uint32x4_p)vec_xl(off, CONST_V32_CAST(src));
|
||||
@ -580,6 +597,8 @@ inline void VecStore(const T data, byte dest[16])
|
||||
#if defined(_ARCH_PWR9)
|
||||
// ISA 3.0 provides vec_xst for short* and char*
|
||||
vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
|
||||
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XSTW4)
|
||||
vec_xstw4((uint32x4_p)data, off, NCONST_V32_CAST(dest));
|
||||
#elif defined(_ARCH_PWR7) && defined(__VSX__)
|
||||
// ISA 2.06 provides vec_xst, but it lacks short* and char*
|
||||
vec_xst((uint32x4_p)data, off, NCONST_V32_CAST(dest));
|
||||
@ -608,6 +627,8 @@ inline void VecStore(const T data, int off, byte dest[16])
|
||||
#if defined(_ARCH_PWR9)
|
||||
// ISA 3.0 provides vec_xst for short* and char*
|
||||
vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
|
||||
#elif defined(_ARCH_PWR7) && defined(XLC_VEC_XSTW4)
|
||||
vec_xstw4((uint32x4_p)data, off, NCONST_V32_CAST(dest));
|
||||
#elif defined(_ARCH_PWR7) && defined(__VSX__)
|
||||
// ISA 2.06 provides vec_xst, but it lacks short* and char*
|
||||
vec_xst((uint32x4_p)data, off, NCONST_V32_CAST(dest));
|
||||
|
Loading…
x
Reference in New Issue
Block a user