[SLP] Enable 64-bit wide vectorization on AArch64
ARM NEON has native support for half-sized vector registers (64 bits). This is beneficial, for example, for 2D and 3D graphics. This patch adds the option to lower MinVecRegSize from 128 via a TTI hook in the SLP Vectorizer.

*** Performance Analysis

This change was motivated by some internal benchmarks, but it is also beneficial on SPEC and the LLVM testsuite. The results are with -O3 and PGO. A negative percentage is an improvement. The testsuite was run with a sample size of 4.

** SPEC

* CFP2006/482.sphinx3  -3.34%
A pretty hot loop is SLP vectorized, resulting in a nice reduction in instruction count. This used to be a +22% regression before rL299482.

* CFP2000/177.mesa  -3.34%

* CINT2000/256.bzip2  +6.97%
My current plan is to extend the fix in rL299482 to i16, which brings the regression down to +2.5%. There are also other problems with the codegen in this loop, so there is further room for improvement.

** LLVM testsuite

* SingleSource/Benchmarks/Misc/ReedSolomon  -10.75%
There are multiple small SLP vectorizations outside the hot code. It's a bit surprising that it adds up to 10%. Some of this may be code-layout noise.

* MultiSource/Benchmarks/VersaBench/beamformer/beamformer  -8.40%
The opt-viewer screenshot can be seen at F3218284. We start at a colder store, but the tree leads us into the hottest loop.

* MultiSource/Applications/lambda-0.1.3/lambda  -2.68%

* MultiSource/Benchmarks/Bullet/bullet  -2.18%
This is using 3D vectors.

* SingleSource/Benchmarks/Shootout-C++/Shootout-C++-lists  +6.67%
Noise; the binary is unchanged.

* MultiSource/Benchmarks/Ptrdist/anagram/anagram  +4.90%
There is an additional SLP vectorization in the cold code. The test runs for ~1 second and prints out over 2000 lines. This is most likely noise.

* MultiSource/Applications/aha/aha  +1.63%

* MultiSource/Applications/JM/lencod/lencod  +1.41%

* SingleSource/Benchmarks/Misc/richards_benchmark  +1.15%

Differential Revision: https://reviews.llvm.org/D31965

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303116 91177308-0d34-0410-b5e6-96231b3b80d8
parent 05f671ecfa
commit 2efa5091b6
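To make the motivation concrete: with MinVecRegSize lowered to 64, a two-lane float tree now fits a 64-bit NEON D register. A minimal sketch of the kind of 2D-graphics source that benefits (the function below is hypothetical, not part of the patch; it mirrors the IR test added at the end of this diff):

// Hypothetical 2D vector add (not from the patch). With MinVecRegSize = 128
// the SLP vectorizer rejects this two-lane float tree (only 64 bits wide);
// with MinVecRegSize = 64 it can pack both fadds into one <2 x float> op.
void add2(const float *__restrict r, float *__restrict w) {
  w[0] = r[0] + r[0]; // lane 0
  w[1] = r[1] + r[1]; // lane 1
}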
include/llvm/Analysis/TargetTransformInfo.h

@@ -537,6 +537,9 @@ public:
   /// \return The width of the largest scalar or vector register type.
   unsigned getRegisterBitWidth(bool Vector) const;
 
+  /// \return The width of the smallest vector register type.
+  unsigned getMinVectorRegisterBitWidth() const;
+
   /// \return True if it should be considered for address type promotion.
   /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
   /// profitable without finding other extensions fed by the same input.
@@ -840,6 +843,7 @@ public:
                             Type *Ty) = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) = 0;
+  virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
   virtual unsigned getCacheLineSize() = 0;
@@ -1076,6 +1080,9 @@ public:
   unsigned getRegisterBitWidth(bool Vector) override {
     return Impl.getRegisterBitWidth(Vector);
   }
+  unsigned getMinVectorRegisterBitWidth() override {
+    return Impl.getMinVectorRegisterBitWidth();
+  }
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
     return Impl.shouldConsiderAddressTypePromotion(
include/llvm/Analysis/TargetTransformInfoImpl.h

@@ -311,6 +311,8 @@ public:
 
   unsigned getRegisterBitWidth(bool Vector) { return 32; }
 
+  unsigned getMinVectorRegisterBitWidth() { return 128; }
+
   bool
   shouldConsiderAddressTypePromotion(const Instruction &I,
                                      bool &AllowPromotionWithoutCommonHeader) {
lib/Analysis/TargetTransformInfo.cpp

@@ -279,6 +279,10 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
   return TTIImpl->getRegisterBitWidth(Vector);
 }
 
+unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
+  return TTIImpl->getMinVectorRegisterBitWidth();
+}
+
 bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
   return TTIImpl->shouldConsiderAddressTypePromotion(
lib/Target/AArch64/AArch64Subtarget.cpp

@@ -91,6 +91,8 @@ void AArch64Subtarget::initializeProperties() {
   case Falkor:
     MaxInterleaveFactor = 4;
     VectorInsertExtractBaseCost = 2;
+    // FIXME: remove this to enable 64-bit SLP if performance looks good.
+    MinVectorRegisterBitWidth = 128;
     break;
   case Kryo:
     MaxInterleaveFactor = 4;
@@ -99,6 +101,8 @@ void AArch64Subtarget::initializeProperties() {
     PrefetchDistance = 740;
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 11;
+    // FIXME: remove this to enable 64-bit SLP if performance looks good.
+    MinVectorRegisterBitWidth = 128;
     break;
   case ThunderX2T99:
     CacheLineSize = 64;
@@ -108,6 +112,8 @@ void AArch64Subtarget::initializeProperties() {
     PrefetchDistance = 128;
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 4;
+    // FIXME: remove this to enable 64-bit SLP if performance looks good.
+    MinVectorRegisterBitWidth = 128;
     break;
   case ThunderX:
   case ThunderXT88:
@@ -116,6 +122,8 @@ void AArch64Subtarget::initializeProperties() {
     CacheLineSize = 128;
     PrefFunctionAlignment = 3;
     PrefLoopAlignment = 2;
+    // FIXME: remove this to enable 64-bit SLP if performance looks good.
+    MinVectorRegisterBitWidth = 128;
     break;
   case CortexA35: break;
   case CortexA53: break;
lib/Target/AArch64/AArch64Subtarget.h

@@ -83,6 +83,9 @@ protected:
   // NegativeImmediates - transform instructions with negative immediates
   bool NegativeImmediates = true;
 
+  // Enable 64-bit vectorization in SLP.
+  unsigned MinVectorRegisterBitWidth = 64;
+
   bool UseAA = false;
   bool PredictableSelectIsExpensive = false;
   bool BalanceFPOps = false;
@@ -191,6 +194,10 @@ public:
 
   bool isXRaySupported() const override { return true; }
 
+  unsigned getMinVectorRegisterBitWidth() const {
+    return MinVectorRegisterBitWidth;
+  }
+
   bool isX18Reserved() const { return ReserveX18; }
   bool hasFPARMv8() const { return HasFPARMv8; }
   bool hasNEON() const { return HasNEON; }
lib/Target/AArch64/AArch64TargetTransformInfo.h

@@ -87,6 +87,10 @@ public:
     return 64;
   }
 
+  unsigned getMinVectorRegisterBitWidth() {
+    return ST->getMinVectorRegisterBitWidth();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
lib/Transforms/Vectorize/SLPVectorizer.cpp

@@ -316,7 +316,10 @@ public:
     else
       MaxVecRegSize = TTI->getRegisterBitWidth(true);
 
-    MinVecRegSize = MinVectorRegSizeOption;
+    if (MinVectorRegSizeOption.getNumOccurrences())
+      MinVecRegSize = MinVectorRegSizeOption;
+    else
+      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
   }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
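Design note on the hunk above: cl::opt<>::getNumOccurrences() distinguishes an explicit -slp-min-reg-size=N on the command line from the untouched default, so the TTI-provided minimum only applies when the user did not ask for a specific width; the last RUN line of the new test below exercises exactly this override. For reference, a sketch of the option declaration this code reads (it mirrors the existing MinVectorRegSizeOption in SLPVectorizer.cpp; the exact cl::desc wording here is an assumption):

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Sketch of the existing SLP option (desc text assumed). The default of 128
// matches the pre-patch behavior; getNumOccurrences() > 0 means the user
// passed the flag explicitly, so it wins over the TTI hook.
static cl::opt<unsigned> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));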
test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll (new file, 22 lines)
@@ -0,0 +1,22 @@
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; Currently disabled for a few subtargets (e.g. Kryo):
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck --check-prefix=NO_SLP %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s
+
+define void @f(float* %r, float* %w) {
+  %r0 = getelementptr inbounds float, float* %r, i64 0
+  %r1 = getelementptr inbounds float, float* %r, i64 1
+  %f0 = load float, float* %r0
+  %f1 = load float, float* %r1
+  %add0 = fadd float %f0, %f0
+; CHECK: fadd <2 x float>
+; NO_SLP: fadd float
+; NO_SLP: fadd float
+  %add1 = fadd float %f1, %f1
+  %w0 = getelementptr inbounds float, float* %w, i64 0
+  %w1 = getelementptr inbounds float, float* %w, i64 1
+  store float %add0, float* %w0
+  store float %add1, float* %w1
+  ret void
+}