mirror of
https://github.com/RPCS3/llvm.git
synced 2025-04-03 13:51:39 +00:00
[PowerPC] Enable interleaved-access vectorization
This adds a basic cost model for interleaved-access vectorization (and a better default for shuffles), and enables interleaved-access vectorization by default. The relevant difference from the default cost model for interleaved-access vectorization is that on PPC, the shuffles that end up being used are *much* cheaper than modeling the process with insert/extract pairs (which are quite expensive, especially on older cores). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246824 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
738216def6
commit
f606a6ed99
@ -207,6 +207,10 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
|
||||
return LoopHasReductions;
|
||||
}
|
||||
|
||||
// Opts the PowerPC target in to interleaved-access vectorization: the hook
// takes no arguments and unconditionally returns true.
bool PPCTTIImpl::enableInterleavedAccessVectorization() {
|
||||
return true;
|
||||
}
|
||||
|
||||
unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
|
||||
if (Vector && !ST->hasAltivec() && !ST->hasQPX())
|
||||
return 0;
|
||||
@ -266,7 +270,15 @@ int PPCTTIImpl::getArithmeticInstrCost(
|
||||
|
||||
// Cost of a vector shuffle of type Tp on PPC.
// The legalization-based path legalizes Tp and returns LT.first, the first
// member of the pair produced by getTypeLegalizationCost; per the comment in
// the body this models one permute instruction per actual register.
// Kind, Index and SubTp are read only by the BaseT delegation, not by the
// legalization-based path.
int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
||||
Type *SubTp) {
|
||||
// NOTE(review): the +/- markers are lost in this diff rendering. The hunk
// header (-266,7 +270,15) implies exactly one removed line, which appears to
// be the BaseT delegation immediately below; it is presumably superseded by
// the legalization-based cost that follows -- confirm against the committed
// file.
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
|
||||
// Legalize the type.
|
||||
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
|
||||
|
||||
// PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
|
||||
// (at least in the sense that there need only be one non-loop-invariant
|
||||
// instruction). We need one such shuffle instruction for each actual
|
||||
// register (this is not true for arbitrary shuffles, but is true for the
|
||||
// structured types of shuffles covered by TTI::ShuffleKind).
|
||||
return LT.first;
|
||||
}
|
||||
|
||||
int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
|
||||
@ -375,3 +387,27 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||
return Cost;
|
||||
}
|
||||
|
||||
// Cost of an interleaved load/store group on PPC.
//   Opcode, VecTy, Alignment, AddressSpace - forwarded unchanged to
//     getMemoryOpCost to price the underlying memory operation.
//   Factor  - multiplier applied to the shuffle term computed below.
//   Indices - not read by this implementation.
// Returns the memory-op cost plus Factor * (LT.first - 1), where LT.first is
// the first member of the pair returned by getTypeLegalizationCost for VecTy.
// VecTy must be a vector type (asserted).
int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
||||
unsigned Factor,
|
||||
ArrayRef<unsigned> Indices,
|
||||
unsigned Alignment,
|
||||
unsigned AddressSpace) {
|
||||
assert(isa<VectorType>(VecTy) &&
|
||||
"Expect a vector type for interleaved memory op");
|
||||
|
||||
// Legalize the type.
|
||||
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
|
||||
|
||||
// Firstly, the cost of load/store operation.
|
||||
int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
|
||||
|
||||
// PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
|
||||
// (at least in the sense that there need only be one non-loop-invariant
|
||||
// instruction). For each result vector, we need one shuffle per incoming
|
||||
// vector (except that the first shuffle can take two incoming vectors
|
||||
// because it does not need to take itself).
|
||||
// The shuffle term is zero when LT.first == 1.
Cost += Factor*(LT.first-1);
|
||||
|
||||
return Cost;
|
||||
}
|
||||
|
||||
|
@ -67,6 +67,7 @@ public:
|
||||
/// @{
|
||||
|
||||
bool enableAggressiveInterleaving(bool LoopHasReductions);
|
||||
// Hook for interleaved-access vectorization; the out-of-line definition
// returns true unconditionally.
bool enableInterleavedAccessVectorization();
|
||||
unsigned getNumberOfRegisters(bool Vector);
|
||||
unsigned getRegisterBitWidth(bool Vector);
|
||||
unsigned getMaxInterleaveFactor(unsigned VF);
|
||||
@ -82,6 +83,11 @@ public:
|
||||
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
|
||||
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||
unsigned AddressSpace);
|
||||
// Cost of an interleaved load/store group of vector type VecTy. The
// out-of-line definition adds Factor * (LT.first - 1) shuffles to the plain
// getMemoryOpCost result and does not read Indices.
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
||||
unsigned Factor,
|
||||
ArrayRef<unsigned> Indices,
|
||||
unsigned Alignment,
|
||||
unsigned AddressSpace);
|
||||
|
||||
/// @}
|
||||
};
|
||||
|
@ -0,0 +1,30 @@
|
||||
; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s
|
||||
target datalayout = "E-m:e-i64:64-n32:64"
|
||||
target triple = "powerpc64-unknown-linux-gnu"
|
||||
|
||||
; Function Attrs: nounwind
|
||||
; Scalar loop of 1600 iterations computing a[i] = b[2*i] + 1.0. The load index
; is the induction variable shifted left by one, so the loads from %b have
; stride 2 while the stores to %a are consecutive. The FileCheck directives in
; the body require <2 x double> to appear in the output for @foo.
define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
; CHECK-LABEL: @foo
|
||||
; CHECK: <2 x double>
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body
|
||||
ret void
|
||||
|
||||
for.body: ; preds = %for.body, %entry
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
; %0 = 2 * i, the element index of the strided load from %b.
%0 = shl nsw i64 %indvars.iv, 1
|
||||
%arrayidx = getelementptr inbounds double, double* %b, i64 %0
|
||||
%1 = load double, double* %arrayidx, align 8
|
||||
%add = fadd double %1, 1.000000e+00
|
||||
; The store uses the unscaled induction variable, i.e. a[i].
%arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
|
||||
store double %add, double* %arrayidx2, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
; Loop exits once the incremented induction variable reaches 1600.
%exitcond = icmp eq i64 %indvars.iv.next, 1600
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind "target-cpu"="pwr8" }
|
||||
|
Loading…
x
Reference in New Issue
Block a user