mirror of
https://github.com/RPCSX/llvm.git
synced 2024-12-12 14:17:59 +00:00
Teach the SLP Vectorizer that keeping some values live over a callsite can have a cost.
Some types, such as 128-bit vector types on AArch64, don't have any callee-saved registers. So if a value needs to stay live over a callsite, it must be spilled and refilled. This cost is now taken into account. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214859 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
ff8028c8da
commit
72035e9a8e
@ -416,6 +416,13 @@ public:
|
||||
virtual unsigned getAddressComputationCost(Type *Ty,
|
||||
bool IsComplex = false) const;
|
||||
|
||||
/// \returns The cost, if any, of keeping values of the given types alive
|
||||
/// over a callsite.
|
||||
///
|
||||
/// Some types may require the use of register classes that do not have
|
||||
/// any callee-saved registers, so would require a spill and fill.
|
||||
virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const;
|
||||
|
||||
/// @}
|
||||
|
||||
/// Analysis group identification.
|
||||
|
@ -230,6 +230,11 @@ unsigned TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty,
|
||||
return PrevTTI->getReductionCost(Opcode, Ty, IsPairwise);
|
||||
}
|
||||
|
||||
/// Forward the live-over-call cost query to the next TTI implementation in
/// the analysis-group chain; the concrete target (e.g. AArch64) answers it.
unsigned
TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const {
  return PrevTTI->getCostOfKeepingLiveOverCall(Tys);
}
|
||||
|
||||
namespace {
|
||||
|
||||
struct NoTTI final : ImmutablePass, TargetTransformInfo {
|
||||
@ -613,6 +618,11 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo {
|
||||
unsigned getReductionCost(unsigned, Type *, bool) const override {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Baseline implementation: with no target information we assume every
// value can survive a call in a callee-saved register, i.e. zero cost.
unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override {
  return 0;
}
|
||||
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
@ -124,6 +124,9 @@ public:
|
||||
|
||||
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||
unsigned AddressSpace) const override;
|
||||
|
||||
unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override;
|
||||
|
||||
/// @}
|
||||
};
|
||||
|
||||
@ -498,3 +501,15 @@ unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
|
||||
|
||||
return LT.first;
|
||||
}
|
||||
|
||||
unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const {
  // On AArch64 the 128-bit Q registers have no callee-saved subset, so any
  // 128-bit vector value that is live across a call must be spilled before
  // the call and refilled after it. Charge one store plus one load for each
  // such type; everything else is assumed free.
  unsigned Cost = 0;
  for (auto *Ty : Tys) {
    if (!Ty->isVectorTy())
      continue;
    unsigned BitWidth = Ty->getScalarSizeInBits() * Ty->getVectorNumElements();
    if (BitWidth == 128) {
      Cost += getMemoryOpCost(Instruction::Store, Ty, 128, 0);
      Cost += getMemoryOpCost(Instruction::Load, Ty, 128, 0);
    }
  }
  return Cost;
}
|
||||
|
@ -361,6 +361,10 @@ public:
|
||||
/// Returns the vectorized root.
|
||||
Value *vectorizeTree();
|
||||
|
||||
/// \returns the cost incurred by unwanted spills and fills, caused by
|
||||
/// holding live values over call sites.
|
||||
int getSpillCost();
|
||||
|
||||
/// \returns the vectorization cost of the subtree that starts at \p VL.
|
||||
/// A negative number means that this is profitable.
|
||||
int getTreeCost();
|
||||
@ -1543,6 +1547,68 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \returns the cost incurred by unwanted spills and fills caused by holding
/// live values over call sites.
///
/// Walk from the bottom of the tree to the top, tracking which values are
/// live. When we see a call instruction that is not part of our tree,
/// query TTI to see if there is a cost to keeping values live over it
/// (for example, if spills and fills are required).
int BoUpSLP::getSpillCost() {
  // Width of the vector bundle being built; live scalars are modeled as
  // vectors of this width when asking TTI for the spill cost.
  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
  int Cost = 0;

  // Scalars from the tree that are live at the current walk position.
  SmallPtrSet<Instruction*, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
    // Only the first scalar of each bundle is inspected; non-instruction
    // entries (constants etc.) contribute no liveness.
    Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
    if (!Inst)
      continue;

    // First instruction seen just seeds the walk; nothing is live yet.
    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    DEBUG(
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
      );

    // Update LiveValues: PrevInst has now been defined, so it stops being
    // live, while its in-tree instruction operands become live (their
    // definitions lie further up the walk).
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    // Now find the sequence of instructions between PrevInst and Inst.
    // The scan runs in reverse program order starting just above PrevInst;
    // the decrement positions PrevInstIt on the instruction before it.
    BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
    --PrevInstIt;
    while (InstIt != PrevInstIt) {
      // NOTE(review): when the two instructions are in different basic
      // blocks, hitting rend() restarts the scan at the bottom of Inst's
      // block — this skips any blocks in between; presumably intentional
      // as an approximation, confirm against upstream history.
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      // A call that is not itself part of the tree forces every live value
      // to survive across it; ask TTI what that costs for the widened types.
      if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
        SmallVector<Type*, 4> V;
        for (auto *II : LiveValues)
          V.push_back(VectorType::get(II->getType(), BundleWidth));
        Cost += TTI->getCostOfKeepingLiveOverCall(V);
      }

      ++PrevInstIt;
    }

    PrevInst = Inst;
  }

  DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
  return Cost;
}
|
||||
|
||||
int BoUpSLP::getTreeCost() {
|
||||
int Cost = 0;
|
||||
DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
|
||||
@ -1578,6 +1644,8 @@ int BoUpSLP::getTreeCost() {
|
||||
I->Lane);
|
||||
}
|
||||
|
||||
Cost += getSpillCost();
|
||||
|
||||
DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
|
||||
return Cost + ExtractCost;
|
||||
}
|
||||
|
46
test/Transforms/SLPVectorizer/AArch64/load-store-q.ll
Normal file
46
test/Transforms/SLPVectorizer/AArch64/load-store-q.ll
Normal file
@ -0,0 +1,46 @@
|
||||
; RUN: opt -S -basicaa -slp-vectorizer < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios5.0.0"

; Holding a value live over a call boundary may require
; spills and fills. This is the case for <2 x double>,
; as it occupies a Q register of which there are no
; callee-saves.
;
; The spill cost added by getSpillCost() should make vectorizing
; this function unprofitable, so the scalar loads/stores must survive.

; CHECK: load double
; CHECK: load double
; CHECK: call void @g
; CHECK: store double
; CHECK: store double
define void @f(double* %p, double* %q) {
  %addr2 = getelementptr double* %q, i32 1
  %addr = getelementptr double* %p, i32 1
  %x = load double* %p
  %y = load double* %addr
  ; %x and %y are live across this call and would need a Q-register spill
  ; if they were vectorized into a single <2 x double>.
  call void @g()
  store double %x, double* %q
  store double %y, double* %addr2
  ret void
}
declare void @g()
|
||||
|
||||
; Check we deal with loops correctly.
;
; Here there is no call between the loads and the stores inside the loop
; body, so vectorization should still happen (and the spill-cost walk must
; not mis-handle the backedge when scanning between instructions).
;
; CHECK: store <2 x double>
; CHECK: load <2 x double>
define void @f2(double* %p, double* %q) {
entry:
  br label %loop

loop:
  %p1 = phi double [0.0, %entry], [%x, %loop]
  %p2 = phi double [0.0, %entry], [%y, %loop]
  %addr2 = getelementptr double* %q, i32 1
  %addr = getelementptr double* %p, i32 1
  store double %p1, double* %q
  store double %p2, double* %addr2

  %x = load double* %p
  %y = load double* %addr
  br label %loop
}
|
Loading…
Reference in New Issue
Block a user