TargetTransformInfo: address calculation parameter for gather/scatter

Address calculation for gather/scatter in vectorized code can incur a
significant cost, making vectorization unbeneficial. Add infrastructure to
model this cost. Tests and cost models for targets will be added in
follow-up commits.

radar://14351991

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186187 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent ec11706155
commit c0a11edba6
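
For illustration only, a minimal sketch (not part of the commit) of how a client of the cost model can use the new hint; the helper name scalarizedAddressCost and its surrounding setup are assumptions:

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Hypothetical helper: estimate the address-computation cost of a scalarized
// (non-consecutive) access that touches VF lanes through pointers of PtrTy.
static unsigned scalarizedAddressCost(const TargetTransformInfo &TTI,
                                      Type *PtrTy, unsigned VF,
                                      bool IsComplex) {
  // The new IsComplex hint tells the target that the address is unlikely to
  // fold into its addressing mode, so it may charge extra per lane.
  return VF * TTI.getAddressComputationCost(PtrTy, IsComplex);
}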
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -339,7 +339,11 @@ public:
   /// merged into the instruction indexing mode. Some targets might want to
   /// distinguish between address computation for memory operations on vector
   /// types and scalar types. Such targets should override this function.
-  virtual unsigned getAddressComputationCost(Type *Ty) const;
+  /// The 'IsComplex' parameter is a hint that the address computation is likely
+  /// to involve multiple instructions and as such unlikely to be merged into
+  /// the address indexing mode.
+  virtual unsigned getAddressComputationCost(Type *Ty,
+                                             bool IsComplex = false) const;
 
   /// @}
 
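On the implementer side, a hedged sketch of how a target could override the new overload (MyTTI is a hypothetical target TTI class, not part of this commit):

// Hypothetical target override: charge roughly one address instruction per
// vector lane when the computation will not fold into the addressing mode.
unsigned MyTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  if (IsComplex && Ty->isVectorTy())
    return cast<VectorType>(Ty)->getNumElements();
  return 1;
}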
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -206,8 +206,9 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
   return PrevTTI->getNumberOfParts(Tp);
 }
 
-unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp) const {
-  return PrevTTI->getAddressComputationCost(Tp);
+unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp,
+                                                        bool IsComplex) const {
+  return PrevTTI->getAddressComputationCost(Tp, IsComplex);
 }
 
 namespace {
@@ -559,7 +560,7 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
     return 0;
   }
 
-  unsigned getAddressComputationCost(Type *Tp) const {
+  unsigned getAddressComputationCost(Type *Tp, bool) const {
     return 0;
   }
 };
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -108,7 +108,7 @@ public:
   virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
                                          ArrayRef<Type*> Tys) const;
   virtual unsigned getNumberOfParts(Type *Tp) const;
-  virtual unsigned getAddressComputationCost(Type *Ty) const;
+  virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const;
 
   /// @}
 };
@@ -489,6 +489,6 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
   return LT.first;
 }
 
-unsigned BasicTTI::getAddressComputationCost(Type *Ty) const {
+unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
   return 0;
 }
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -124,7 +124,7 @@ public:
 
   unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const;
 
-  unsigned getAddressComputationCost(Type *Val) const;
+  unsigned getAddressComputationCost(Type *Val, bool IsComplex) const;
 
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                   OperandValueKind Op1Info = OK_AnyValue,
@@ -425,7 +425,7 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-unsigned ARMTTI::getAddressComputationCost(Type *Ty) const {
+unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
   // In many cases the address computation is not merged into the instruction
   // addressing mode.
   return 1;
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4403,6 +4403,59 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   return Cost;
 }
 
+/// \brief Check whether the address computation for a non-consecutive memory
+/// access looks like an unlikely candidate for being merged into the indexing
+/// mode.
+///
+/// We look for a GEP which has one index that is an induction variable and all
+/// other indices are loop invariant. If the stride of this access is also
+/// within a small bound we decide that this address computation can likely be
+/// merged into the addressing mode.
+/// In all other cases, we identify the address computation as complex.
+static bool isLikelyComplexAddressComputation(Value *Ptr,
+                                              LoopVectorizationLegality *Legal,
+                                              ScalarEvolution *SE,
+                                              const Loop *TheLoop) {
+  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!Gep)
+    return true;
+
+  // We are looking for a gep with all loop invariant indices except for one
+  // which should be an induction variable.
+  unsigned NumOperands = Gep->getNumOperands();
+  for (unsigned i = 1; i < NumOperands; ++i) {
+    Value *Opd = Gep->getOperand(i);
+    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
+        !Legal->isInductionVariable(Opd))
+      return true;
+  }
+
+  // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
+  // can likely be merged into the address computation.
+  unsigned MaxMergeDistance = 64;
+
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
+  if (!AddRec)
+    return true;
+
+  // Check the step is constant.
+  const SCEV *Step = AddRec->getStepRecurrence(*SE);
+  // Calculate the pointer stride and check if it is consecutive.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C)
+    return true;
+
+  const APInt &APStepVal = C->getValue()->getValue();
+
+  // Huge step value - give up.
+  if (APStepVal.getBitWidth() > 64)
+    return true;
+
+  int64_t StepVal = APStepVal.getSExtValue();
+
+  return StepVal > MaxMergeDistance;
+}
+
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
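For context, two assumed source-level loops (not from the commit) and how the heuristic above would likely classify their non-consecutive accesses:

struct Pixel { float r, g, b, a; };          // 16-byte element stride

// Likely NOT complex: the only varying GEP index is the induction variable
// and the constant byte stride (16) is within MaxMergeDistance, so the
// address computation can plausibly fold into the addressing mode.
float sumRed(const Pixel *P, int N) {
  float S = 0;
  for (int i = 0; i < N; ++i)
    S += P[i].r;
  return S;
}

// Likely complex: the index is itself loaded from memory, so the GEP index
// is neither loop invariant nor an induction variable (a gather pattern).
float sumGather(const float *A, const int *Idx, int N) {
  float S = 0;
  for (int i = 0; i < N; ++i)
    S += A[Idx[i]];
  return S;
}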
@@ -4498,6 +4551,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
     unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
     if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
+      bool IsComplexComputation =
+        isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
       unsigned Cost = 0;
       // The cost of extracting from the value vector and pointer vector.
       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
@@ -4513,7 +4568,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       }
 
       // The cost of the scalar loads/stores.
-      Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
+      Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
       Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                        Alignment, AS);
       return Cost;