mirror of
https://github.com/RPCSX/llvm.git
synced 2024-11-25 04:39:44 +00:00
ARM cost model: Address computation in vector mem ops not free
Adds a function to target transform info to query for the cost of address computation. The cost model analysis pass now also queries this interface. The code in LoopVectorize adds the cost of address computation as part of the memory instruction cost calculation. Only there, we know whether the instruction will be scalarized or not. Increase the penalty for inserting into D registers on swift. This becomes necessary because we now always assume that address computation has a cost and three is a closer value to the architecture. radar://13097204 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@174713 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
f64edf8d80
commit
fb55a8fd7c
@ -314,6 +314,12 @@ public:
|
|||||||
/// split during legalization. Zero is returned when the answer is unknown.
|
/// split during legalization. Zero is returned when the answer is unknown.
|
||||||
virtual unsigned getNumberOfParts(Type *Tp) const;
|
virtual unsigned getNumberOfParts(Type *Tp) const;
|
||||||
|
|
||||||
|
/// \returns The cost of the address computation. For most targets this can be
|
||||||
|
/// merged into the instruction indexing mode. Some targets might want to
|
||||||
|
/// distinguish between address computation for memory operations on vector
|
||||||
|
/// types and scalar types. Such targets should override this function.
|
||||||
|
virtual unsigned getAddressComputationCost(Type *Ty) const;
|
||||||
|
|
||||||
/// @}
|
/// @}
|
||||||
|
|
||||||
/// Analysis group identification.
|
/// Analysis group identification.
|
||||||
|
@ -85,6 +85,11 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
|
|||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
switch (I->getOpcode()) {
|
switch (I->getOpcode()) {
|
||||||
|
case Instruction::GetElementPtr:{
|
||||||
|
Type *ValTy = I->getOperand(0)->getType()->getPointerElementType();
|
||||||
|
return TTI->getAddressComputationCost(ValTy);
|
||||||
|
}
|
||||||
|
|
||||||
case Instruction::Ret:
|
case Instruction::Ret:
|
||||||
case Instruction::PHI:
|
case Instruction::PHI:
|
||||||
case Instruction::Br: {
|
case Instruction::Br: {
|
||||||
|
@ -196,6 +196,9 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
|
|||||||
return PrevTTI->getNumberOfParts(Tp);
|
return PrevTTI->getNumberOfParts(Tp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp) const {
|
||||||
|
return PrevTTI->getAddressComputationCost(Tp);
|
||||||
|
}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
@ -535,6 +538,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
|
|||||||
unsigned getNumberOfParts(Type *Tp) const {
|
unsigned getNumberOfParts(Type *Tp) const {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned getAddressComputationCost(Type *Tp) const {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
} // end anonymous namespace
|
} // end anonymous namespace
|
||||||
|
@ -101,6 +101,7 @@ public:
|
|||||||
virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
|
virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
|
||||||
ArrayRef<Type*> Tys) const;
|
ArrayRef<Type*> Tys) const;
|
||||||
virtual unsigned getNumberOfParts(Type *Tp) const;
|
virtual unsigned getNumberOfParts(Type *Tp) const;
|
||||||
|
virtual unsigned getAddressComputationCost(Type *Ty) const;
|
||||||
|
|
||||||
/// @}
|
/// @}
|
||||||
};
|
};
|
||||||
@ -400,3 +401,7 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
|
|||||||
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
|
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
|
||||||
return LT.first;
|
return LT.first;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned BasicTTI::getAddressComputationCost(Type *Ty) const {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
@ -120,6 +120,8 @@ public:
|
|||||||
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const;
|
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const;
|
||||||
|
|
||||||
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const;
|
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const;
|
||||||
|
|
||||||
|
unsigned getAddressComputationCost(Type *Val) const;
|
||||||
/// @}
|
/// @}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -304,12 +306,13 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
|
|||||||
|
|
||||||
unsigned ARMTTI::getVectorInstrCost(unsigned Opcode, Type *ValTy,
|
unsigned ARMTTI::getVectorInstrCost(unsigned Opcode, Type *ValTy,
|
||||||
unsigned Index) const {
|
unsigned Index) const {
|
||||||
// Penalize inserting into an D-subregister.
|
// Penalize inserting into a D-subregister. We end up with a three times
|
||||||
|
// lower estimated throughput on swift.
|
||||||
if (ST->isSwift() &&
|
if (ST->isSwift() &&
|
||||||
Opcode == Instruction::InsertElement &&
|
Opcode == Instruction::InsertElement &&
|
||||||
ValTy->isVectorTy() &&
|
ValTy->isVectorTy() &&
|
||||||
ValTy->getScalarSizeInBits() <= 32)
|
ValTy->getScalarSizeInBits() <= 32)
|
||||||
return 2;
|
return 3;
|
||||||
|
|
||||||
return TargetTransformInfo::getVectorInstrCost(Opcode, ValTy, Index);
|
return TargetTransformInfo::getVectorInstrCost(Opcode, ValTy, Index);
|
||||||
}
|
}
|
||||||
@ -326,3 +329,9 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
|
|||||||
|
|
||||||
return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
|
return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned ARMTTI::getAddressComputationCost(Type *Ty) const {
|
||||||
|
// In many cases the address computation is not merged into the instruction
|
||||||
|
// addressing mode.
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
@ -3056,9 +3056,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
|
|||||||
// TODO: We need to estimate the cost of intrinsic calls.
|
// TODO: We need to estimate the cost of intrinsic calls.
|
||||||
switch (I->getOpcode()) {
|
switch (I->getOpcode()) {
|
||||||
case Instruction::GetElementPtr:
|
case Instruction::GetElementPtr:
|
||||||
// We mark this instruction as zero-cost because scalar GEPs are usually
|
// We mark this instruction as zero-cost because the cost of GEPs in
|
||||||
// lowered to the intruction addressing mode. At the moment we don't
|
// vectorized code depends on whether the corresponding memory instruction
|
||||||
// generate vector geps.
|
// is scalarized or not. Therefore, we handle GEPs with the memory
|
||||||
|
// instruction cost.
|
||||||
return 0;
|
return 0;
|
||||||
case Instruction::Br: {
|
case Instruction::Br: {
|
||||||
return TTI.getCFInstrCost(I->getOpcode());
|
return TTI.getCFInstrCost(I->getOpcode());
|
||||||
@ -3113,9 +3114,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
|
|||||||
unsigned AS = SI ? SI->getPointerAddressSpace() :
|
unsigned AS = SI ? SI->getPointerAddressSpace() :
|
||||||
LI->getPointerAddressSpace();
|
LI->getPointerAddressSpace();
|
||||||
Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
|
Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
|
||||||
|
// We add the cost of address computation here instead of with the gep
|
||||||
|
// instruction because only here we know whether the operation is
|
||||||
|
// scalarized.
|
||||||
if (VF == 1)
|
if (VF == 1)
|
||||||
return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
|
return TTI.getAddressComputationCost(VectorTy) +
|
||||||
|
TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
|
||||||
|
|
||||||
// Scalarized loads/stores.
|
// Scalarized loads/stores.
|
||||||
int Stride = Legal->isConsecutivePtr(Ptr);
|
int Stride = Legal->isConsecutivePtr(Ptr);
|
||||||
@ -3135,15 +3139,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
|
|||||||
VectorTy, i);
|
VectorTy, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
// The cost of the scalar stores.
|
// The cost of the scalar loads/stores.
|
||||||
|
Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
|
||||||
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
|
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
|
||||||
Alignment, AS);
|
Alignment, AS);
|
||||||
return Cost;
|
return Cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wide load/stores.
|
// Wide load/stores.
|
||||||
unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
|
unsigned Cost = TTI.getAddressComputationCost(VectorTy);
|
||||||
Alignment, AS);
|
Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
|
||||||
|
|
||||||
if (Reverse)
|
if (Reverse)
|
||||||
Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
|
Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
|
||||||
VectorTy, 0);
|
VectorTy, 0);
|
||||||
|
43
test/Analysis/CostModel/ARM/gep.ll
Normal file
43
test/Analysis/CostModel/ARM/gep.ll
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
; RUN: opt -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s
|
||||||
|
|
||||||
|
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
|
||||||
|
target triple = "thumbv7-apple-ios6.0.0"
|
||||||
|
|
||||||
|
define void @test_geps() {
|
||||||
|
; Cost of scalar integer geps should be one. We can't always expect it to be
|
||||||
|
; folded into the instruction addressing mode.
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8*
|
||||||
|
%a0 = getelementptr inbounds i8* undef, i32 0
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16*
|
||||||
|
%a1 = getelementptr inbounds i16* undef, i32 0
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32*
|
||||||
|
%a2 = getelementptr inbounds i32* undef, i32 0
|
||||||
|
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64*
|
||||||
|
%a3 = getelementptr inbounds i64* undef, i32 0
|
||||||
|
|
||||||
|
; Cost of scalar floating point geps should be one. We cannot fold the address
|
||||||
|
; computation.
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds float*
|
||||||
|
%a4 = getelementptr inbounds float* undef, i32 0
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds double*
|
||||||
|
%a5 = getelementptr inbounds double* undef, i32 0
|
||||||
|
|
||||||
|
|
||||||
|
; Cost of vector geps should be one. We cannot fold the address computation.
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i8>*
|
||||||
|
%a7 = getelementptr inbounds <4 x i8>* undef, i32 0
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i16>*
|
||||||
|
%a8 = getelementptr inbounds <4 x i16>* undef, i32 0
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i32>*
|
||||||
|
%a9 = getelementptr inbounds <4 x i32>* undef, i32 0
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i64>*
|
||||||
|
%a10 = getelementptr inbounds <4 x i64>* undef, i32 0
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x float>*
|
||||||
|
%a11 = getelementptr inbounds <4 x float>* undef, i32 0
|
||||||
|
;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x double>*
|
||||||
|
%a12 = getelementptr inbounds <4 x double>* undef, i32 0
|
||||||
|
|
||||||
|
|
||||||
|
ret void
|
||||||
|
}
|
@ -12,7 +12,7 @@ define void @insertelement_i8(%T_i8* %saddr,
|
|||||||
%T_i8v* %vaddr) {
|
%T_i8v* %vaddr) {
|
||||||
%v0 = load %T_i8v* %vaddr
|
%v0 = load %T_i8v* %vaddr
|
||||||
%v1 = load %T_i8* %saddr
|
%v1 = load %T_i8* %saddr
|
||||||
;CHECK: estimated cost of 2 for {{.*}} insertelement <8 x i8>
|
;CHECK: estimated cost of 3 for {{.*}} insertelement <8 x i8>
|
||||||
%v2 = insertelement %T_i8v %v0, %T_i8 %v1, i32 1
|
%v2 = insertelement %T_i8v %v0, %T_i8 %v1, i32 1
|
||||||
store %T_i8v %v2, %T_i8v* %vaddr
|
store %T_i8v %v2, %T_i8v* %vaddr
|
||||||
ret void
|
ret void
|
||||||
@ -26,7 +26,7 @@ define void @insertelement_i16(%T_i16* %saddr,
|
|||||||
%T_i16v* %vaddr) {
|
%T_i16v* %vaddr) {
|
||||||
%v0 = load %T_i16v* %vaddr
|
%v0 = load %T_i16v* %vaddr
|
||||||
%v1 = load %T_i16* %saddr
|
%v1 = load %T_i16* %saddr
|
||||||
;CHECK: estimated cost of 2 for {{.*}} insertelement <4 x i16>
|
;CHECK: estimated cost of 3 for {{.*}} insertelement <4 x i16>
|
||||||
%v2 = insertelement %T_i16v %v0, %T_i16 %v1, i32 1
|
%v2 = insertelement %T_i16v %v0, %T_i16 %v1, i32 1
|
||||||
store %T_i16v %v2, %T_i16v* %vaddr
|
store %T_i16v %v2, %T_i16v* %vaddr
|
||||||
ret void
|
ret void
|
||||||
@ -39,7 +39,7 @@ define void @insertelement_i32(%T_i32* %saddr,
|
|||||||
%T_i32v* %vaddr) {
|
%T_i32v* %vaddr) {
|
||||||
%v0 = load %T_i32v* %vaddr
|
%v0 = load %T_i32v* %vaddr
|
||||||
%v1 = load %T_i32* %saddr
|
%v1 = load %T_i32* %saddr
|
||||||
;CHECK: estimated cost of 2 for {{.*}} insertelement <2 x i32>
|
;CHECK: estimated cost of 3 for {{.*}} insertelement <2 x i32>
|
||||||
%v2 = insertelement %T_i32v %v0, %T_i32 %v1, i32 1
|
%v2 = insertelement %T_i32v %v0, %T_i32 %v1, i32 1
|
||||||
store %T_i32v %v2, %T_i32v* %vaddr
|
store %T_i32v %v2, %T_i32v* %vaddr
|
||||||
ret void
|
ret void
|
||||||
|
40
test/Analysis/CostModel/X86/gep.ll
Normal file
40
test/Analysis/CostModel/X86/gep.ll
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
|
||||||
|
|
||||||
|
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||||
|
target triple = "x86_64-apple-macosx10.8.0"
|
||||||
|
|
||||||
|
|
||||||
|
define void @test_geps() {
|
||||||
|
; Cost of geps should be zero. We expect it to be folded into
|
||||||
|
; the instruction addressing mode.
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8*
|
||||||
|
%a0 = getelementptr inbounds i8* undef, i32 0
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16*
|
||||||
|
%a1 = getelementptr inbounds i16* undef, i32 0
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32*
|
||||||
|
%a2 = getelementptr inbounds i32* undef, i32 0
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64*
|
||||||
|
%a3 = getelementptr inbounds i64* undef, i32 0
|
||||||
|
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds float*
|
||||||
|
%a4 = getelementptr inbounds float* undef, i32 0
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds double*
|
||||||
|
%a5 = getelementptr inbounds double* undef, i32 0
|
||||||
|
|
||||||
|
; Vector geps should also have zero cost.
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>*
|
||||||
|
%a7 = getelementptr inbounds <4 x i8>* undef, i32 0
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>*
|
||||||
|
%a8 = getelementptr inbounds <4 x i16>* undef, i32 0
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i32>*
|
||||||
|
%a9 = getelementptr inbounds <4 x i32>* undef, i32 0
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i64>*
|
||||||
|
%a10 = getelementptr inbounds <4 x i64>* undef, i32 0
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x float>*
|
||||||
|
%a11 = getelementptr inbounds <4 x float>* undef, i32 0
|
||||||
|
;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x double>*
|
||||||
|
%a12 = getelementptr inbounds <4 x double>* undef, i32 0
|
||||||
|
|
||||||
|
|
||||||
|
ret void
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user