mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-04-01 12:43:47 +00:00
[PowerPC] Update Vector Costs for P9
For the power9 CPU, vector operations consume a pair of execution units rather than one execution unit like a scalar operation. Update the target transform cost functions to reflect the higher cost of vector operations when targeting Power9. Patch by RolandF. Differential revision: https://reviews.llvm.org/D55461 llvm-svn: 352261
This commit is contained in:
parent
7a8e74775c
commit
7d007ddedf
@ -190,6 +190,13 @@ def FeatureP9Vector : SubtargetFeature<"power9-vector", "HasP9Vector", "true",
|
||||
"Enable POWER9 vector instructions",
|
||||
[FeatureISA3_0, FeatureP8Vector,
|
||||
FeatureP9Altivec]>;
|
||||
// A separate feature for this even though it is equivalent to P9Vector
|
||||
// because this is a feature of the implementation rather than the architecture
|
||||
// and may go away with future CPUs.
|
||||
def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units",
|
||||
"VectorsUseTwoUnits",
|
||||
"true",
|
||||
"Vectors use two units">;
|
||||
|
||||
// Since new processors generally contain a superset of features of those that
|
||||
// came before them, the idea is to make implementations of new processors
|
||||
@ -222,7 +229,8 @@ def ProcessorFeatures {
|
||||
list<SubtargetFeature> Power8FeatureList =
|
||||
!listconcat(Power7FeatureList, Power8SpecificFeatures);
|
||||
list<SubtargetFeature> Power9SpecificFeatures =
|
||||
[DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0];
|
||||
[DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0,
|
||||
FeatureVectorsUseTwoUnits];
|
||||
list<SubtargetFeature> Power9FeatureList =
|
||||
!listconcat(Power8FeatureList, Power9SpecificFeatures);
|
||||
}
|
||||
|
@ -107,6 +107,7 @@ void PPCSubtarget::initializeEnvironment() {
|
||||
IsISA3_0 = false;
|
||||
UseLongCalls = false;
|
||||
SecurePlt = false;
|
||||
VectorsUseTwoUnits = false;
|
||||
|
||||
HasPOPCNTD = POPCNTD_Unavailable;
|
||||
}
|
||||
|
@ -135,6 +135,7 @@ protected:
|
||||
bool IsISA3_0;
|
||||
bool UseLongCalls;
|
||||
bool SecurePlt;
|
||||
bool VectorsUseTwoUnits;
|
||||
|
||||
POPCNTDKind HasPOPCNTD;
|
||||
|
||||
@ -259,6 +260,7 @@ public:
|
||||
bool isPPC4xx() const { return IsPPC4xx; }
|
||||
bool isPPC6xx() const { return IsPPC6xx; }
|
||||
bool isSecurePlt() const {return SecurePlt; }
|
||||
bool vectorsUseTwoUnits() const {return VectorsUseTwoUnits; }
|
||||
bool isE500() const { return IsE500; }
|
||||
bool isFeatureMFTB() const { return FeatureMFTB; }
|
||||
bool isDeprecatedDST() const { return DeprecatedDST; }
|
||||
|
@ -323,6 +323,32 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
// Adjust the cost of vector instructions on targets which there is overlap
|
||||
// between the vector and scalar units, thereby reducing the overall throughput
|
||||
// of vector code wrt. scalar code.
|
||||
int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1,
|
||||
Type *Ty2) {
|
||||
if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
|
||||
return Cost;
|
||||
|
||||
std::pair<int, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
|
||||
// If type legalization involves splitting the vector, we don't want to
|
||||
// double the cost at every step - only the last step.
|
||||
if (LT1.first != 1 || !LT1.second.isVector())
|
||||
return Cost;
|
||||
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
||||
if (TLI->isOperationExpand(ISD, LT1.second))
|
||||
return Cost;
|
||||
|
||||
if (Ty2) {
|
||||
std::pair<int, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
|
||||
if (LT2.first != 1 || !LT2.second.isVector())
|
||||
return Cost;
|
||||
}
|
||||
|
||||
return Cost * 2;
|
||||
}
|
||||
|
||||
int PPCTTIImpl::getArithmeticInstrCost(
|
||||
unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
|
||||
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
|
||||
@ -330,8 +356,9 @@ int PPCTTIImpl::getArithmeticInstrCost(
|
||||
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
|
||||
|
||||
// Fallback to the default implementation.
|
||||
return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
|
||||
Opd1PropInfo, Opd2PropInfo);
|
||||
int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
|
||||
Opd1PropInfo, Opd2PropInfo);
|
||||
return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
|
||||
}
|
||||
|
||||
int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
||||
@ -344,19 +371,22 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
||||
// instruction). We need one such shuffle instruction for each actual
|
||||
// register (this is not true for arbitrary shuffles, but is true for the
|
||||
// structured types of shuffles covered by TTI::ShuffleKind).
|
||||
return LT.first;
|
||||
return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp,
|
||||
nullptr);
|
||||
}
|
||||
|
||||
int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
||||
const Instruction *I) {
|
||||
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
|
||||
|
||||
return BaseT::getCastInstrCost(Opcode, Dst, Src);
|
||||
int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src);
|
||||
return vectorCostAdjustment(Cost, Opcode, Dst, Src);
|
||||
}
|
||||
|
||||
int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
|
||||
const Instruction *I) {
|
||||
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
|
||||
int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
|
||||
return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
|
||||
}
|
||||
|
||||
int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
|
||||
@ -365,18 +395,22 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
|
||||
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
||||
assert(ISD && "Invalid opcode");
|
||||
|
||||
int Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
|
||||
Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr);
|
||||
|
||||
if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
|
||||
// Double-precision scalars are already located in index #0.
|
||||
if (Index == 0)
|
||||
// Double-precision scalars are already located in index #0 (or #1 if LE).
|
||||
if (ISD == ISD::EXTRACT_VECTOR_ELT && Index == ST->isLittleEndian() ? 1 : 0)
|
||||
return 0;
|
||||
|
||||
return BaseT::getVectorInstrCost(Opcode, Val, Index);
|
||||
return Cost;
|
||||
|
||||
} else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
|
||||
// Floating point scalars are already located in index #0.
|
||||
if (Index == 0)
|
||||
return 0;
|
||||
|
||||
return BaseT::getVectorInstrCost(Opcode, Val, Index);
|
||||
return Cost;
|
||||
}
|
||||
|
||||
// Estimated cost of a load-hit-store delay. This was obtained
|
||||
@ -393,9 +427,9 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
|
||||
// these need to be estimated as very costly.
|
||||
if (ISD == ISD::EXTRACT_VECTOR_ELT ||
|
||||
ISD == ISD::INSERT_VECTOR_ELT)
|
||||
return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);
|
||||
return LHSPenalty + Cost;
|
||||
|
||||
return BaseT::getVectorInstrCost(Opcode, Val, Index);
|
||||
return Cost;
|
||||
}
|
||||
|
||||
int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||
@ -406,6 +440,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||
"Invalid Opcode");
|
||||
|
||||
int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
|
||||
Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr);
|
||||
|
||||
bool IsAltivecType = ST->hasAltivec() &&
|
||||
(LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
|
||||
|
@ -70,6 +70,7 @@ public:
|
||||
unsigned getCacheLineSize();
|
||||
unsigned getPrefetchDistance();
|
||||
unsigned getMaxInterleaveFactor(unsigned VF);
|
||||
int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2);
|
||||
int getArithmeticInstrCost(
|
||||
unsigned Opcode, Type *Ty,
|
||||
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
|
||||
|
68
llvm/test/Analysis/CostModel/PowerPC/p9.ll
Normal file
68
llvm/test/Analysis/CostModel/PowerPC/p9.ll
Normal file
@ -0,0 +1,68 @@
|
||||
; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s
|
||||
; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -mattr=+vsx | FileCheck --check-prefix=CHECK-P9 %s
|
||||
; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -mattr=+vsx | FileCheck --check-prefix=CHECK-LE %s
|
||||
|
||||
define void @testi16(i16 %arg1, i16 %arg2, i16* %arg3) {
|
||||
|
||||
%s1 = add i16 %arg1, %arg2
|
||||
%s2 = zext i16 %arg1 to i32
|
||||
%s3 = load i16, i16* %arg3
|
||||
store i16 %arg2, i16* %arg3
|
||||
%c = icmp eq i16 %arg1, %arg2
|
||||
|
||||
ret void
|
||||
; CHECK: cost of 1 {{.*}} add
|
||||
; CHECK: cost of 1 {{.*}} zext
|
||||
; CHECK: cost of 1 {{.*}} load
|
||||
; CHECK: cost of 1 {{.*}} store
|
||||
; CHECK: cost of 1 {{.*}} icmp
|
||||
; CHECK-P9: cost of 1 {{.*}} add
|
||||
; CHECK-P9: cost of 1 {{.*}} zext
|
||||
; CHECK-P9: cost of 1 {{.*}} load
|
||||
; CHECK-P9: cost of 1 {{.*}} store
|
||||
; CHECK-P9: cost of 1 {{.*}} icmp
|
||||
}
|
||||
|
||||
define void @test4xi16(<4 x i16> %arg1, <4 x i16> %arg2) {
|
||||
|
||||
%v1 = add <4 x i16> %arg1, %arg2
|
||||
%v2 = zext <4 x i16> %arg1 to <4 x i32>
|
||||
%v3 = shufflevector <4 x i16> %arg1, <4 x i16> undef, <4 x i32> zeroinitializer
|
||||
%c = icmp eq <4 x i16> %arg1, %arg2
|
||||
|
||||
ret void
|
||||
; CHECK: cost of 1 {{.*}} add
|
||||
; CHECK: cost of 1 {{.*}} zext
|
||||
; CHECK: cost of 1 {{.*}} shufflevector
|
||||
; CHECK: cost of 1 {{.*}} icmp
|
||||
; CHECK-P9: cost of 2 {{.*}} add
|
||||
; CHECK-P9: cost of 2 {{.*}} zext
|
||||
; CHECK-P9: cost of 2 {{.*}} shufflevector
|
||||
; CHECK-P9: cost of 2 {{.*}} icmp
|
||||
}
|
||||
|
||||
define void @test4xi32(<4 x i32> %arg1, <4 x i32> %arg2, <4 x i32>* %arg3) {
|
||||
|
||||
%v1 = load <4 x i32>, <4 x i32>* %arg3
|
||||
store <4 x i32> %arg2, <4 x i32>* %arg3
|
||||
|
||||
ret void
|
||||
; CHECK: cost of 1 {{.*}} load
|
||||
; CHECK: cost of 1 {{.*}} store
|
||||
; CHECK-P9: cost of 2 {{.*}} load
|
||||
; CHECK-P9: cost of 2 {{.*}} store
|
||||
}
|
||||
|
||||
define void @test2xdouble(<2 x double> %arg1) {
|
||||
%v1 = extractelement <2 x double> %arg1, i32 0
|
||||
%v2 = extractelement <2 x double> %arg1, i32 1
|
||||
|
||||
ret void
|
||||
; CHECK: cost of 0 {{.*}} extractelement
|
||||
; CHECK: cost of 1 {{.*}} extractelement
|
||||
; CHECK-P9: cost of 0 {{.*}} extractelement
|
||||
; CHECK-P9: cost of 2 {{.*}} extractelement
|
||||
; CHECK-LE-LABEL: test2xdouble
|
||||
; CHECK-LE: cost of 2 {{.*}} extractelement
|
||||
; CHECK-LE: cost of 0 {{.*}} extractelement
|
||||
}
|
@ -0,0 +1,39 @@
|
||||
; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P9
|
||||
; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P8
|
||||
|
||||
%struct._pp = type { i16, i16, i16, i16 }
|
||||
|
||||
; Function Attrs: norecurse nounwind readonly
|
||||
define [5 x double] @foo(double %k, i64 %n, %struct._pp* nocapture readonly %p) local_unnamed_addr #0 {
|
||||
entry:
|
||||
%cmp17 = icmp sgt i64 %n, 0
|
||||
br i1 %cmp17, label %for.body, label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body, %entry
|
||||
%retval.sroa.0.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
|
||||
%retval.sroa.4.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add10, %for.body ]
|
||||
%.fca.0.insert = insertvalue [5 x double] undef, double %retval.sroa.0.0.lcssa, 0
|
||||
%.fca.1.insert = insertvalue [5 x double] %.fca.0.insert, double %retval.sroa.4.0.lcssa, 1
|
||||
ret [5 x double] %.fca.1.insert
|
||||
|
||||
for.body: ; preds = %entry, %for.body
|
||||
%i.020 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
|
||||
%retval.sroa.4.019 = phi double [ %add10, %for.body ], [ 0.000000e+00, %entry ]
|
||||
%retval.sroa.0.018 = phi double [ %add, %for.body ], [ 0.000000e+00, %entry ]
|
||||
%r1 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 2
|
||||
%0 = load i16, i16* %r1, align 2
|
||||
%conv2 = uitofp i16 %0 to double
|
||||
%mul = fmul double %conv2, %k
|
||||
%add = fadd double %retval.sroa.0.018, %mul
|
||||
%g5 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 1
|
||||
%1 = load i16, i16* %g5, align 2
|
||||
%conv7 = uitofp i16 %1 to double
|
||||
%mul8 = fmul double %conv7, %k
|
||||
%add10 = fadd double %retval.sroa.4.019, %mul8
|
||||
%inc = add nuw nsw i64 %i.020, 1
|
||||
%exitcond = icmp eq i64 %inc, %n
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
||||
}
|
||||
|
||||
; CHECK-P8: load <2 x i16>
|
||||
; CHECK-P9-NOT: load <2 x i16>
|
Loading…
x
Reference in New Issue
Block a user