AMDGPU: Split AMDGPUTTI into GCNTTI and R600TTI

Reviewers: arsenm, nhaehnle

Reviewed By: arsenm

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye

Differential Revision: https://reviews.llvm.org/D47359

llvm-svn: 333605
Tom Stellard 2018-05-30 22:55:35 +00:00
parent c0190a4b57
commit d68071370f
4 changed files with 212 additions and 42 deletions
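Note for readers skimming the diff below: the split follows a composition pattern. AMDGPUTTIImpl shrinks down to the logic both subtargets share, and the new GCNTTIImpl and R600TTIImpl each embed one as a CommonTTI member and forward to it. A minimal, self-contained C++ sketch of that shape (the *Model names and the threshold value are illustrative stand-ins, not the in-tree classes):

#include <iostream>

// Stand-in for the slimmed-down AMDGPUTTIImpl: only logic shared by both
// subtargets lives here (in the real patch, getUnrollingPreferences).
struct CommonTTIModel {
  void getUnrollingPreferences(int &Threshold) const { Threshold = 300; }
};

// Stand-ins for GCNTTIImpl / R600TTIImpl: each embeds the common part and
// forwards to it, rather than inheriting from it.
struct GCNTTIModel {
  CommonTTIModel CommonTTI;
  void getUnrollingPreferences(int &Threshold) const {
    CommonTTI.getUnrollingPreferences(Threshold); // forward shared logic
  }
  unsigned getHardwareNumberOfRegisters() const { return 256; } // GCN-specific
};

struct R600TTIModel {
  CommonTTIModel CommonTTI;
  void getUnrollingPreferences(int &Threshold) const {
    CommonTTI.getUnrollingPreferences(Threshold); // same forwarding
  }
  unsigned getHardwareNumberOfRegisters() const { return 4 * 128; } // R600-specific
};

int main() {
  GCNTTIModel GCN;
  int Threshold = 0;
  GCN.getUnrollingPreferences(Threshold);
  std::cout << Threshold << " " << GCN.getHardwareNumberOfRegisters() << "\n";
  // prints: 300 256
}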

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

@@ -439,6 +439,11 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
   return I.get();
 }
 
+TargetTransformInfo
+R600TargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(R600TTIImpl(this, F));
+}
+
 //===----------------------------------------------------------------------===//
 // GCN Target Machine (SI+)
 //===----------------------------------------------------------------------===//
@@ -472,6 +477,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
   return I.get();
 }
 
+TargetTransformInfo
+GCNTargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(GCNTTIImpl(this, F));
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPU Pass Setup
 //===----------------------------------------------------------------------===//
@@ -561,11 +571,6 @@ public:
 } // end anonymous namespace
 
-TargetTransformInfo
-AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) {
-  return TargetTransformInfo(AMDGPUTTIImpl(this, F));
-}
-
 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
   if (getOptLevel() == CodeGenOpt::Aggressive)
     addPass(createGVNPass());
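With the base-class override above deleted, each target machine now hands back its own TTI wrapper. A toy model of that virtual dispatch, under illustrative names (not LLVM's real signatures):

#include <iostream>
#include <memory>

// Toy stand-in for TargetTransformInfo: one queryable property.
struct TTIModel { unsigned NumHardwareRegs; };

struct TargetMachineModel {
  virtual ~TargetMachineModel() = default;
  // Plays the role of TargetMachine::getTargetTransformInfo(const Function &).
  virtual TTIModel getTargetTransformInfo() const = 0;
};

struct R600TargetMachineModel final : TargetMachineModel {
  TTIModel getTargetTransformInfo() const override {
    return TTIModel{4 * 128}; // would wrap an R600TTIImpl in the real code
  }
};

struct GCNTargetMachineModel final : TargetMachineModel {
  TTIModel getTargetTransformInfo() const override {
    return TTIModel{256}; // would wrap a GCNTTIImpl in the real code
  }
};

int main() {
  std::unique_ptr<TargetMachineModel> TM =
      std::make_unique<GCNTargetMachineModel>();
  std::cout << TM->getTargetTransformInfo().NumHardwareRegs << "\n"; // 256
}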

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h

@@ -55,7 +55,6 @@ public:
   const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
     return &IntrinsicInfo;
   }
-  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
@@ -91,6 +90,8 @@ public:
   const R600Subtarget *getSubtargetImpl(const Function &) const override;
 
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
   bool isMachineVerifierClean() const override {
     return false;
   }
@@ -114,6 +115,8 @@ public:
   const SISubtarget *getSubtargetImpl(const Function &) const override;
 
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
   bool useIPRA() const override {
     return true;
   }

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

@@ -211,32 +211,27 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
-unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
   // The concept of vector registers doesn't really exist. Some packed vector
   // operations operate on the normal 32-bit registers.
-
-  // Number of VGPRs on SI.
-  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
-    return 256;
-
-  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+  return 256;
 }
 
-unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
+unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
   // This is really the number of registers to fill when vectorizing /
   // interleaving loops, so we lie to avoid trying to use all registers.
   return getHardwareNumberOfRegisters(Vec) >> 3;
 }
 
-unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
+unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
   return 32;
 }
 
-unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
+unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
   return 32;
 }
 
-unsigned AMDGPUTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
+unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                             unsigned ChainSizeInBytes,
                                             VectorType *VecTy) const {
   unsigned VecRegBitWidth = VF * LoadSize;
@@ -247,7 +242,7 @@ unsigned AMDGPUTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
   return VF;
 }
 
-unsigned AMDGPUTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
+unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                              unsigned ChainSizeInBytes,
                                              VectorType *VecTy) const {
   unsigned VecRegBitWidth = VF * StoreSize;
@@ -257,13 +252,11 @@ unsigned AMDGPUTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
   return VF;
 }
 
-unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
+unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
   AMDGPUAS AS = ST->getAMDGPUAS();
   if (AddrSpace == AS.GLOBAL_ADDRESS ||
       AddrSpace == AS.CONSTANT_ADDRESS ||
       AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
-    if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
-      return 128;
     return 512;
   }
@@ -275,16 +268,10 @@ unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
   if (AddrSpace == AS.PRIVATE_ADDRESS)
     return 8 * ST->getMaxPrivateElementSize();
 
-  if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
-      (AddrSpace == AS.PARAM_D_ADDRESS ||
-       AddrSpace == AS.PARAM_I_ADDRESS ||
-       (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
-        AddrSpace <= AS.CONSTANT_BUFFER_15)))
-    return 128;
-
   llvm_unreachable("unhandled address space");
 }
 
-bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                                unsigned Alignment,
                                                unsigned AddrSpace) const {
   // We allow vectorization of flat stores, even though we may need to decompose
@@ -297,19 +284,19 @@ bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
   return true;
 }
 
-bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                                 unsigned Alignment,
                                                 unsigned AddrSpace) const {
   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
 }
 
-bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                                  unsigned Alignment,
                                                  unsigned AddrSpace) const {
   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
 }
 
-unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // Disable unrolling if the loop is not vectorized.
   // TODO: Enable this again.
   if (VF == 1)
@@ -318,7 +305,7 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   return 8;
 }
 
-bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) const {
   switch (Inst->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
@@ -347,7 +334,7 @@ bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
   }
 }
 
-int AMDGPUTTIImpl::getArithmeticInstrCost(
+int GCNTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
@@ -457,7 +444,7 @@ int AMDGPUTTIImpl::getArithmeticInstrCost(
                                        Opd1PropInfo, Opd2PropInfo);
 }
 
-unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
+unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
   // XXX - For some reason this isn't called for switch.
   switch (Opcode) {
   case Instruction::Br:
@@ -468,7 +455,7 @@ unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
   }
 }
 
-int AMDGPUTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                               bool IsPairwise) {
   EVT OrigTy = TLI->getValueType(DL, Ty);
@@ -483,7 +470,7 @@ int AMDGPUTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
   return LT.first * getFullRateInstrCost();
 }
 
-int AMDGPUTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
+int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                           bool IsPairwise,
                                           bool IsUnsigned) {
   EVT OrigTy = TLI->getValueType(DL, Ty);
@@ -499,7 +486,7 @@ int AMDGPUTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
   return LT.first * getHalfRateInstrCost();
 }
 
-int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                       unsigned Index) {
   switch (Opcode) {
   case Instruction::ExtractElement:
@@ -554,7 +541,7 @@ static bool isArgPassedInSGPR(const Argument *A) {
 
 /// \returns true if the result of the value could potentially be
 /// different across workitems in a wavefront.
-bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
+bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
   if (const Argument *A = dyn_cast<Argument>(V))
     return !isArgPassedInSGPR(A);
@@ -584,7 +571,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
   return false;
 }
 
-bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
+bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
     switch (Intrinsic->getIntrinsicID()) {
     default:
@@ -597,7 +584,7 @@ bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
   return false;
 }
 
-unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                        Type *SubTp) {
   if (ST->hasVOP3PInsts()) {
     VectorType *VT = cast<VectorType>(Tp);
@@ -620,7 +607,7 @@ unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Inde
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
-bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
+bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
   const FeatureBitset &CallerBits =
@@ -632,3 +619,114 @@ bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
   return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
 }
+
+void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+                                         TTI::UnrollingPreferences &UP) {
+  CommonTTI.getUnrollingPreferences(L, SE, UP);
+}
+
+unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+}
+
+unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
+  return getHardwareNumberOfRegisters(Vec);
+}
+
+unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
+  return 32;
+}
+
+unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
+  return 32;
+}
+
+unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
+  AMDGPUAS AS = ST->getAMDGPUAS();
+  if (AddrSpace == AS.GLOBAL_ADDRESS ||
+      AddrSpace == AS.CONSTANT_ADDRESS)
+    return 128;
+  if (AddrSpace == AS.LOCAL_ADDRESS ||
+      AddrSpace == AS.REGION_ADDRESS)
+    return 64;
+  if (AddrSpace == AS.PRIVATE_ADDRESS)
+    return 32;
+
+  if ((AddrSpace == AS.PARAM_D_ADDRESS ||
+       AddrSpace == AS.PARAM_I_ADDRESS ||
+       (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
+        AddrSpace <= AS.CONSTANT_BUFFER_15)))
+    return 128;
+  llvm_unreachable("unhandled address space");
+}
+
+bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+                                             unsigned Alignment,
+                                             unsigned AddrSpace) const {
+  // We allow vectorization of flat stores, even though we may need to decompose
+  // them later if they may access private memory. We don't have enough context
+  // here, and legalization can handle it.
+  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS)
+    return false;
+
+  return true;
+}
+
+bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+                                              unsigned Alignment,
+                                              unsigned AddrSpace) const {
+  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                               unsigned Alignment,
+                                               unsigned AddrSpace) const {
+  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  // Disable unrolling if the loop is not vectorized.
+  // TODO: Enable this again.
+  if (VF == 1)
+    return 1;
+
+  return 8;
+}
+
+unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
+  // XXX - For some reason this isn't called for switch.
+  switch (Opcode) {
+  case Instruction::Br:
+  case Instruction::Ret:
+    return 10;
+  default:
+    return BaseT::getCFInstrCost(Opcode);
+  }
+}
+
+int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                    unsigned Index) {
+  switch (Opcode) {
+  case Instruction::ExtractElement:
+  case Instruction::InsertElement: {
+    unsigned EltSize
+      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
+    if (EltSize < 32) {
+      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+    }
+
+    // Extracts are just reads of a subregister, so are free. Inserts are
+    // considered free because we don't want to have any cost for scalarizing
+    // operations, and we don't have to copy into a different register class.
+
+    // Dynamic indexing isn't free and is best avoided.
+    return Index == ~0u ? 2 : 0;
+  }
+  default:
+    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+  }
+}
+
+void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+                                          TTI::UnrollingPreferences &UP) {
+  CommonTTI.getUnrollingPreferences(L, SE, UP);
+}
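One detail worth noting in GCNTTIImpl::areInlineCompatible above: after masking off ignorable features, inlining is allowed only when the callee's feature bits are a subset of the caller's, via (RealCallerBits & RealCalleeBits) == RealCalleeBits. The same subset test on a plain std::bitset, as a hedged illustration (llvm::FeatureBitset is the real type; the bit patterns here are made up):

#include <bitset>
#include <cassert>

int main() {
  using Features = std::bitset<8>;    // stand-in for llvm::FeatureBitset
  const Features Ignored("00000001"); // features that never block inlining

  const Features Caller("10110011");
  const Features Callee("00110001");
  const Features RealCaller = Caller & ~Ignored;
  const Features RealCallee = Callee & ~Ignored;

  // Subset test: every feature the callee relies on, the caller also has.
  assert((RealCaller & RealCallee) == RealCallee);

  // A callee needing a feature the caller lacks fails the test.
  const Features Needy("01000000");
  assert((RealCaller & Needy) != Needy);
  return 0;
}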

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

@@ -47,6 +47,29 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
   const AMDGPUSubtarget *ST;
   const AMDGPUTargetLowering *TLI;
 
 public:
+  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+    : BaseT(TM, F.getParent()->getDataLayout()),
+      ST(TM->getSubtargetImpl(F)),
+      TLI(ST->getTargetLowering()) {}
+
+  const AMDGPUSubtarget *getST() const { return ST; }
+  const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+                               TTI::UnrollingPreferences &UP);
+};
+
+class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
+  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
+  using TTI = TargetTransformInfo;
+
+  friend BaseT;
+
+  const AMDGPUSubtarget *ST;
+  const AMDGPUTargetLowering *TLI;
+  AMDGPUTTIImpl CommonTTI;
   bool IsGraphicsShader;
 
   const FeatureBitset InlineFeatureIgnoreList = {
@@ -99,10 +122,11 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
   }
 
 public:
-  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
     : BaseT(TM, F.getParent()->getDataLayout()),
       ST(TM->getSubtargetImpl(F)),
       TLI(ST->getTargetLowering()),
+      CommonTTI(TM, F),
       IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
 
   bool hasBranchDivergence() { return true; }
@@ -182,6 +206,46 @@ public:
                             bool IsUnsigned);
 };
 
+class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
+  using BaseT = BasicTTIImplBase<R600TTIImpl>;
+  using TTI = TargetTransformInfo;
+
+  friend BaseT;
+
+  const AMDGPUSubtarget *ST;
+  const AMDGPUTargetLowering *TLI;
+  AMDGPUTTIImpl CommonTTI;
+
+public:
+  explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+    : BaseT(TM, F.getParent()->getDataLayout()),
+      ST(TM->getSubtargetImpl(F)),
+      TLI(ST->getTargetLowering()),
+      CommonTTI(TM, F) {}
+
+  const AMDGPUSubtarget *getST() const { return ST; }
+  const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+                               TTI::UnrollingPreferences &UP);
+  unsigned getHardwareNumberOfRegisters(bool Vec) const;
+  unsigned getNumberOfRegisters(bool Vec) const;
+  unsigned getRegisterBitWidth(bool Vector) const;
+  unsigned getMinVectorRegisterBitWidth() const;
+  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
+                                  unsigned AddrSpace) const;
+  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+                                   unsigned Alignment,
+                                   unsigned AddrSpace) const;
+  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                    unsigned Alignment,
+                                    unsigned AddrSpace) const;
+  unsigned getMaxInterleaveFactor(unsigned VF);
+  unsigned getCFInstrCost(unsigned Opcode);
+  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+};
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
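Finally, note that all three classes derive from BasicTTIImplBase<Derived>, a CRTP base: shared cost logic in the base calls hooks on the concrete class without virtual dispatch, which is why each new class must re-declare its own ST/TLI accessors. A compact, self-contained sketch of the mechanism (illustrative names and numbers, not the real BasicTTIImplBase interface):

#include <iostream>

// CRTP base in the spirit of BasicTTIImplBase<T>: generic logic that reaches
// into the derived class statically.
template <typename T> struct TTIImplBase {
  unsigned getVectorizationBudget() const {
    // Resolved at compile time; no vtable involved.
    return static_cast<const T *>(this)->getNumberOfRegisters() * 32;
  }
};

struct GCNModel : TTIImplBase<GCNModel> {
  unsigned getNumberOfRegisters() const { return 256 >> 3; } // as in GCNTTIImpl
};

struct R600Model : TTIImplBase<R600Model> {
  unsigned getNumberOfRegisters() const { return 4 * 128; } // as in R600TTIImpl
};

int main() {
  std::cout << GCNModel().getVectorizationBudget() << " "
            << R600Model().getVectorizationBudget() << "\n"; // 1024 16384
}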