From dca409d5ad67b8e4b701a83dad62ea6595885ca2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 27 Jun 2016 20:32:13 +0000
Subject: [PATCH] AMDGPU: Move subtarget feature checks into passes

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@273937 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPU.td                  |  6 ---
 lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp    |  6 ++-
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp        |  1 -
 lib/Target/AMDGPU/AMDGPUSubtarget.h          |  5 ---
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp    | 44 +++++++++++++-------
 lib/Target/AMDGPU/SILoadStoreOptimizer.cpp   |  3 ++
 test/CodeGen/AMDGPU/captured-frame-index.ll  |  2 +-
 test/CodeGen/AMDGPU/cgp-addressing-modes.ll  |  6 +--
 test/CodeGen/AMDGPU/extload-private.ll       |  6 +--
 test/CodeGen/AMDGPU/parallelandifcollapse.ll |  3 +-
 test/CodeGen/AMDGPU/structurize1.ll          |  2 +-
 11 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 7736fd6c4cf..607e8d9bfdd 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -241,12 +241,6 @@ def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <
   "Force using DS instruction immediate offsets on SI"
 >;
 
-def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
-  "EnableIfCvt",
-  "false",
-  "Disable the if conversion pass"
->;
-
 def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
   "EnableSIScheduler",
   "true",
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 17b45fa65f1..fa8709e4f2b 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -124,6 +124,10 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   if (!TM || skipFunction(F))
     return false;
 
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+  if (!ST.isPromoteAllocaEnabled())
+    return false;
+
   FunctionType *FTy = F.getFunctionType();
 
   // If the function has any arguments in the local address space, then it's
@@ -139,8 +143,6 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
     }
   }
 
-  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
-
   LocalMemLimit = ST.getLocalMemorySize();
   if (LocalMemLimit == 0)
     return false;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index e973f8e4837..39032b682e1 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -105,7 +105,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
 
     EnableVGPRSpilling(false),
     EnablePromoteAlloca(false),
-    EnableIfCvt(true),
     EnableLoadStoreOpt(false),
     EnableUnsafeDSOffsetFolding(false),
     EnableSIScheduler(false),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 53117e3cb60..9a0adf1b166 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -82,7 +82,6 @@ protected:
   // Used as options.
   bool EnableVGPRSpilling;
   bool EnablePromoteAlloca;
-  bool EnableIfCvt;
   bool EnableLoadStoreOpt;
   bool EnableUnsafeDSOffsetFolding;
   bool EnableSIScheduler;
@@ -222,10 +221,6 @@ public:
     return EnablePromoteAlloca;
   }
 
-  bool isIfCvtEnabled() const {
-    return EnableIfCvt;
-  }
-
   bool unsafeDSOffsetFoldingEnabled() const {
     return EnableUnsafeDSOffsetFolding;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 54a28fde83f..162bbc2f91c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -45,6 +45,18 @@ static cl::opt<bool> EnableR600StructurizeCFG(
   cl::desc("Use StructurizeCFG IR pass"),
   cl::init(true));
 
+static cl::opt<bool> EnableSROA(
+  "amdgpu-sroa",
+  cl::desc("Run SROA after promote alloca pass"),
+  cl::ReallyHidden,
+  cl::init(true));
+
+static cl::opt<bool> EnableR600IfConvert(
+  "r600-if-convert",
+  cl::desc("Use if conversion pass"),
+  cl::ReallyHidden,
+  cl::init(true));
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
@@ -212,12 +224,7 @@ public:
   }
 
   ScheduleDAGInstrs *
-  createMachineScheduler(MachineSchedContext *C) const override {
-    const SISubtarget *ST = getGCNTargetMachine().getSubtargetImpl();
-    if (ST->enableSIScheduler())
-      return createSIMachineScheduler(C);
-    return nullptr;
-  }
+  createMachineScheduler(MachineSchedContext *C) const override;
 
   bool addPreISel() override;
   void addMachineSSAOptimization() override;
@@ -285,10 +292,11 @@ void AMDGPUPassConfig::addIRPasses() {
   addPass(createAMDGPUOpenCLImageTypeLoweringPass());
 
   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
-  const AMDGPUSubtarget &ST = *TM.getSubtargetImpl();
-  if (TM.getOptLevel() > CodeGenOpt::None && ST.isPromoteAllocaEnabled()) {
+  if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createAMDGPUPromoteAlloca(&TM));
-    addPass(createSROAPass());
+
+    if (EnableSROA)
+      addPass(createSROAPass());
   }
 
   addStraightLineScalarOptimizationPasses();
@@ -344,9 +352,8 @@ void R600PassConfig::addPreRegAlloc() {
 }
 
 void R600PassConfig::addPreSched2() {
-  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
   addPass(createR600EmitClauseMarkers(), false);
-  if (ST.isIfCvtEnabled())
+  if (EnableR600IfConvert)
     addPass(&IfConverterID, false);
   addPass(createR600ClauseMergePass(*TM), false);
 }
@@ -367,6 +374,14 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
 // GCN Pass Setup
 //===----------------------------------------------------------------------===//
 
+ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
+  MachineSchedContext *C) const {
+  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
+  if (ST.enableSIScheduler())
+    return createSIMachineScheduler(C);
+  return nullptr;
+}
+
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
@@ -415,8 +430,6 @@ bool GCNPassConfig::addRegBankSelect() {
 #endif
 
 void GCNPassConfig::addPreRegAlloc() {
-  const SISubtarget &ST = *getGCNTargetMachine().getSubtargetImpl();
-
   // This needs to be run directly before register allocation because
   // earlier passes might recompute live intervals.
   // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
@@ -424,15 +437,18 @@
     insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
   }
 
-  if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
+  if (getOptLevel() > CodeGenOpt::None) {
     // Don't do this with no optimizations since it throws away debug info by
     // merging nonadjacent loads.
 
     // This should be run after scheduling, but before register allocation. It
     // also need extra copies to the address operand to be eliminated.
+
+    // FIXME: Move pre-RA and remove extra reg coalescer run.
     insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
     insertPass(&MachineSchedulerID, &RegisterCoalescerID);
   }
+
   addPass(createSIShrinkInstructionsPass());
   addPass(createSIWholeQuadModePass());
 }
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 0b9b29a54b5..9e972a569a0 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -412,6 +412,9 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
     return false;
 
   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+  if (!STM.loadStoreOptEnabled())
+    return false;
+
   TII = STM.getInstrInfo();
   TRI = &TII->getRegisterInfo();
 
diff --git a/test/CodeGen/AMDGPU/captured-frame-index.ll b/test/CodeGen/AMDGPU/captured-frame-index.ll
index 978b6da7b7c..161c46b486e 100644
--- a/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}stored_fi_to_lds:
 ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index ae419a6a353..a0857273e3e 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -1,9 +1,9 @@
 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; OPT-LABEL: @test_sink_global_small_offset_i32(
 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/extload-private.ll b/test/CodeGen/AMDGPU/extload-private.ll
index 294c3a9c678..3f27370d703 100644
--- a/test/CodeGen/AMDGPU/extload-private.ll
+++ b/test/CodeGen/AMDGPU/extload-private.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}load_i8_sext_private:
 ; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
@@ -39,7 +39,7 @@ entry:
 define void @load_i16_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16
-  %tmp1 = load i16, i16* %tmp0
+  %tmp1 = load volatile i16, i16* %tmp0
   %tmp2 = zext i16 %tmp1 to i32
   store i32 %tmp2, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
index f32b044198a..ea943a533c8 100644
--- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll
+++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
@@ -1,5 +1,4 @@
-; Function Attrs: nounwind
-; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck %s
 ;
 ; CFG flattening should use parallel-and mode to generate branch conditions and
 ; then merge if-regions with the same bodies.
diff --git a/test/CodeGen/AMDGPU/structurize1.ll b/test/CodeGen/AMDGPU/structurize1.ll
index 77432c1f9d2..db0f50247e3 100644
--- a/test/CodeGen/AMDGPU/structurize1.ll
+++ b/test/CodeGen/AMDGPU/structurize1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -r600-if-convert=0 < %s | FileCheck %s
 
 ; This tests for a bug where the AMDILCFGStructurizer was crashing on loops
 ; like this: