[AMDGPU] Add a new pass to insert waitcnts. Leave under an option for testing.

Based on comments in https://reviews.llvm.org/D31161.

llvm-svn: 300023
This commit is contained in:
Kannan Narayanan 2017-04-12 03:25:12 +00:00
parent 046cd27262
commit fd367b26fb
7 changed files with 1881 additions and 6 deletions

View File

@ -48,6 +48,7 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
@ -125,6 +126,9 @@ extern char &SIDebuggerInsertNopsID;
void initializeSIInsertWaitsPass(PassRegistry&);
extern char &SIInsertWaitsID;
void initializeSIInsertWaitcntsPass(PassRegistry&);
extern char &SIInsertWaitcntsID;
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
extern char &AMDGPUUnifyDivergentExitNodesID;

View File

@ -112,6 +112,12 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
cl::desc("Enable AMDGPU Alias Analysis"),
cl::init(true));
// Option to enable new waitcnt insertion pass.
static cl::opt<bool> EnableSIInsertWaitcntsPass(
"enable-si-insert-waitcnts",
cl::desc("Use new waitcnt insertion pass"),
cl::init(false));
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@ -134,6 +140,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertWaitsPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
@ -810,7 +817,10 @@ void GCNPassConfig::addPreEmitPass() {
// cases.
addPass(&PostRAHazardRecognizerID);
addPass(createSIInsertWaitsPass());
if (EnableSIInsertWaitcntsPass)
addPass(createSIInsertWaitcntsPass());
else
addPass(createSIInsertWaitsPass());
addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
addPass(createSIDebuggerInsertNopsPass());

View File

@ -82,6 +82,7 @@ add_llvm_target(AMDGPUCodeGen
SIFrameLowering.cpp
SIInsertSkips.cpp
SIInsertWaits.cpp
SIInsertWaitcnts.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
SILoadStoreOptimizer.cpp

View File

@ -253,6 +253,8 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
[(set i32:$vdst,
(node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
let LGKM_CNT = 0;
let mayLoad = 0;
let mayStore = 0;
let isConvergent = 1;

File diff suppressed because it is too large Load Diff

View File

@ -4,7 +4,6 @@ declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
; FUNC-LABEL: {{^}}ds_bpermute:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CHECK: s_waitcnt lgkmcnt
define amdgpu_kernel void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
store i32 %bpermute, i32 addrspace(1)* %out, align 4
@ -13,7 +12,6 @@ define amdgpu_kernel void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %
; CHECK-LABEL: {{^}}ds_bpermute_imm_offset:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
; CHECK: s_waitcnt lgkmcnt
define amdgpu_kernel void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
%index = add i32 %base_index, 4
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
@ -23,7 +21,6 @@ define amdgpu_kernel void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %b
; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
; CHECK: s_waitcnt lgkmcnt
define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
store i32 %bpermute, i32 addrspace(1)* %out, align 4

View File

@ -4,7 +4,6 @@ declare i32 @llvm.amdgcn.ds.permute(i32, i32) #0
; CHECK-LABEL: {{^}}ds_permute:
; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CHECK: s_waitcnt lgkmcnt
define amdgpu_kernel void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
%bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
store i32 %bpermute, i32 addrspace(1)* %out, align 4
@ -13,7 +12,6 @@ define amdgpu_kernel void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %s
; CHECK-LABEL: {{^}}ds_permute_imm_offset:
; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
; CHECK: s_waitcnt lgkmcnt
define amdgpu_kernel void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
%index = add i32 %base_index, 4
%bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0