From a4d045ea56bfde337964974c7b5f1e7b08cdf907 Mon Sep 17 00:00:00 2001 From: Alexander Timofeev Date: Wed, 25 Apr 2018 12:32:46 +0000 Subject: [PATCH] [AMDGPU] Revert b0efc4fd6 (https://reviews.llvm.org/D40556) llvm-svn: 330818 --- lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 79 ++++--------------- .../AMDGPU/control-flow-fastregalloc.ll | 7 +- test/CodeGen/AMDGPU/uniform-PHI.ll | 39 --------- .../AMDGPU/uniform-loop-inside-nonuniform.ll | 4 +- test/CodeGen/AMDGPU/valu-i1.ll | 4 +- 5 files changed, 24 insertions(+), 109 deletions(-) delete mode 100644 test/CodeGen/AMDGPU/uniform-PHI.ll diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 8b155c2d278..e26bc99bd4b 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -81,7 +81,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" @@ -110,12 +109,7 @@ namespace { class SIFixSGPRCopies : public MachineFunctionPass { MachineDominatorTree *MDT; - MachinePostDominatorTree *MPDT; - DenseMap> PDF; - void computePDF(MachineFunction * MF); -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void printPDF(); -#endif + public: static char ID; @@ -128,8 +122,6 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -417,6 +409,12 @@ bool searchPredecessors(const MachineBasicBlock *MBB, return false; } +static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, + const TargetRegisterInfo *TRI) { + return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { + return hasTerminatorThatModifiesExec(*MBB, *TRI); }); +} + // Checks if there is potential path From instruction To instruction. // If CutOff is specified and it sits in between of that path we ignore // a higher portion of the path and report it is not reachable. @@ -567,47 +565,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, return Changed; } -void SIFixSGPRCopies::computePDF(MachineFunction *MF) { - MachineFunction::iterator B = MF->begin(); - MachineFunction::iterator E = MF->end(); - for (; B != E; ++B) { - if (B->succ_size() > 1) { - for (auto S : B->successors()) { - MachineDomTreeNode *runner = MPDT->getNode(&*S); - MachineDomTreeNode *sentinel = MPDT->getNode(&*B)->getIDom(); - while (runner && runner != sentinel) { - PDF[runner->getBlock()].insert(&*B); - runner = runner->getIDom(); - } - } - } - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void SIFixSGPRCopies::printPDF() { - dbgs() << "\n######## PostDominanceFrontiers set #########\n"; - for (auto &I : PDF) { - dbgs() << "PDF[ " << I.first->getNumber() << "] : "; - for (auto &J : I.second) { - dbgs() << J->getNumber() << ' '; - } - dbgs() << '\n'; - } - dbgs() << "\n##############################################\n"; -} -#endif - bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget(); MachineRegisterInfo &MRI = MF.getRegInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); MDT = &getAnalysis(); - MPDT = &getAnalysis(); - PDF.clear(); - computePDF(&MF); - DEBUG(printPDF()); SmallVector Worklist; @@ -661,27 +624,15 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; - // We don't need to fix the PHI if all the source blocks - // have no divergent control dependecies + // We don't need to fix the PHI if the common dominator of the + // two incoming blocks terminates with a uniform branch. bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII); - if (!HasVGPROperand) { - bool Uniform = true; - MachineBasicBlock * Join = MI.getParent(); - for (auto &O : MI.explicit_operands()) { - if (O.isMBB()) { - MachineBasicBlock * Source = O.getMBB(); - SetVector &SourcePDF = PDF[Source]; - SetVector &JoinPDF = PDF[Join]; - SetVector CDList; - for (auto &I : SourcePDF) { - if (!JoinPDF.count(I) || /* back edge */MDT->dominates(Join, I)) { - if (hasTerminatorThatModifiesExec(*I, *TRI)) - Uniform = false; - } - } - } - } - if (Uniform) { + if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) { + MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); + MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); + + if (!predsHasDivergentTerminator(MBB0, TRI) && + !predsHasDivergentTerminator(MBB1, TRI)) { DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n'); break; } diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index c51b8e0efa0..f8a0de6663b 100644 --- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -90,7 +90,7 @@ endif: } ; GCN-LABEL: {{^}}divergent_loop: -; VGPR: workitem_private_segment_byte_size = 12{{$}} +; VGPR: workitem_private_segment_byte_size = 16{{$}} ; GCN: {{^}}; %bb.0: @@ -124,9 +124,10 @@ endif: ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]] -; GCN: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN: v_cmp_ne_u32_e32 vcc, +; GCN: s_and_b64 vcc, exec, vcc ; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN-NEXT: s_cbranch_scc1 [[LOOP]] +; GCN-NEXT: s_cbranch_vccnz [[LOOP]] ; GCN: [[END]]: diff --git a/test/CodeGen/AMDGPU/uniform-PHI.ll b/test/CodeGen/AMDGPU/uniform-PHI.ll deleted file mode 100644 index 3cb86b39a65..00000000000 --- a/test/CodeGen/AMDGPU/uniform-PHI.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: BB0_2 -; GCN-NOT: v_readfirstlane - - -target triple = "amdgcn--amdhsa" -define amdgpu_kernel void @uniform-PHI(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) { -bb: - %tmp = sext i32 %arg2 to i64 - %tmp3 = tail call i64 @_Z13get_global_idj(i32 0) #2 - %tmp4 = icmp ugt i64 %tmp3, %tmp - %tmp5 = icmp sgt i32 %arg2, 0 - %tmp6 = and i1 %tmp4, %tmp5 - br i1 %tmp6, label %bb7, label %bb17 - -bb7: ; preds = %bb - br label %bb8 - -bb8: ; preds = %bb8, %bb7 - %tmp9 = phi i32 [ %tmp15, %bb8 ], [ 0, %bb7 ] - %tmp10 = phi i32 [ %tmp14, %bb8 ], [ 0, %bb7 ] - %tmp11 = zext i32 %tmp9 to i64 - %tmp12 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp11 - %tmp13 = load i32, i32 addrspace(1)* %tmp12, align 4 - %tmp14 = add nsw i32 %tmp13, %tmp10 - %tmp15 = add nuw nsw i32 %tmp9, 1 - %tmp16 = icmp eq i32 %tmp15, %arg2 - br i1 %tmp16, label %bb17, label %bb8 - -bb17: ; preds = %bb8, %bb - %tmp18 = phi i32 [ 0, %bb ], [ %tmp14, %bb8 ] - store i32 %tmp18, i32 addrspace(1)* %arg1, align 4 - ret void -} - -declare i64 @_Z13get_global_idj(i32) local_unnamed_addr #1 -attributes #1 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "target-features"="+16-bit-insts,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { convergent nounwind readnone } diff --git a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll index 1bbda66fddb..82283f39792 100644 --- a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll +++ b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll @@ -6,10 +6,11 @@ ; CHECK: v_cmp_ne_u32_e32 vcc, 0 ; CHECK: s_and_saveexec_b64 ; CHECK-NEXT: ; mask branch +; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}} ; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader ; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]: -; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]] +; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]] ; CHECK: s_endpgm define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) { @@ -34,6 +35,7 @@ out: ; CHECK-LABEL: {{^}}test2: ; CHECK: s_and_saveexec_b64 ; CHECK-NEXT: ; mask branch +; CHECK-NEXT: s_cbranch_execz define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) { main_body: %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll index 4a3937e44f3..1d0856b47e0 100644 --- a/test/CodeGen/AMDGPU/valu-i1.ll +++ b/test/CodeGen/AMDGPU/valu-i1.ll @@ -162,8 +162,8 @@ exit: ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]: ; SI: buffer_load_dword ; SI-DAG: buffer_store_dword -; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100 -; SI: s_cbranch_scc0 [[LABEL_LOOP]] +; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100 +; SI: s_cbranch_vccz [[LABEL_LOOP]] ; SI: [[LABEL_EXIT]]: ; SI: s_endpgm