[AMDGPU] Make note of existing waitcnt instrs; this is add-on work related to suppression of redundant waitcnt instrs. It is necessary to make note of these existing waitcnt instrs so that we do not fall into an infinite loop when handling loops. Also, [NFC] some minor code clean-up.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@325524 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Mark Searles 2018-02-19 19:19:59 +00:00
parent 176cbb368e
commit 04c7451aa8
2 changed files with 28 additions and 29 deletions

View File

@ -1125,7 +1125,8 @@ void SIInsertWaitcnts::generateSWaitCntInstBefore(
if (EmitSwaitcnt != 0) {
MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
if (!OldWaitcnt ||
(AMDGPU::decodeVmcnt(IV, Imm) !=
(CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
(AMDGPU::decodeExpcnt(IV, Imm) !=
(CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
@ -1143,7 +1144,7 @@ void SIInsertWaitcnts::generateSWaitCntInstBefore(
ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
}
ScoreBracket->setRevisitLoop(true);
DEBUG(dbgs() << "set-revisit: block"
DEBUG(dbgs() << "set-revisit: Block"
<< ContainingLoop->getHeader()->getNumber() << '\n';);
}
}
@ -1151,18 +1152,14 @@ void SIInsertWaitcnts::generateSWaitCntInstBefore(
// Update an existing waitcount, or make a new one.
unsigned Enc = AMDGPU::encodeWaitcnt(IV, CntVal[VM_CNT],
CntVal[EXP_CNT], CntVal[LGKM_CNT]);
// We don't (yet) track waitcnts that existed prior to the waitcnt
// pass (we just skip over them); because the waitcnt pass is ignorant
// of them, it may insert a redundant waitcnt. To avoid this, check
// the prev instr. If it and the to-be-inserted waitcnt are the
// same, keep the prev waitcnt and skip the insertion. We assume that
// whomever. e.g., for memory model, inserted the prev waitcnt really
// wants it there.
// We don't remove waitcnts that existed prior to the waitcnt
// pass. Check if the waitcnt to-be-inserted can be avoided
// or if the prev waitcnt can be updated.
bool insertSWaitInst = true;
for (MachineBasicBlock::iterator I = MI.getIterator(),
B = MI.getParent()->begin();
insertSWaitInst && I != B; --I) {
if (I == MI.getIterator())
if (I == MI.getIterator())
continue;
switch (I->getOpcode()) {
@ -1585,15 +1582,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineInstr &Inst = *Iter;
// Remove any previously existing waitcnts.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
// TODO: Register the old waitcnt and optimize the following waitcnts.
// Leaving the previously existing waitcnts is conservatively correct.
// Leave pre-existing waitcnts, but note their existence via setWaitcnt.
// Remove the waitcnt-pass-generated waitcnts; the pass will add them back
// as needed.
if (!TrackedWaitcntSet.count(&Inst))
++Iter;
else {
ScoreBrackets->setWaitcnt(&Inst);
++Iter;
Inst.removeFromParent();
}
ScoreBrackets->setWaitcnt(&Inst);
continue;
}
@ -1684,8 +1682,8 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (WaitcntData->getIterCnt() > 2) {
// To ensure convergence, need to make wait events at loop footer be no
// more than those from the previous iteration.
// As a simplification, Instead of tracking individual scores and
// generate the precise wait count, just wait on 0.
// As a simplification, instead of tracking individual scores and
// generating the precise wait count, just wait on 0.
bool HasPending = false;
MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@ -1722,7 +1720,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Add this waitcnt to the block. It is either newly created or
// created in previous iterations and added back since block traversal
// always remove waitcnt.
// always removes waitcnts.
insertWaitcntBeforeCF(Block, SWaitInst);
WaitcntData->setWaitcnt(SWaitInst);
}
@ -1788,7 +1786,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
(!BlockWaitcntProcessedSet.count(&MBB))) {
BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
DEBUG(dbgs() << "set-revisit: block"
DEBUG(dbgs() << "set-revisit: Block"
<< ContainingLoop->getHeader()->getNumber() << '\n';);
}
@ -1819,7 +1817,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
WaitcntData->incIterCnt();
DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
continue;
} else {
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();

View File

@ -1,24 +1,25 @@
# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s
# Check that the waitcnt pass does *not* insert a redundant waitcnt instr.
# In this testcase, ensure that pass does not insert redundant S_WAITCNT 127
# or S_WAITCNT 3952
# In this testcase, ensure that pass does not insert redundant S_WAITCNT 3952
...
# CHECK-LABEL: name: waitcnt-no-redundant
# CHECK: DS_READ_B64
# CHECK-NEXT: S_WAITCNT 127
# CHECK: S_WAITCNT 3952
# CHECK-NEXT: FLAT_ATOMIC_CMPSWAP
# CHECK-NEXT: S_WAITCNT 3952
# CHECK-NEXT: BUFFER_WBINVL1_VOL
# CHECK-NEXT: BUFFER_WBINVL1
name: waitcnt-no-redundant
body: |
bb.0:
renamable $vgpr0_vgpr1 = DS_READ_B64 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
S_WAITCNT 127
FLAT_ATOMIC_CMPSWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr3_vgpr4, 0, 0, implicit $exec, implicit $flat_scr
bb.0:
renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
bb.1:
S_WAITCNT 3952
BUFFER_WBINVL1_VOL implicit $exec
S_ENDPGM
FLAT_ATOMIC_CMPSWAP undef renamable $vgpr0_vgpr1, renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
S_WAITCNT 3952
BUFFER_WBINVL1 implicit $exec
S_BRANCH %bb.1
...