AMDGPU: Split SILowerControlFlow into two pieces

Do most of the lowering in a pre-RA pass. Keep the skip jump
insertion late, plus a few other things that require more
work to move out.

One concern I have is now there may be COPY instructions
which do not have the necessary implicit exec uses
if they will be lowered to v_mov_b32.

This has a positive effect on SGPR usage in shader-db.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@279464 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matt Arsenault 2016-08-22 19:33:16 +00:00
parent ce35dd29a5
commit 7517ed227a
8 changed files with 561 additions and 371 deletions

View File

@ -39,7 +39,6 @@ FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSILowerControlFlowPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIDebuggerInsertNopsPass();
@ -69,8 +68,10 @@ void initializeSIWholeQuadModePass(PassRegistry &);
extern char &SIWholeQuadModeID;
void initializeSILowerControlFlowPass(PassRegistry &);
extern char &SILowerControlFlowPassID;
extern char &SILowerControlFlowID;
void initializeSIInsertSkipsPass(PassRegistry &);
extern char &SIInsertSkipsPassID;
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);

View File

@ -80,6 +80,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIInsertWaitsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
}
@ -532,13 +533,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
#endif
void GCNPassConfig::addPreRegAlloc() {
// This needs to be run directly before register allocation because
// earlier passes might recompute live intervals.
// TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
if (getOptLevel() > CodeGenOpt::None) {
insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
}
if (getOptLevel() > CodeGenOpt::None) {
// Don't do this with no optimizations since it throws away debug info by
// merging nonadjacent loads.
@ -556,10 +550,22 @@ void GCNPassConfig::addPreRegAlloc() {
}
// Register-allocation setup for the fast allocator: schedules
// SILowerControlFlow relative to TwoAddressInstructionPass via insertPass and
// then delegates to the generic TargetPassConfig implementation.
void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
// FIXME: We have to disable the verifier here because of PHIElimination +
// TwoAddressInstructions disabling it.
insertPass(&TwoAddressInstructionPassID, &SILowerControlFlowID, false);
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
// Register-allocation setup for the optimizing allocator: hooks
// SIFixControlFlowLiveIntervals and SILowerControlFlow into the pipeline via
// insertPass, then delegates to the generic TargetPassConfig implementation.
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
// This needs to be run directly before register allocation because earlier
// passes might recompute live intervals.
insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
// TODO: It might be better to run this right after phi elimination, but for
// now that would require not running the verifier.
insertPass(&RenameIndependentSubregsID, &SILowerControlFlowID);
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
@ -579,7 +585,7 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIInsertWaitsPass());
addPass(createSIShrinkInstructionsPass());
addPass(createSILowerControlFlowPass());
addPass(&SIInsertSkipsPassID);
addPass(createSIDebuggerInsertNopsPass());
}

View File

@ -67,6 +67,7 @@ add_llvm_target(AMDGPUCodeGen
SIFixSGPRCopies.cpp
SIFoldOperands.cpp
SIFrameLowering.cpp
SIInsertSkips.cpp
SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp

View File

@ -0,0 +1,330 @@
//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass inserts branches on the 0 exec mask over divergent
/// branches when it's expected that jumping over the untaken control flow will
/// be cheaper than having every workitem no-op through it.
//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/MC/MCAsmInfo.h"
using namespace llvm;
#define DEBUG_TYPE "si-insert-skips"
namespace {

/// Command-line threshold read by SIInsertSkips at the start of each run; its
/// meaning is given by the description string below.
static cl::opt<unsigned> SkipThresholdFlag(
    "amdgpu-skip-threshold",
    cl::desc("Number of instructions before jumping over divergent control flow"),
    cl::init(12), cl::Hidden);

/// Machine function pass that inserts exec-mask skip branches and lowers the
/// remaining late control-flow pseudos (kill terminators, SI_RETURN).
class SIInsertSkips : public MachineFunctionPass {
public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  /// No analyses are requested beyond what the base class declares.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  // Target hooks and the active threshold; all three are (re)assigned at the
  // top of runOnMachineFunction.
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;

  /// Decide whether the blocks laid out from \p From up to \p To are worth
  /// branching over.
  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  /// After a kill, possibly emit a null export + s_endpgm path for the case
  /// where no lanes remain. Returns true if that path was inserted.
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  /// Emit the exec-mask update that implements the kill pseudo \p MI.
  void kill(MachineInstr &MI);

  /// Create a new block placed directly after \p MBB and make it a successor.
  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  /// Possibly add an s_cbranch_execz after the mask branch \p MI.
  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
};

} // End anonymous namespace
// Storage for the pass identifier. INITIALIZE_PASS registers the pass under
// DEBUG_TYPE with the description string given below.
char SIInsertSkips::ID = 0;
INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
"SI insert s_cbranch_execz instructions", false, false)
// Reference to the pass ID exported in the llvm namespace for use outside this
// file.
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
/// Returns true when \p Opc is one of the target-independent pseudo opcodes
/// listed here; the skip heuristic does not count these as instructions.
static bool opcodeEmitsNoInsts(unsigned Opc) {
  return Opc == TargetOpcode::IMPLICIT_DEF ||
         Opc == TargetOpcode::KILL ||
         Opc == TargetOpcode::BUNDLE ||
         Opc == TargetOpcode::CFI_INSTRUCTION ||
         Opc == TargetOpcode::EH_LABEL ||
         Opc == TargetOpcode::GC_LABEL ||
         Opc == TargetOpcode::DBG_VALUE;
}
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
const MachineBasicBlock &To) const {
if (From.succ_empty())
return false;
unsigned NumInstr = 0;
const MachineFunction *MF = From.getParent();
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
MBBI != End && MBBI != ToI; ++MBBI) {
const MachineBasicBlock &MBB = *MBBI;
for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
NumInstr < SkipThreshold && I != E; ++I) {
if (opcodeEmitsNoInsts(I->getOpcode()))
continue;
// FIXME: Since this is required for correctness, this should be inserted
// during SILowerControlFlow.
// When a uniform loop is inside non-uniform control flow, the branch
// leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
// when EXEC = 0. We should skip the loop lest it becomes infinite.
if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
return true;
if (I->isInlineAsm()) {
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
const char *AsmStr = I->getOperand(0).getSymbolName();
// inlineasm length estimate is number of bytes assuming the longest
// instruction.
uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
NumInstr += MaxAsmSize / MAI->getMaxInstLength();
} else {
++NumInstr;
}
if (NumInstr >= SkipThreshold)
return true;
}
}
return false;
}
/// For AMDGPU_PS functions where shouldSkip() says the rest of the function is
/// worth skipping, appends an S_CBRANCH_EXECNZ to \p NextBB at the end of the
/// block containing \p MI and fills a new skip block with a null export
/// followed by S_ENDPGM.
///
/// \returns true if the skip block was created, false if nothing was done.
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &KillBB = *MI.getParent();
  MachineFunction *Func = KillBB.getParent();

  if (Func->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
    return false;
  if (!shouldSkip(KillBB, Func->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(KillBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&KillBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
      .addMBB(&NextBB);

  MachineBasicBlock::iterator InsertPt = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  MachineInstrBuilder NullExport =
      BuildMI(*SkipBB, InsertPt, DL, TII->get(AMDGPU::EXP))
          .addImm(0)
          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
          .addImm(0)
          .addImm(1)
          .addImm(1);
  // The four source operands are all undef VGPR0.
  for (unsigned SrcIdx = 0; SrcIdx != 4; ++SrcIdx)
    NullExport.addReg(AMDGPU::VGPR0, RegState::Undef);

  // ... and terminate wavefront.
  BuildMI(*SkipBB, InsertPt, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}
/// Emits, immediately before \p MI, the instruction that implements the kill:
/// a V_CMPX_LE_F32 against 0 for a non-immediate operand, an S_MOV_B64 of 0
/// into EXEC for an immediate with bit 31 set, and nothing for any other
/// immediate. \p MI itself is left in place for the caller to erase.
void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &ParentBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const MachineOperand &KillOp = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv =
      ParentBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative.
  if (!KillOp.isImm()) {
    BuildMI(ParentBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
        .addImm(0)
        .addOperand(KillOp);
    return;
  }

  // Constant operand: Set exec mask to 0 or do nothing
  const bool IsNegative = (KillOp.getImm() & 0x80000000) != 0;
  if (IsNegative) {
    BuildMI(ParentBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(0);
  }
}
/// Creates a fresh basic block, places it directly after \p MBB in the
/// function layout, and records it as a successor of \p MBB.
///
/// \param I not used by this implementation.
/// \returns the newly created block.
MachineBasicBlock *SIInsertSkips::insertSkipBlock(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction &Func = *MBB.getParent();

  MachineBasicBlock *NewBlock = Func.CreateMachineBasicBlock();

  // Layout position: the slot right after MBB.
  MachineFunction::iterator AfterMBB(MBB);
  Func.insert(std::next(AfterMBB), NewBlock);

  MBB.addSuccessor(NewBlock);
  return NewBlock;
}
/// Asks shouldSkip() about the region between the first successor of
/// \p SrcMBB and the mask branch's destination; if it is worth skipping, an
/// S_CBRANCH_EXECZ to that destination is inserted right after \p MI.
///
/// \returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *Target = MI.getOperand(0).getMBB();
  const MachineBasicBlock &FirstSucc = **SrcMBB.succ_begin();

  if (shouldSkip(FirstSucc, *Target)) {
    MachineBasicBlock::iterator AfterBranch = std::next(MI.getIterator());
    BuildMI(SrcMBB, AfterBranch, MI.getDebugLoc(),
            TII->get(AMDGPU::S_CBRANCH_EXECZ))
        .addMBB(Target);
    return true;
  }

  return false;
}
/// Walks every instruction of \p MF once, in layout order, and handles four
/// opcodes:
///  - SI_MASK_BRANCH: records the destination on ExecBranchStack and may add
///    an S_CBRANCH_EXECZ after it (skipMaskBranch).
///  - S_BRANCH: erased when it targets the layout successor.
///  - SI_KILL_TERMINATOR: lowered via kill(); when no mask branch is pending a
///    dead-wave exit may also be inserted (skipIfDead). The pseudo is erased.
///  - SI_RETURN: when it is not the final instruction of the function it is
///    replaced by an S_BRANCH to an empty block appended at the end.
///
/// \returns true if any instruction or block was added or removed.
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    // Computed up front because skipIfDead() may insert a block after BI.
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      // Cache the successor first: several cases erase the current
      // instruction.
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH: {
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;
      }
      case AMDGPU::S_BRANCH: {
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
          // Erasing an instruction is a modification and must be reported.
          MadeChange = true;
        }
        break;
      }
      case AMDGPU::SI_KILL_TERMINATOR: {
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (skipIfDead(MI, *NextBB)) {
            // A block was inserted after BI and a branch appended to MBB:
            // refresh the block iterators and stop scanning this block.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;
      }
      case AMDGPU::SI_RETURN: {
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN is not the last instruction. Add an empty block at
          // the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
              .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
          // The return was replaced by a branch (and possibly a new block).
          MadeChange = true;
        }
        // Explicit break: do not rely on falling through into default.
        break;
      }
      default:
        break;
      }
    }
  }

  return MadeChange;
}

View File

@ -1807,6 +1807,7 @@ def SI_MASK_BRANCH : PseudoInstSI <
let isTerminator = 1;
let isBarrier = 0;
let SALU = 1;
let Uses = [EXEC];
}
let Uses = [EXEC], Defs = [EXEC, SCC] in {

View File

@ -58,8 +58,6 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/MC/MCAsmInfo.h"
using namespace llvm;
@ -67,46 +65,41 @@ using namespace llvm;
namespace {
static cl::opt<unsigned> SkipThresholdFlag(
"amdgpu-skip-threshold",
cl::desc("Number of instructions before jumping over divergent control flow"),
cl::init(12), cl::Hidden);
class SILowerControlFlow : public MachineFunctionPass {
private:
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
unsigned SkipThreshold;
LiveIntervals *LIS;
bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);
void emitBreak(MachineInstr &MI);
void emitIfBreak(MachineInstr &MI);
void emitElseBreak(MachineInstr &MI);
void emitLoop(MachineInstr &MI);
void emitEndCf(MachineInstr &MI);
MachineInstr *Skip(MachineInstr &From, MachineOperand &To);
bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
void If(MachineInstr &MI);
void Else(MachineInstr &MI);
void Break(MachineInstr &MI);
void IfBreak(MachineInstr &MI);
void ElseBreak(MachineInstr &MI);
void Loop(MachineInstr &MI);
void EndCf(MachineInstr &MI);
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
public:
static char ID;
SILowerControlFlow() :
MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
MachineFunctionPass(ID),
TRI(nullptr),
TII(nullptr),
LIS(nullptr) {}
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
return "SI Lower control flow pseudo instructions";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addPreserved<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // End anonymous namespace
@ -114,403 +107,236 @@ public:
char SILowerControlFlow::ID = 0;
INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
"SI lower control flow", false, false)
"SI lower control flow", false, false)
char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
FunctionPass *llvm::createSILowerControlFlowPass() {
return new SILowerControlFlow();
}
static bool opcodeEmitsNoInsts(unsigned Opc) {
switch (Opc) {
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
case TargetOpcode::BUNDLE:
case TargetOpcode::CFI_INSTRUCTION:
case TargetOpcode::EH_LABEL:
case TargetOpcode::GC_LABEL:
case TargetOpcode::DBG_VALUE:
return true;
default:
return false;
}
}
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
MachineBasicBlock *To) {
if (From->succ_empty())
return false;
unsigned NumInstr = 0;
MachineFunction *MF = From->getParent();
for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
MBBI != End && MBBI != ToI; ++MBBI) {
MachineBasicBlock &MBB = *MBBI;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
NumInstr < SkipThreshold && I != E; ++I) {
if (opcodeEmitsNoInsts(I->getOpcode()))
continue;
// When a uniform loop is inside non-uniform control flow, the branch
// leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
// when EXEC = 0. We should skip the loop lest it becomes infinite.
if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
return true;
if (I->isInlineAsm()) {
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
const char *AsmStr = I->getOperand(0).getSymbolName();
// inlineasm length estimate is number of bytes assuming the longest
// instruction.
uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
NumInstr += MaxAsmSize / MAI->getMaxInstLength();
} else {
++NumInstr;
}
if (NumInstr >= SkipThreshold)
return true;
}
}
return false;
}
MachineInstr *SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
return nullptr;
const DebugLoc &DL = From.getDebugLoc();
MachineInstr *Skip =
BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
.addOperand(To);
return Skip;
}
bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction *MF = MBB.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
!shouldSkip(&MBB, &MBB.getParent()->back()))
return false;
MachineOperand &SaveExec = MI.getOperand(0);
MachineOperand &Cond = MI.getOperand(1);
assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
Cond.getSubReg() == AMDGPU::NoSubRegister);
MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
MBB.addSuccessor(SkipBB);
unsigned SaveExecReg = SaveExec.getReg();
MachineInstr *AndSaveExec =
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExecReg)
.addOperand(Cond);
MachineInstr *Xor =
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
.addReg(AMDGPU::EXEC)
.addReg(SaveExecReg);
// Insert a pseudo terminator to help keep the verifier happy. This will also
// be used later when inserting skips.
MachineInstr *NewBr =
BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
.addOperand(MI.getOperand(2))
.addReg(SaveExecReg, getKillRegState(SaveExec.isKill()));
if (!LIS) {
MI.eraseFromParent();
return;
}
LIS->ReplaceMachineInstrInMaps(MI, *AndSaveExec);
LIS->InsertMachineInstrInMaps(*Xor);
LIS->InsertMachineInstrInMaps(*NewBr);
MI.eraseFromParent();
// FIXME: Is there a better way of adjusting the liveness? It shouldn't be
// hard to add another def here but I'm not sure how to correctly update the
// valno.
LIS->removeInterval(SaveExecReg);
LIS->createAndComputeVirtRegInterval(SaveExecReg);
}
void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
// If the exec mask is non-zero, skip the next two instructions
BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addMBB(&NextBB);
unsigned DstReg = MI.getOperand(0).getReg();
assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
MachineBasicBlock::iterator Insert = SkipBB->begin();
bool ExecModified = MI.getOperand(3).getImm() != 0;
MachineBasicBlock::iterator Start = MBB.begin();
// Exec mask is zero: Export to NULL target...
BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
.addImm(0)
.addImm(0x09) // V_008DFC_SQ_EXP_NULL
.addImm(0)
.addImm(1)
.addImm(1)
.addReg(AMDGPU::VGPR0, RegState::Undef)
.addReg(AMDGPU::VGPR0, RegState::Undef)
.addReg(AMDGPU::VGPR0, RegState::Undef)
.addReg(AMDGPU::VGPR0, RegState::Undef);
// This must be inserted before phis and any spill code inserted before the
// else.
MachineInstr *OrSaveExec =
BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), DstReg)
.addOperand(MI.getOperand(1)); // Saved EXEC
MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
// ... and terminate wavefront.
BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
MachineBasicBlock::iterator ElsePt(MI);
return true;
}
if (ExecModified) {
MachineInstr *And =
BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
.addReg(AMDGPU::EXEC)
.addReg(DstReg);
void SILowerControlFlow::If(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
unsigned Vcc = MI.getOperand(1).getReg();
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
.addReg(Vcc);
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
.addReg(AMDGPU::EXEC)
.addReg(Reg);
MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
// Insert before the new branch instruction.
MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
// Insert a pseudo terminator to help keep the verifier happy.
BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
.addOperand(MI.getOperand(2))
.addReg(Reg);
MI.eraseFromParent();
}
void SILowerControlFlow::Else(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Src = MI.getOperand(1).getReg();
BuildMI(MBB, MBB.getFirstNonPHI(), DL,
TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
.addReg(Src); // Saved EXEC
if (MI.getOperand(3).getImm() != 0) {
// Adjust the saved exec to account for the modifications during the flow
// block that contains the ELSE. This can happen when WQM mode is switched
// off.
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
if (LIS)
LIS->InsertMachineInstrInMaps(*And);
}
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
// Insert before the new branch instruction.
MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
MachineInstr *Xor =
BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(DstReg);
MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
// Insert a pseudo terminator to help keep the verifier happy.
BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
.addOperand(MI.getOperand(2))
.addReg(Dst);
MachineInstr *Branch =
BuildMI(MBB, Term, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
.addMBB(DestBB)
.addReg(DstReg);
if (!LIS) {
MI.eraseFromParent();
return;
}
LIS->RemoveMachineInstrFromMaps(MI);
MI.eraseFromParent();
LIS->InsertMachineInstrInMaps(*OrSaveExec);
LIS->InsertMachineInstrInMaps(*Xor);
LIS->InsertMachineInstrInMaps(*Branch);
// src reg is tied to dst reg.
LIS->removeInterval(DstReg);
LIS->createAndComputeVirtRegInterval(DstReg);
// Let this be recomputed.
LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
}
void SILowerControlFlow::Break(MachineInstr &MI) {
void SILowerControlFlow::emitBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Src = MI.getOperand(1).getReg();
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(AMDGPU::EXEC)
.addReg(Src);
MI.eraseFromParent();
}
void SILowerControlFlow::IfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Vcc = MI.getOperand(1).getReg();
unsigned Src = MI.getOperand(2).getReg();
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(Vcc)
.addReg(Src);
MI.eraseFromParent();
}
void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Saved = MI.getOperand(1).getReg();
unsigned Src = MI.getOperand(2).getReg();
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(Saved)
.addReg(Src);
MI.eraseFromParent();
}
void SILowerControlFlow::Loop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Src = MI.getOperand(0).getReg();
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Src);
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
MachineInstr *Or =
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(AMDGPU::EXEC)
.addOperand(MI.getOperand(1));
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *Or);
MI.eraseFromParent();
}
void SILowerControlFlow::EndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
BuildMI(MBB, MBB.getFirstNonPHI(), DL,
TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Reg);
MI.eraseFromParent();
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
MI.setDesc(TII->get(AMDGPU::S_OR_B64));
}
void SILowerControlFlow::Branch(MachineInstr &MI) {
MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
if (MBB == MI.getParent()->getNextNode())
MI.eraseFromParent();
// If these aren't equal, this is probably an infinite loop.
void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
MI.setDesc(TII->get(AMDGPU::S_OR_B64));
}
void SILowerControlFlow::Kill(MachineInstr &MI) {
void SILowerControlFlow::emitLoop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Op = MI.getOperand(0);
const DebugLoc &DL = MI.getDebugLoc();
#ifndef NDEBUG
CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
// Kill is only allowed in pixel / geometry shaders.
assert(CallConv == CallingConv::AMDGPU_PS ||
CallConv == CallingConv::AMDGPU_GS);
#endif
MachineInstr *AndN2 =
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addOperand(MI.getOperand(0));
// Clear this thread from the exec mask if the operand is negative
if ((Op.isImm())) {
// Constant operand: Set exec mask to 0 or do nothing
if (Op.getImm() & 0x80000000) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
.addImm(0);
}
} else {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
.addImm(0)
.addOperand(Op);
MachineInstr *Branch =
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addOperand(MI.getOperand(1));
if (LIS) {
LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
LIS->InsertMachineInstrInMaps(*Branch);
}
MI.eraseFromParent();
}
MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
MachineFunction *MF = MBB.getParent();
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
MachineFunction::iterator MBBI(MBB);
++MBBI;
MachineBasicBlock::iterator InsPt = MBB.begin();
MachineInstr *NewMI =
BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addOperand(MI.getOperand(0));
MF->insert(MBBI, SkipBB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
return SkipBB;
MI.eraseFromParent();
if (LIS)
LIS->handleMove(*NewMI);
}
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
SkipThreshold = SkipThresholdFlag;
bool HaveKill = false;
unsigned Depth = 0;
// This doesn't actually need LiveIntervals, but we can preserve them.
LIS = getAnalysisIfAvailable<LiveIntervals>();
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; BI = NextBB) {
NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
MachineBasicBlock *EmptyMBBAtEnd = nullptr;
MachineBasicBlock::iterator I, Next;
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
default: break;
case AMDGPU::SI_IF:
++Depth;
If(MI);
break;
case AMDGPU::SI_IF:
emitIf(MI);
break;
case AMDGPU::SI_ELSE:
Else(MI);
break;
case AMDGPU::SI_ELSE:
emitElse(MI);
break;
case AMDGPU::SI_BREAK:
Break(MI);
break;
case AMDGPU::SI_BREAK:
emitBreak(MI);
break;
case AMDGPU::SI_IF_BREAK:
IfBreak(MI);
break;
case AMDGPU::SI_IF_BREAK:
emitIfBreak(MI);
break;
case AMDGPU::SI_ELSE_BREAK:
ElseBreak(MI);
break;
case AMDGPU::SI_ELSE_BREAK:
emitElseBreak(MI);
break;
case AMDGPU::SI_LOOP:
++Depth;
Loop(MI);
break;
case AMDGPU::SI_LOOP:
emitLoop(MI);
break;
case AMDGPU::SI_END_CF:
if (--Depth == 0 && HaveKill) {
HaveKill = false;
// TODO: Insert skip if exec is 0?
}
case AMDGPU::SI_END_CF:
emitEndCf(MI);
break;
EndCf(MI);
break;
case AMDGPU::SI_KILL_TERMINATOR:
if (Depth == 0) {
if (skipIfDead(MI, *NextBB)) {
NextBB = std::next(BI);
BE = MF.end();
}
} else
HaveKill = true;
Kill(MI);
break;
case AMDGPU::S_BRANCH:
Branch(MI);
break;
case AMDGPU::SI_RETURN: {
assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
// Graphics shaders returning non-void shouldn't contain S_ENDPGM,
// because external bytecode will be appended at the end.
if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
// SI_RETURN is not the last instruction. Add an empty block at
// the end and jump there.
if (!EmptyMBBAtEnd) {
EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
MF.insert(MF.end(), EmptyMBBAtEnd);
}
MBB.addSuccessor(EmptyMBBAtEnd);
BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
.addMBB(EmptyMBBAtEnd);
I->eraseFromParent();
}
break;
}
default:
break;
}
}
}
return true;
}

View File

@ -25,11 +25,13 @@ end:
}
; CHECK-LABEL: {{^}}else_execfix_leave_wqm:
; CHECK: ; BB#0:
; CHECK-NEXT: s_mov_b64 [[INIT_EXEC:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
; CHECK-NEXT: s_and_b64 exec, exec,
; CHECK-NEXT: s_and_b64 [[DST]], exec, [[DST]]
; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
; CHECK-NEXT: ; mask branch
define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) {
main_body:

View File

@ -2,11 +2,33 @@
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-LABEL: @test_if
; SI-LABEL: {{^}}test_if:
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions
; waitcnt should be inserted after exec modification
; SI: v_cmp_lt_i32_e32 vcc, 0,
; SI-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE]], exec, [[SAVE]]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
; SI: s_and_saveexec_b64
; SI-NEXT: s_xor_b64
; SI-NEXT: ; mask branch
; v_mov should be after exec modification
; SI: [[FLOW_BB]]:
; SI-NEXT: s_or_saveexec_b64 [[SAVE]], [[SAVE]]
; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
; SI-NEXT: s_xor_b64 exec, exec, [[SAVE]]
; SI-NEXT: ; mask branch
;
define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@ -17,12 +39,12 @@ entry:
case0:
%arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
store i32 0, i32 addrspace(1)* %arrayidx1, align 4
store i32 13, i32 addrspace(1)* %arrayidx1, align 4
br label %end
case1:
%arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
store i32 1, i32 addrspace(1)* %arrayidx5, align 4
store i32 17, i32 addrspace(1)* %arrayidx5, align 4
br label %end
default:
@ -31,11 +53,11 @@ default:
br i1 %cmp8, label %if, label %else
if:
store i32 2, i32 addrspace(1)* %arrayidx10, align 4
store i32 19, i32 addrspace(1)* %arrayidx10, align 4
br label %end
else:
store i32 3, i32 addrspace(1)* %arrayidx10, align 4
store i32 21, i32 addrspace(1)* %arrayidx10, align 4
br label %end
end:
@ -139,10 +161,11 @@ exit:
; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
; SI: [[LABEL_FLOW]]:
; SI: s_or_b64 exec, exec, [[ORNEG2]]
; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
; SI: s_cbranch_execnz [[LABEL_LOOP]]
; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
; SI-NEXT: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
; SI: BB#5
; SI: s_or_b64 exec, exec, [[COND_STATE]]