mirror of https://github.com/RPCS3/llvm.git
AMDGPU: Split SILowerControlFlow into two pieces
Do most of the lowering in a pre-RA pass. Keep the skip jump insertion late, plus a few other things that require more work to move out.

One concern I have is that there may now be COPY instructions which do not have the necessary implicit exec uses if they will be lowered to v_mov_b32.

This has a positive effect on SGPR usage in shader-db.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@279464 91177308-0d34-0410-b5e6-96231b3b80d8
parent ce35dd29a5
commit 7517ed227a
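Condensed view of where the two halves now run, distilled from the GCNPassConfig hunks below. This is a sketch of the relevant overrides only, not the complete file:

// Pseudo control-flow lowering now runs before register allocation, while
// virtual registers and live intervals are still available; only the
// s_cbranch_execz skip insertion stays in the pre-emit phase.
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
  insertPass(&RenameIndependentSubregsID, &SILowerControlFlowID);
  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPreEmitPass() {
  // ...
  addPass(&SIInsertSkipsPassID); // replaces the old late SILowerControlFlow
  addPass(createSIDebuggerInsertNopsPass());
}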
lib/Target/AMDGPU/AMDGPU.h
@@ -39,7 +39,6 @@ FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
 FunctionPass *createSIWholeQuadModePass();
-FunctionPass *createSILowerControlFlowPass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIDebuggerInsertNopsPass();
@@ -69,8 +68,10 @@ void initializeSIWholeQuadModePass(PassRegistry &);
 extern char &SIWholeQuadModeID;
 
 void initializeSILowerControlFlowPass(PassRegistry &);
-extern char &SILowerControlFlowPassID;
+extern char &SILowerControlFlowID;
+
+void initializeSIInsertSkipsPass(PassRegistry &);
+extern char &SIInsertSkipsPassID;
 
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -80,6 +80,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIInsertWaitsPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
+  initializeSIInsertSkipsPass(*PR);
   initializeSIDebuggerInsertNopsPass(*PR);
 }
@@ -532,13 +533,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
 #endif
 
 void GCNPassConfig::addPreRegAlloc() {
-  // This needs to be run directly before register allocation because
-  // earlier passes might recompute live intervals.
-  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
-  if (getOptLevel() > CodeGenOpt::None) {
-    insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
-  }
-
   if (getOptLevel() > CodeGenOpt::None) {
     // Don't do this with no optimizations since it throws away debug info by
     // merging nonadjacent loads.
@@ -556,10 +550,22 @@ void GCNPassConfig::addPreRegAlloc() {
 }
 
 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+  // FIXME: We have to disable the verifier here because of PHIElimination +
+  // TwoAddressInstructions disabling it.
+  insertPass(&TwoAddressInstructionPassID, &SILowerControlFlowID, false);
+
   TargetPassConfig::addFastRegAlloc(RegAllocPass);
 }
 
 void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+  // This needs to be run directly before register allocation because earlier
+  // passes might recompute live intervals.
+  insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
+
+  // TODO: It might be better to run this right after phi elimination, but for
+  // now that would require not running the verifier.
+  insertPass(&RenameIndependentSubregsID, &SILowerControlFlowID);
+
   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
 }
@@ -579,7 +585,7 @@ void GCNPassConfig::addPreEmitPass() {
 
   addPass(createSIInsertWaitsPass());
   addPass(createSIShrinkInstructionsPass());
-  addPass(createSILowerControlFlowPass());
+  addPass(&SIInsertSkipsPassID);
   addPass(createSIDebuggerInsertNopsPass());
 }
lib/Target/AMDGPU/CMakeLists.txt
@@ -67,6 +67,7 @@ add_llvm_target(AMDGPUCodeGen
   SIFixSGPRCopies.cpp
   SIFoldOperands.cpp
   SIFrameLowering.cpp
+  SIInsertSkips.cpp
   SIInsertWaits.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
lib/Target/AMDGPU/SIInsertSkips.cpp (new file, 330 lines)
@@ -0,0 +1,330 @@
//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass inserts branches on the 0 exec mask over divergent
/// branches when it's expected that jumping over the untaken control flow
/// will be cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/MC/MCAsmInfo.h"

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"

namespace {

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;
  unsigned SkipThreshold;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void kill(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  if (From.succ_empty())
    return false;

  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // inlineasm length estimate is number of bytes assuming the longest
        // instruction.
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef);

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}

void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif
  // Clear this thread from the exec mask if the operand is negative.
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
      .addImm(0)
      .addOperand(Op);
  }
}

MachineBasicBlock *SIInsertSkips::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}

// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}

bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH: {
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;
      }
      case AMDGPU::S_BRANCH: {
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB()))
          MI.eraseFromParent();
        break;
      }
      case AMDGPU::SI_KILL_TERMINATOR: {
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (skipIfDead(MI, *NextBB)) {
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;
      }
      case AMDGPU::SI_RETURN: {
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN is not the last instruction. Add an empty block at
          // the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;
      }
      default:
        break;
      }
    }
  }

  return MadeChange;
}
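A note on the cost model in shouldSkip() above: the threshold is a plain count of instructions that would execute as no-ops, except that inline asm is costed by its worst-case size rather than counted as one instruction. A hedged sketch of that arithmetic (the byte counts below are illustrative, not taken from any particular target):

#include <cstdint>

// Mirrors the NumInstr += MaxAsmSize / MAI->getMaxInstLength() step above.
// Example: a 48-byte inline-asm blob on a target whose longest instruction
// is 8 bytes counts as 6 of the 12 instructions allowed by the default
// -amdgpu-skip-threshold, so it burns half the budget in one statement.
static unsigned inlineAsmCost(uint64_t maxAsmSizeInBytes,
                              unsigned maxInstLength) {
  return static_cast<unsigned>(maxAsmSizeInBytes / maxInstLength);
}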
lib/Target/AMDGPU/SIInstructions.td
@@ -1807,6 +1807,7 @@ def SI_MASK_BRANCH : PseudoInstSI <
   let isTerminator = 1;
   let isBarrier = 0;
   let SALU = 1;
   let Uses = [EXEC];
 }
 
+let Uses = [EXEC], Defs = [EXEC, SCC] in {
lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -58,8 +58,6 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/MC/MCAsmInfo.h"
 
 using namespace llvm;
@@ -67,46 +65,41 @@ using namespace llvm;
 
 namespace {
 
-static cl::opt<unsigned> SkipThresholdFlag(
-  "amdgpu-skip-threshold",
-  cl::desc("Number of instructions before jumping over divergent control flow"),
-  cl::init(12), cl::Hidden);
-
 class SILowerControlFlow : public MachineFunctionPass {
 private:
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
-  unsigned SkipThreshold;
+  LiveIntervals *LIS;
 
-  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
+  void emitIf(MachineInstr &MI);
+  void emitElse(MachineInstr &MI);
+  void emitBreak(MachineInstr &MI);
+  void emitIfBreak(MachineInstr &MI);
+  void emitElseBreak(MachineInstr &MI);
+  void emitLoop(MachineInstr &MI);
+  void emitEndCf(MachineInstr &MI);
 
-  MachineInstr *Skip(MachineInstr &From, MachineOperand &To);
-  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
-
-  void If(MachineInstr &MI);
-  void Else(MachineInstr &MI);
-  void Break(MachineInstr &MI);
-  void IfBreak(MachineInstr &MI);
-  void ElseBreak(MachineInstr &MI);
-  void Loop(MachineInstr &MI);
-  void EndCf(MachineInstr &MI);
-
-  void Kill(MachineInstr &MI);
-  void Branch(MachineInstr &MI);
-
-  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
 public:
   static char ID;
 
   SILowerControlFlow() :
-    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+    MachineFunctionPass(ID),
+    TRI(nullptr),
+    TII(nullptr),
+    LIS(nullptr) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   const char *getPassName() const override {
     return "SI Lower control flow pseudo instructions";
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
 
 } // End anonymous namespace
@@ -114,403 +107,236 @@ public:
 char SILowerControlFlow::ID = 0;
 
 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
-    "SI lower control flow", false, false)
+                "SI lower control flow", false, false)
 
-char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-
-FunctionPass *llvm::createSILowerControlFlowPass() {
-  return new SILowerControlFlow();
-}
-
-static bool opcodeEmitsNoInsts(unsigned Opc) {
-  switch (Opc) {
-  case TargetOpcode::IMPLICIT_DEF:
-  case TargetOpcode::KILL:
-  case TargetOpcode::BUNDLE:
-  case TargetOpcode::CFI_INSTRUCTION:
-  case TargetOpcode::EH_LABEL:
-  case TargetOpcode::GC_LABEL:
-  case TargetOpcode::DBG_VALUE:
-    return true;
-  default:
-    return false;
-  }
-}
-
-bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
-                                    MachineBasicBlock *To) {
-  if (From->succ_empty())
-    return false;
-
-  unsigned NumInstr = 0;
-  MachineFunction *MF = From->getParent();
-
-  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
-       MBBI != End && MBBI != ToI; ++MBBI) {
-    MachineBasicBlock &MBB = *MBBI;
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-         NumInstr < SkipThreshold && I != E; ++I) {
-      if (opcodeEmitsNoInsts(I->getOpcode()))
-        continue;
-
-      // When a uniform loop is inside non-uniform control flow, the branch
-      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
-      // when EXEC = 0. We should skip the loop lest it becomes infinite.
-      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
-          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
-        return true;
-
-      if (I->isInlineAsm()) {
-        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
-        const char *AsmStr = I->getOperand(0).getSymbolName();
-
-        // inlineasm length estimate is number of bytes assuming the longest
-        // instruction.
-        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
-        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
-      } else {
-        ++NumInstr;
-      }
-
-      if (NumInstr >= SkipThreshold)
-        return true;
-    }
-  }
-
-  return false;
-}
-
-MachineInstr *SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
-  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
-    return nullptr;
-
-  const DebugLoc &DL = From.getDebugLoc();
-  MachineInstr *Skip =
-    BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
-    .addOperand(To);
-  return Skip;
-}
-
-bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction *MF = MBB.getParent();
-
-  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
-      !shouldSkip(&MBB, &MBB.getParent()->back()))
-    return false;
-
-  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
-  MBB.addSuccessor(SkipBB);
-
-  const DebugLoc &DL = MI.getDebugLoc();
-
-  // If the exec mask is non-zero, skip the next two instructions
-  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addMBB(&NextBB);
-
-  MachineBasicBlock::iterator Insert = SkipBB->begin();
-
-  // Exec mask is zero: Export to NULL target...
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
-    .addImm(0)
-    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-    .addImm(0)
-    .addImm(1)
-    .addImm(1)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef);
-
-  // ... and terminate wavefront.
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
-
-  return true;
-}
-
-void SILowerControlFlow::If(MachineInstr &MI) {
+void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Reg = MI.getOperand(0).getReg();
-  unsigned Vcc = MI.getOperand(1).getReg();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
-    .addReg(Vcc);
+  MachineOperand &SaveExec = MI.getOperand(0);
+  MachineOperand &Cond = MI.getOperand(1);
+  assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
+         Cond.getSubReg() == AMDGPU::NoSubRegister);
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Reg);
+  unsigned SaveExecReg = SaveExec.getReg();
 
-  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
+  MachineInstr *AndSaveExec =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExecReg)
+    .addOperand(Cond);
 
-  // Insert before the new branch instruction.
-  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
+  MachineInstr *Xor =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+    .addReg(AMDGPU::EXEC)
+    .addReg(SaveExecReg);
 
-  // Insert a pseudo terminator to help keep the verifier happy.
-  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-    .addOperand(MI.getOperand(2))
-    .addReg(Reg);
+  // Insert a pseudo terminator to help keep the verifier happy. This will also
+  // be used later when inserting skips.
+  MachineInstr *NewBr =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addOperand(MI.getOperand(2))
+    .addReg(SaveExecReg, getKillRegState(SaveExec.isKill()));
 
-  MI.eraseFromParent();
-}
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
 
-void SILowerControlFlow::Else(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Src = MI.getOperand(1).getReg();
+  LIS->ReplaceMachineInstrInMaps(MI, *AndSaveExec);
+  LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*NewBr);
 
-  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
-          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
-    .addReg(Src); // Saved EXEC
+  MI.eraseFromParent();
 
-  if (MI.getOperand(3).getImm() != 0) {
-    // Adjust the saved exec to account for the modifications during the flow
-    // block that contains the ELSE. This can happen when WQM mode is switched
-    // off.
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
-      .addReg(AMDGPU::EXEC)
-      .addReg(Dst);
-  }
+  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
+  // hard to add another def here but I'm not sure how to correctly update the
+  // valno.
+  LIS->removeInterval(SaveExecReg);
+  LIS->createAndComputeVirtRegInterval(SaveExecReg);
+}
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Dst);
+void SILowerControlFlow::emitElse(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
 
-  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
+  unsigned DstReg = MI.getOperand(0).getReg();
+  assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
 
-  // Insert before the new branch instruction.
-  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
+  bool ExecModified = MI.getOperand(3).getImm() != 0;
+  MachineBasicBlock::iterator Start = MBB.begin();
 
-  // Insert a pseudo terminator to help keep the verifier happy.
-  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-    .addOperand(MI.getOperand(2))
-    .addReg(Dst);
+  // This must be inserted before phis and any spill code inserted before the
+  // else.
+  MachineInstr *OrSaveExec =
+    BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), DstReg)
+    .addOperand(MI.getOperand(1)); // Saved EXEC
+  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
 
-  MI.eraseFromParent();
-}
+  MachineBasicBlock::iterator ElsePt(MI);
+
+  if (ExecModified) {
+    MachineInstr *And =
+      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
+      .addReg(AMDGPU::EXEC)
+      .addReg(DstReg);
+
+    if (LIS)
+      LIS->InsertMachineInstrInMaps(*And);
+  }
+
+  MachineInstr *Xor =
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(DstReg);
+
+  MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
+  MachineInstr *Branch =
+    BuildMI(MBB, Term, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addMBB(DestBB)
+    .addReg(DstReg);
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  LIS->RemoveMachineInstrFromMaps(MI);
+  MI.eraseFromParent();
+
+  LIS->InsertMachineInstrInMaps(*OrSaveExec);
+
+  LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*Branch);
+
+  // src reg is tied to dst reg.
+  LIS->removeInterval(DstReg);
+  LIS->createAndComputeVirtRegInterval(DstReg);
+
+  // Let this be recomputed.
+  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+}
 
-void SILowerControlFlow::Break(MachineInstr &MI) {
+void SILowerControlFlow::emitBreak(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
+  const DebugLoc &DL = MI.getDebugLoc();
   unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Src = MI.getOperand(1).getReg();
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Src);
+  MachineInstr *Or =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(1));
 
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *Or);
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::IfBreak(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Vcc = MI.getOperand(1).getReg();
-  unsigned Src = MI.getOperand(2).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-    .addReg(Vcc)
-    .addReg(Src);
-
-  MI.eraseFromParent();
+void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
+  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
 }
 
-void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Saved = MI.getOperand(1).getReg();
-  unsigned Src = MI.getOperand(2).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-    .addReg(Saved)
-    .addReg(Src);
-
-  MI.eraseFromParent();
+void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
+  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
 }
 
-void SILowerControlFlow::Loop(MachineInstr &MI) {
+void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Src = MI.getOperand(0).getReg();
+  const DebugLoc &DL = MI.getDebugLoc();
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Src);
+  MachineInstr *AndN2 =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(0));
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addOperand(MI.getOperand(1));
+  MachineInstr *Branch =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addOperand(MI.getOperand(1));
+
+  if (LIS) {
+    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
+    LIS->InsertMachineInstrInMaps(*Branch);
+  }
 
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::EndCf(MachineInstr &MI) {
+void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Reg = MI.getOperand(0).getReg();
+  const DebugLoc &DL = MI.getDebugLoc();
 
-  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
-          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Reg);
+  MachineBasicBlock::iterator InsPt = MBB.begin();
+  MachineInstr *NewMI =
+    BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(0));
+
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
 
   MI.eraseFromParent();
-}
 
-void SILowerControlFlow::Branch(MachineInstr &MI) {
-  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
-  if (MBB == MI.getParent()->getNextNode())
-    MI.eraseFromParent();
-
-  // If these aren't equal, this is probably an infinite loop.
-}
-
-void SILowerControlFlow::Kill(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  const MachineOperand &Op = MI.getOperand(0);
-
-#ifndef NDEBUG
-  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
-  // Kill is only allowed in pixel / geometry shaders.
-  assert(CallConv == CallingConv::AMDGPU_PS ||
-         CallConv == CallingConv::AMDGPU_GS);
-#endif
-
-  // Clear this thread from the exec mask if the operand is negative
-  if ((Op.isImm())) {
-    // Constant operand: Set exec mask to 0 or do nothing
-    if (Op.getImm() & 0x80000000) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-        .addImm(0);
-    }
-  } else {
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
-      .addImm(0)
-      .addOperand(Op);
-  }
-
-  MI.eraseFromParent();
-}
-
-MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
-  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
-  MachineFunction *MF = MBB.getParent();
-
-  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
-
-  MF->insert(MBBI, SkipBB);
-
-  return SkipBB;
+  if (LIS)
+    LIS->handleMove(*NewMI);
 }
 
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
-  SkipThreshold = SkipThresholdFlag;
 
-  bool HaveKill = false;
-  unsigned Depth = 0;
+  // This doesn't actually need LiveIntervals, but we can preserve them.
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
 
   MachineFunction::iterator NextBB;
-
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
 
-    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
     MachineBasicBlock::iterator I, Next;
-
     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
-
       MachineInstr &MI = *I;
 
       switch (MI.getOpcode()) {
-      default: break;
-      case AMDGPU::SI_IF:
-        ++Depth;
-        If(MI);
+      case AMDGPU::SI_IF:
+        emitIf(MI);
         break;
 
       case AMDGPU::SI_ELSE:
-        Else(MI);
+        emitElse(MI);
         break;
 
       case AMDGPU::SI_BREAK:
-        Break(MI);
+        emitBreak(MI);
        break;
 
       case AMDGPU::SI_IF_BREAK:
-        IfBreak(MI);
+        emitIfBreak(MI);
        break;
 
       case AMDGPU::SI_ELSE_BREAK:
-        ElseBreak(MI);
+        emitElseBreak(MI);
        break;
 
       case AMDGPU::SI_LOOP:
-        ++Depth;
-        Loop(MI);
+        emitLoop(MI);
        break;
 
       case AMDGPU::SI_END_CF:
-        if (--Depth == 0 && HaveKill) {
-          HaveKill = false;
-          // TODO: Insert skip if exec is 0?
-        }
-
-        EndCf(MI);
+        emitEndCf(MI);
         break;
 
-      case AMDGPU::SI_KILL_TERMINATOR:
-        if (Depth == 0) {
-          if (skipIfDead(MI, *NextBB)) {
-            NextBB = std::next(BI);
-            BE = MF.end();
-          }
-        } else
-          HaveKill = true;
-        Kill(MI);
-        break;
-
-      case AMDGPU::S_BRANCH:
-        Branch(MI);
-        break;
-
-      case AMDGPU::SI_RETURN: {
-        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
-        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
-        // because external bytecode will be appended at the end.
-        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
-          // SI_RETURN is not the last instruction. Add an empty block at
-          // the end and jump there.
-          if (!EmptyMBBAtEnd) {
-            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
-            MF.insert(MF.end(), EmptyMBBAtEnd);
-          }
-
-          MBB.addSuccessor(EmptyMBBAtEnd);
-          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
-            .addMBB(EmptyMBBAtEnd);
-          I->eraseFromParent();
-        }
-        break;
-      }
+      default:
+        break;
       }
     }
   }
 
   return true;
 }
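Since the pass now runs while LiveIntervals may be alive, every rewrite above has to keep the analysis consistent by hand. A minimal sketch of the update idiom, condensed from emitIf() above (not standalone code; LIS, MI, AndSaveExec, Xor and SaveExecReg are as in that function):

// Swap the erased pseudo for its first replacement in the slot-index maps,
// register the brand-new instructions, then recompute the rewritten vreg's
// interval from scratch instead of patching value numbers manually.
LIS->ReplaceMachineInstrInMaps(MI, *AndSaveExec);
LIS->InsertMachineInstrInMaps(*Xor);
MI.eraseFromParent();
LIS->removeInterval(SaveExecReg);
LIS->createAndComputeVirtRegInterval(SaveExecReg);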
test/CodeGen/AMDGPU/else.ll
@@ -25,11 +25,13 @@ end:
 }
 
 ; CHECK-LABEL: {{^}}else_execfix_leave_wqm:
+; CHECK: ; BB#0:
+; CHECK-NEXT: s_mov_b64 [[INIT_EXEC:s\[[0-9]+:[0-9]+\]]], exec
 ; CHECK: ; %Flow
 ; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
-; CHECK-NEXT: s_and_b64 exec, exec,
-; CHECK-NEXT: s_and_b64 [[DST]], exec, [[DST]]
-; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
+; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
+; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
+; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
 ; CHECK-NEXT: ; mask branch
 define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) {
 main_body:
test/CodeGen/AMDGPU/valu-i1.ll
@@ -2,11 +2,33 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
-; SI-LABEL: @test_if
+; SI-LABEL: {{^}}test_if:
 ; Make sure the i1 values created by the cfg structurizer pass are
 ; moved using VALU instructions
-; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
-; SI: v_mov_b32_e32 v{{[0-9]}}, -1
+
+; waitcnt should be inserted after exec modification
+; SI: v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
+; SI-NEXT: s_xor_b64 [[SAVE]], exec, [[SAVE]]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
+; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
+
+; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
+; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
+; SI: v_mov_b32_e32 v{{[0-9]}}, -1
+; SI: s_and_saveexec_b64
+; SI-NEXT: s_xor_b64
+; SI-NEXT: ; mask branch
+
+; v_mov should be after exec modification
+; SI: [[FLOW_BB]]:
+; SI-NEXT: s_or_saveexec_b64 [[SAVE]], [[SAVE]]
+; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
+; SI-NEXT: s_xor_b64 exec, exec, [[SAVE]]
+; SI-NEXT: ; mask branch
+;
 define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -17,12 +39,12 @@ entry:
 
 case0:
   %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
-  store i32 0, i32 addrspace(1)* %arrayidx1, align 4
+  store i32 13, i32 addrspace(1)* %arrayidx1, align 4
   br label %end
 
 case1:
   %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
-  store i32 1, i32 addrspace(1)* %arrayidx5, align 4
+  store i32 17, i32 addrspace(1)* %arrayidx5, align 4
   br label %end
 
 default:
@@ -31,11 +53,11 @@ default:
   br i1 %cmp8, label %if, label %else
 
 if:
-  store i32 2, i32 addrspace(1)* %arrayidx10, align 4
+  store i32 19, i32 addrspace(1)* %arrayidx10, align 4
   br label %end
 
 else:
-  store i32 3, i32 addrspace(1)* %arrayidx10, align 4
+  store i32 21, i32 addrspace(1)* %arrayidx10, align 4
   br label %end
 
 end:
@@ -139,10 +161,11 @@ exit:
 ; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
 
 ; SI: [[LABEL_FLOW]]:
-; SI: s_or_b64 exec, exec, [[ORNEG2]]
-; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
-; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
-; SI: s_cbranch_execnz [[LABEL_LOOP]]
+; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
+; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
+; SI-NEXT: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
 
 ; SI: BB#5
 ; SI: s_or_b64 exec, exec, [[COND_STATE]]