mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-08 21:10:35 +00:00
R600: Properly insert S_WAITCNT instructions
Some instructions like memory reads/writes are executed asynchronously, so we need to insert S_WAITCNT instructions to block before accessing their results. Previously we just inserted S_WAITCNT instructions after each async instruction; this patch fixes this and adds a proper insertion pass. Patch by: Christian König Tested-by: Michel Dänzer <michel.daenzer@amd.com> Reviewed-by: Tom Stellard <thomas.stellard@amd.com> Signed-off-by: Christian König <deathsimple@vodafone.de> git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@172846 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
935a91540b
commit
82d3d4524f
@ -30,6 +30,7 @@ FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
|
||||
FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
|
||||
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
|
||||
FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm);
|
||||
FunctionPass *createSIInsertWaits(TargetMachine &tm);
|
||||
|
||||
// Passes common to R600 and SI
|
||||
Pass *createAMDGPUStructurizeCFGPass();
|
||||
|
@ -116,6 +116,11 @@ bool AMDGPUPassConfig::addPreRegAlloc() {
|
||||
}
|
||||
|
||||
bool AMDGPUPassConfig::addPostRegAlloc() {
|
||||
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
|
||||
|
||||
if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
|
||||
addPass(createSIInsertWaits(*TM));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -40,6 +40,7 @@ add_llvm_target(R600CodeGen
|
||||
R600RegisterInfo.cpp
|
||||
SIAnnotateControlFlow.cpp
|
||||
SIAssignInterpRegs.cpp
|
||||
SIInsertWaits.cpp
|
||||
SIInstrInfo.cpp
|
||||
SIISelLowering.cpp
|
||||
SILowerLiteralConstants.cpp
|
||||
|
@ -66,11 +66,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
|
||||
MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
|
||||
MachineBasicBlock::iterator I = MI;
|
||||
|
||||
if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) {
|
||||
AppendS_WAITCNT(MI, *BB, llvm::next(I));
|
||||
return BB;
|
||||
}
|
||||
|
||||
switch (MI->getOpcode()) {
|
||||
default:
|
||||
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
|
||||
@ -141,13 +136,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
|
||||
return BB;
|
||||
}
|
||||
|
||||
void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
|
||||
MachineBasicBlock::iterator I) const {
|
||||
BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WAITCNT))
|
||||
.addImm(0);
|
||||
}
|
||||
|
||||
|
||||
void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
|
||||
MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
|
||||
BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
|
||||
|
@ -23,11 +23,6 @@ namespace llvm {
|
||||
class SITargetLowering : public AMDGPUTargetLowering {
|
||||
const SIInstrInfo * TII;
|
||||
|
||||
/// Memory reads and writes are synchronized using the S_WAITCNT instruction.
|
||||
/// This function takes the most conservative approach and inserts an
|
||||
/// S_WAITCNT instruction after every read and write.
|
||||
void AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
|
||||
MachineBasicBlock::iterator I) const;
|
||||
void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,
|
||||
MachineBasicBlock::iterator I, unsigned Opocde) const;
|
||||
void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
|
||||
|
353
lib/Target/R600/SIInsertWaits.cpp
Normal file
353
lib/Target/R600/SIInsertWaits.cpp
Normal file
@ -0,0 +1,353 @@
|
||||
//===-- SIInsertWaits.cpp - Insert S_WAITCNT instructions -----------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file
|
||||
/// \brief Insert wait instructions for memory reads and writes.
|
||||
///
|
||||
/// Memory reads and writes are issued asynchronously, so we need to insert
|
||||
/// S_WAITCNT instructions when we want to access any of their results or
|
||||
/// overwrite any register that's used asynchronously.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "SIInstrInfo.h"
|
||||
#include "SIMachineFunctionInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
|
||||
/// \brief One variable for each of the hardware counters.
///
/// The three values can be addressed by name (Named.VM, Named.EXP,
/// Named.LGKM) or by position (Array[0], Array[1], Array[2], same order) so
/// that code can loop over all counters.
union Counters {
  struct {
    unsigned VM;
    unsigned EXP;
    unsigned LGKM;
  } Named;

  unsigned Array[3];
};

/// \brief Table of counter snapshots, one entry per register slot.
typedef Counters RegCounters[512];

/// \brief Half-open range [first, second) of register slots.
typedef std::pair<unsigned, unsigned> RegInterval;
|
||||
|
||||
class SIInsertWaits : public MachineFunctionPass {
|
||||
|
||||
private:
|
||||
static char ID;
|
||||
const SIInstrInfo *TII;
|
||||
const SIRegisterInfo &TRI;
|
||||
const MachineRegisterInfo *MRI;
|
||||
|
||||
/// \brief Constant hardware limits
|
||||
static const Counters WaitCounts;
|
||||
|
||||
/// \brief Constant zero value
|
||||
static const Counters ZeroCounts;
|
||||
|
||||
/// \brief Counter values we have already waited on.
|
||||
Counters WaitedOn;
|
||||
|
||||
/// \brief Counter values for last instruction issued.
|
||||
Counters LastIssued;
|
||||
|
||||
/// \brief Registers used by async instructions.
|
||||
RegCounters UsedRegs;
|
||||
|
||||
/// \brief Registers defined by async instructions.
|
||||
RegCounters DefinedRegs;
|
||||
|
||||
/// \brief Different export instruction types seen since last wait.
|
||||
unsigned ExpInstrTypesSeen;
|
||||
|
||||
/// \brief Get increment/decrement amount for this instruction.
|
||||
Counters getHwCounts(MachineInstr &MI);
|
||||
|
||||
/// \brief Is operand relevant for async execution?
|
||||
bool isOpRelevant(MachineOperand &Op);
|
||||
|
||||
/// \brief Get register interval an operand affects.
|
||||
RegInterval getRegInterval(MachineOperand &Op);
|
||||
|
||||
/// \brief Handle instructions async components
|
||||
void pushInstruction(MachineInstr &MI);
|
||||
|
||||
/// \brief Insert the actual wait instruction
|
||||
bool insertWait(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator I,
|
||||
const Counters &Counts);
|
||||
|
||||
/// \brief Resolve all operand dependencies to counter requirements
|
||||
Counters handleOperands(MachineInstr &MI);
|
||||
|
||||
public:
|
||||
SIInsertWaits(TargetMachine &tm) :
|
||||
MachineFunctionPass(ID),
|
||||
TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
|
||||
TRI(TII->getRegisterInfo()) { }
|
||||
|
||||
virtual bool runOnMachineFunction(MachineFunction &MF);
|
||||
|
||||
const char *getPassName() const {
|
||||
return "SI insert wait instructions";
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // End anonymous namespace
|
||||
|
||||
char SIInsertWaits::ID = 0;
|
||||
|
||||
const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
|
||||
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
|
||||
|
||||
/// Factory for the SI wait-insertion pass; the caller receives a newly
/// allocated pass object.
FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
  SIInsertWaits *Pass = new SIInsertWaits(tm);
  return Pass;
}
|
||||
|
||||
/// \brief Get increment/decrement amount for this instruction.
///
/// Reads the VM_CNT / EXP_CNT / LGKM_CNT bits from the instruction's TSFlags
/// and returns by how much each hardware counter is affected by \p MI:
///  - VM:   1 if VM_CNT is set, else 0.
///  - EXP:  1 if EXP_CNT is set and the instruction is an EXP or may store,
///          else 0.
///  - LGKM: 0 if LGKM_CNT is not set; otherwise 1, or 2 when the register
///          class of operand 0 reports a size larger than 4.
Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {

  uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
  Counters Result;

  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);

  // Only consider stores or EXP for EXP_CNT. Instructions that carry the
  // EXP_CNT flag but neither export nor store must not be counted.
  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
      (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));

  // LGKM may use larger values
  if (TSFlags & SIInstrFlags::LGKM_CNT) {

    MachineOperand &Op = MI.getOperand(0);
    assert(Op.isReg() && "First LGKM operand must be a register!");

    unsigned Reg = Op.getReg();
    unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
    Result.Named.LGKM = Size > 4 ? 2 : 1;

  } else {
    Result.Named.LGKM = 0;
  }

  return Result;
}
|
||||
|
||||
/// \brief Is operand relevant for async execution?
///
/// Non-register operands are never relevant and register definitions always
/// are. A register use is relevant when it belongs to an EXP instruction, or
/// when its instruction may store and the operand is identical to that
/// instruction's first register use operand (which this code treats as the
/// stored value).
bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {

  // Constants can never be touched asynchronously.
  if (!Op.isReg())
    return false;

  // Every definition counts.
  if (Op.isDef())
    return true;

  // Exports read all of their registers asynchronously.
  MachineInstr &Parent = *Op.getParent();
  if (Parent.getOpcode() == AMDGPU::EXP)
    return true;

  // Apart from exports only stores have relevant uses.
  if (!Parent.getDesc().mayStore())
    return false;

  // Locate the first register use of the store and compare against it.
  MachineInstr::mop_iterator It = Parent.operands_begin();
  const MachineInstr::mop_iterator End = Parent.operands_end();
  while (It != End && !(It->isReg() && It->isUse()))
    ++It;

  return It != End && Op.isIdenticalTo(*It);
}
|
||||
|
||||
/// \brief Get register interval an operand affects.
///
/// Returns the half-open slot range [first, second) covered by \p Op: it
/// starts at the register's encoding value and spans Size / 4 slots, where
/// Size is what the minimal physical register class reports. Non-register
/// operands yield the empty interval (0, 0).
RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {

  if (!Op.isReg())
    return RegInterval(0, 0);

  const unsigned PhysReg = Op.getReg();
  const unsigned RegSize = TRI.getMinimalPhysRegClass(PhysReg)->getSize();

  // Anything smaller would cover zero slots.
  assert(RegSize >= 4);

  const unsigned FirstSlot = TRI.getEncodingValue(PhysReg);
  return RegInterval(FirstSlot, FirstSlot + RegSize / 4);
}
|
||||
|
||||
/// \brief Handle instructions async components
///
/// Adds the counter increments of \p MI to LastIssued. If the instruction
/// increments at least one counter, the kind of export is recorded in
/// ExpInstrTypesSeen and every relevant register slot is stamped with the new
/// LastIssued value (DefinedRegs for definitions, UsedRegs for uses).
void SIInsertWaits::pushInstruction(MachineInstr &MI) {

  // Accumulate the hardware counter increments of this instruction.
  const Counters Increment = getHwCounts(MI);
  unsigned Total = 0;

  for (unsigned Idx = 0; Idx != 3; ++Idx) {
    Total += Increment.Array[Idx];
    LastIssued.Array[Idx] += Increment.Array[Idx];
  }

  // Instructions that bump no counter need no further bookkeeping.
  if (Total == 0)
    return;

  // Track which flavours of EXP_CNT users were issued:
  // bit 0 for EXP itself, bit 1 for everything else.
  if (Increment.Named.EXP) {
    const unsigned TypeBit = (MI.getOpcode() == AMDGPU::EXP) ? 1 : 2;
    ExpInstrTypesSeen |= TypeBit;
  }

  for (unsigned OpIdx = 0, NumOps = MI.getNumOperands();
       OpIdx != NumOps; ++OpIdx) {

    MachineOperand &Op = MI.getOperand(OpIdx);
    if (!isOpRelevant(Op))
      continue;

    // isOpRelevant() only accepts register operands, so both queries are
    // valid here.
    const bool IsDef = Op.isDef();
    const bool IsUse = Op.isUse();

    const RegInterval Interval = getRegInterval(Op);
    for (unsigned Slot = Interval.first; Slot < Interval.second; ++Slot) {

      // Stamp the slots written asynchronously ...
      if (IsDef)
        DefinedRegs[Slot] = LastIssued;

      // ... and the ones read asynchronously.
      if (IsUse)
        UsedRegs[Slot] = LastIssued;
    }
  }
}
|
||||
|
||||
/// \brief Insert the actual wait instruction
///
/// Emits an S_WAITCNT in front of \p I when at least one entry of
/// \p Required lies beyond what WaitedOn already covers, and updates WaitedOn
/// accordingly. No wait is emitted when \p I is an S_ENDPGM.
///
/// \returns true if an instruction was inserted.
bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const Counters &Required) {

  // End of program? No need to wait on anything.
  const bool AtEndPgm = I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM;
  if (AtEndPgm)
    return false;

  // Whether the async instructions behind each counter are treated as
  // executing in order:
  //   [0] VM_CNT   - always ordered
  //   [1] EXP_CNT  - depends on the export instruction types seen
  //   [2] LGKM_CNT - handled as always unordered. TODO: Handle LDS and GDS
  // NOTE(review): the original comment for EXP_CNT reads "unordered if we
  // have both EXP & VM-writes", yet the expression below yields *ordered*
  // exactly when both types were seen (== 3) - confirm the intended polarity.
  const bool Ordered[3] = { true, ExpInstrTypesSeen == 3, false };

  // The values that go into the S_WAITCNT instruction; counters that need no
  // wait keep the hardware limit.
  Counters Counts = WaitCounts;

  // Do we really need to wait?
  bool NeedWait = false;

  for (unsigned Idx = 0; Idx != 3; ++Idx) {

    // Already waited far enough for this counter.
    if (WaitedOn.Array[Idx] >= Required.Array[Idx])
      continue;

    NeedWait = true;

    // Unordered counters must drain completely; ordered ones may leave the
    // operations issued after the required one in flight, limited to what
    // the hardware field can express.
    unsigned Allowed = 0;
    if (Ordered[Idx]) {
      const unsigned Outstanding = LastIssued.Array[Idx] - Required.Array[Idx];
      Allowed = std::min(Outstanding, WaitCounts.Array[Idx]);
    }
    Counts.Array[Idx] = Allowed;

    // Remember what we have waited on.
    WaitedOn.Array[Idx] = LastIssued.Array[Idx] - Allowed;
  }

  if (!NeedWait)
    return false;

  // Waiting for EXP_CNT to reach zero resets the seen instruction types.
  if (Counts.Named.EXP == 0)
    ExpInstrTypesSeen = 0;

  // Pack the three fields into the immediate and build the instruction.
  const unsigned Imm = (Counts.Named.VM & 0xF) |
                       ((Counts.Named.EXP & 0x7) << 4) |
                       ((Counts.Named.LGKM & 0x7) << 8);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
          .addImm(Imm);

  return true;
}
|
||||
|
||||
/// \brief helper function for handleOperands
|
||||
static void increaseCounters(Counters &Dst, const Counters &Src) {
|
||||
|
||||
for (unsigned i = 0; i < 3; ++i)
|
||||
Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
|
||||
}
|
||||
|
||||
/// \brief Resolve all operand dependencies to counter requirements
///
/// For every register slot touched by \p MI the recorded snapshots are merged
/// into the result: a definition has to respect earlier asynchronous uses of
/// the slot (UsedRegs), a use has to respect earlier asynchronous definitions
/// (DefinedRegs). The result is the element-wise maximum of those snapshots,
/// starting from zero.
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {

  Counters Needed = ZeroCounts;

  for (unsigned OpIdx = 0, NumOps = MI.getNumOperands();
       OpIdx != NumOps; ++OpIdx) {

    MachineOperand &Op = MI.getOperand(OpIdx);

    // Non-register operands produce an empty interval, so the inner loop
    // (and its register-only queries) is skipped for them.
    const RegInterval Interval = getRegInterval(Op);
    for (unsigned Slot = Interval.first; Slot < Interval.second; ++Slot) {

      if (Op.isDef())
        increaseCounters(Needed, UsedRegs[Slot]);

      if (Op.isUse())
        increaseCounters(Needed, DefinedRegs[Slot]);
    }
  }

  return Needed;
}
|
||||
|
||||
/// \brief Insert the required S_WAITCNT instructions into \p MF.
///
/// All bookkeeping is reset, then every instruction of every basic block is
/// visited in order: first a wait is inserted in front of it if its operands
/// require one, then its own asynchronous effects are recorded. At the end of
/// each block a wait for everything issued so far is requested in front of
/// the first terminator.
///
/// \returns true if any instruction was inserted.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {

  bool Changes = false;

  MRI = &MF.getRegInfo();

  // Start every function from a clean slate. ExpInstrTypesSeen is reset
  // together with the counters: insertWait() reads it before any export has
  // necessarily been pushed, so it must not be left uninitialized or carry
  // over state from a previously processed function.
  WaitedOn = ZeroCounts;
  LastIssued = ZeroCounts;
  ExpInstrTypesSeen = 0;

  memset(&UsedRegs, 0, sizeof(UsedRegs));
  memset(&DefinedRegs, 0, sizeof(DefinedRegs));

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {

      // Wait for whatever this instruction depends on, then account for the
      // asynchronous work it issues itself.
      Changes |= insertWait(MBB, I, handleOperands(*I));
      pushInstruction(*I);
    }

    // Wait for everything at the end of the MBB
    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
  }

  return Changes;
}
|
@ -55,7 +55,9 @@ public:
|
||||
namespace SIInstrFlags {
|
||||
enum Flags {
|
||||
// First 4 bits are the instruction encoding
|
||||
NEED_WAIT = 1 << 4
|
||||
VM_CNT = 1 << 4,
|
||||
EXP_CNT = 1 << 5,
|
||||
LGKM_CNT = 1 << 6
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -42,11 +42,14 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
|
||||
AMDGPUInst<outs, ins, asm, pattern> {
|
||||
|
||||
field bits<4> EncodingType = 0;
|
||||
field bits<1> NeedWait = 0;
|
||||
field bits<1> VM_CNT = 0;
|
||||
field bits<1> EXP_CNT = 0;
|
||||
field bits<1> LGKM_CNT = 0;
|
||||
|
||||
let TSFlags{3-0} = EncodingType;
|
||||
let TSFlags{4} = NeedWait;
|
||||
|
||||
let TSFlags{4} = VM_CNT;
|
||||
let TSFlags{5} = EXP_CNT;
|
||||
let TSFlags{6} = LGKM_CNT;
|
||||
}
|
||||
|
||||
class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
|
||||
@ -140,8 +143,7 @@ def EXP : Enc64<
|
||||
let Inst{63-56} = VSRC3;
|
||||
let EncodingType = 0; //SIInstrEncodingType::EXP
|
||||
|
||||
let NeedWait = 1;
|
||||
let usesCustomInserter = 1;
|
||||
let EXP_CNT = 1;
|
||||
}
|
||||
|
||||
class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
||||
@ -174,11 +176,10 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
||||
let Inst{47-40} = VDATA;
|
||||
let Inst{52-48} = SRSRC;
|
||||
let Inst{57-53} = SSAMP;
|
||||
|
||||
let EncodingType = 2; //SIInstrEncodingType::MIMG
|
||||
|
||||
let NeedWait = 1;
|
||||
let usesCustomInserter = 1;
|
||||
let VM_CNT = 1;
|
||||
let EXP_CNT = 1;
|
||||
}
|
||||
|
||||
class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
||||
@ -215,8 +216,9 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
||||
let Inst{63-56} = SOFFSET;
|
||||
let EncodingType = 3; //SIInstrEncodingType::MTBUF
|
||||
|
||||
let NeedWait = 1;
|
||||
let usesCustomInserter = 1;
|
||||
let VM_CNT = 1;
|
||||
let EXP_CNT = 1;
|
||||
|
||||
let neverHasSideEffects = 1;
|
||||
}
|
||||
|
||||
@ -252,8 +254,9 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
||||
let Inst{63-56} = SOFFSET;
|
||||
let EncodingType = 4; //SIInstrEncodingType::MUBUF
|
||||
|
||||
let NeedWait = 1;
|
||||
let usesCustomInserter = 1;
|
||||
let VM_CNT = 1;
|
||||
let EXP_CNT = 1;
|
||||
|
||||
let neverHasSideEffects = 1;
|
||||
}
|
||||
|
||||
@ -276,8 +279,7 @@ class SMRD <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
||||
let Inst{31-27} = 0x18; //encoding
|
||||
let EncodingType = 5; //SIInstrEncodingType::SMRD
|
||||
|
||||
let NeedWait = 1;
|
||||
let usesCustomInserter = 1;
|
||||
let LGKM_CNT = 1;
|
||||
}
|
||||
|
||||
class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
|
||||
|
Loading…
Reference in New Issue
Block a user