mirror of
https://github.com/RPCSX/llvm.git
synced 2024-11-24 20:29:53 +00:00
AMDGPU/SI: Add llvm.amdgcn.s.waitcnt.all intrinsic
Summary: So it appears that to guarantee some of the ordering requirements of a GLSL memoryBarrier() executed in the shader, we need to emit an s_waitcnt. (We can't use an s_barrier, because memoryBarrier() may appear anywhere in the shader, in particular it may appear in non-uniform control flow.) Reviewers: arsenm, mareko, tstellarAMD Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D19203 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@267729 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
d0229876a9
commit
0493c734a2
@ -68,6 +68,8 @@ defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz <
|
||||
def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
|
||||
Intrinsic<[], [], [IntrConvergent]>;
|
||||
|
||||
def int_amdgcn_s_waitcnt : Intrinsic<[], [llvm_i32_ty], []>;
|
||||
|
||||
def int_amdgcn_div_scale : Intrinsic<
|
||||
// 1st parameter: Numerator
|
||||
// 2nd parameter: Denominator
|
||||
|
@ -68,6 +68,10 @@ private:
|
||||
/// \brief Counter values we have already waited on.
|
||||
Counters WaitedOn;
|
||||
|
||||
/// \brief Counter values that we must wait on before the next counter
|
||||
/// increase.
|
||||
Counters DelayedWaitOn;
|
||||
|
||||
/// \brief Counter values for last instruction issued.
|
||||
Counters LastIssued;
|
||||
|
||||
@ -103,13 +107,17 @@ private:
|
||||
|
||||
/// \brief Handle an instruction's async components
|
||||
void pushInstruction(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator I);
|
||||
MachineBasicBlock::iterator I,
|
||||
const Counters& Increment);
|
||||
|
||||
/// \brief Insert the actual wait instruction
|
||||
bool insertWait(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator I,
|
||||
const Counters &Counts);
|
||||
|
||||
/// \brief Handle existing wait instructions (from intrinsics)
|
||||
void handleExistingWait(MachineBasicBlock::iterator I);
|
||||
|
||||
/// \brief Do we need def2def checks?
|
||||
bool unorderedDefines(MachineInstr &MI);
|
||||
|
||||
@ -287,10 +295,10 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
|
||||
}
|
||||
|
||||
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator I) {
|
||||
MachineBasicBlock::iterator I,
|
||||
const Counters &Increment) {
|
||||
|
||||
// Get the hardware counter increments and sum them up
|
||||
Counters Increment = getHwCounts(*I);
|
||||
Counters Limit = ZeroCounts;
|
||||
unsigned Sum = 0;
|
||||
|
||||
@ -430,16 +438,38 @@ static void increaseCounters(Counters &Dst, const Counters &Src) {
|
||||
Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
|
||||
}
|
||||
|
||||
/// \brief check whether any of the counters is non-zero
|
||||
static bool countersNonZero(const Counters &Counter) {
|
||||
for (unsigned i = 0; i < 3; ++i)
|
||||
if (Counter.Array[i])
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/// \brief Record the wait requested by an S_WAITCNT that is already present
/// in the instruction stream.
///
/// The immediate operand is split into its VM (bits 0-3), EXP (bits 4-6) and
/// LGKM (bits 8-11) fields. Each field is subtracted from the matching
/// LastIssued value, clamping at zero, and the result is merged into
/// DelayedWaitOn through increaseCounters().
void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
  assert(I->getOpcode() == AMDGPU::S_WAITCNT);

  const unsigned Encoded = I->getOperand(0).getImm();

  // Decode the per-counter fields of the immediate.
  Counters Requested;
  Requested.Named.VM = Encoded & 0xF;
  Requested.Named.EXP = (Encoded >> 4) & 0x7;
  Requested.Named.LGKM = (Encoded >> 8) & 0xF;

  // Translate each field into a counter value relative to LastIssued; a field
  // larger than what has been issued so far requests no wait at all.
  Counters Pending;
  for (unsigned Idx = 0; Idx < 3; ++Idx)
    Pending.Array[Idx] = Requested.Array[Idx] <= LastIssued.Array[Idx]
                             ? LastIssued.Array[Idx] - Requested.Array[Idx]
                             : 0;

  increaseCounters(DelayedWaitOn, Pending);
}
|
||||
|
||||
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
|
||||
|
||||
Counters Result = ZeroCounts;
|
||||
|
||||
// S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
|
||||
// but we also want to wait for any other outstanding transfers before
|
||||
// signalling other hardware blocks
|
||||
if (MI.getOpcode() == AMDGPU::S_SENDMSG)
|
||||
return LastIssued;
|
||||
|
||||
// For each register affected by this instruction increase the result
|
||||
// sequence.
|
||||
//
|
||||
@ -544,6 +574,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
|
||||
MRI = &MF.getRegInfo();
|
||||
|
||||
WaitedOn = ZeroCounts;
|
||||
DelayedWaitOn = ZeroCounts;
|
||||
LastIssued = ZeroCounts;
|
||||
LastOpcodeType = OTHER;
|
||||
LastInstWritesM0 = false;
|
||||
@ -552,6 +583,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
|
||||
memset(&UsedRegs, 0, sizeof(UsedRegs));
|
||||
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
|
||||
|
||||
SmallVector<MachineInstr *, 4> RemoveMI;
|
||||
|
||||
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
|
||||
BI != BE; ++BI) {
|
||||
|
||||
@ -607,13 +640,34 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
|
||||
I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32)
|
||||
TII->insertWaitStates(MBB, std::next(I), 4);
|
||||
|
||||
// Wait for everything before a barrier.
|
||||
if (I->getOpcode() == AMDGPU::S_BARRIER)
|
||||
Changes |= insertWait(MBB, I, LastIssued);
|
||||
else
|
||||
Changes |= insertWait(MBB, I, handleOperands(*I));
|
||||
// Record pre-existing, explicitly requested waits
|
||||
if (I->getOpcode() == AMDGPU::S_WAITCNT) {
|
||||
handleExistingWait(*I);
|
||||
RemoveMI.push_back(I);
|
||||
continue;
|
||||
}
|
||||
|
||||
pushInstruction(MBB, I);
|
||||
Counters Required;
|
||||
|
||||
// Wait for everything before a barrier.
|
||||
//
|
||||
// S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
|
||||
// but we also want to wait for any other outstanding transfers before
|
||||
// signalling other hardware blocks
|
||||
if (I->getOpcode() == AMDGPU::S_BARRIER ||
|
||||
I->getOpcode() == AMDGPU::S_SENDMSG)
|
||||
Required = LastIssued;
|
||||
else
|
||||
Required = handleOperands(*I);
|
||||
|
||||
Counters Increment = getHwCounts(*I);
|
||||
|
||||
if (countersNonZero(Required) || countersNonZero(Increment))
|
||||
increaseCounters(Required, DelayedWaitOn);
|
||||
|
||||
Changes |= insertWait(MBB, I, Required);
|
||||
|
||||
pushInstruction(MBB, I, Increment);
|
||||
handleSendMsg(MBB, I);
|
||||
}
|
||||
|
||||
@ -621,5 +675,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
|
||||
Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
|
||||
}
|
||||
|
||||
for (MachineInstr *I : RemoveMI)
|
||||
I->eraseFromParent();
|
||||
|
||||
return Changes;
|
||||
}
|
||||
|
@ -43,8 +43,9 @@ def SWaitMatchClass : AsmOperandClass {
|
||||
let ParserMethod = "parseSWaitCntOps";
|
||||
}
|
||||
|
||||
def WAIT_FLAG : InstFlag<"printWaitFlag"> {
|
||||
def WAIT_FLAG : Operand <i32> {
|
||||
let ParserMatchClass = SWaitMatchClass;
|
||||
let PrintMethod = "printWaitFlag";
|
||||
}
|
||||
|
||||
let SubtargetPredicate = isGCN in {
|
||||
@ -506,6 +507,7 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
|
||||
let isConvergent = 1;
|
||||
}
|
||||
|
||||
// s_waitcnt: SOPP opcode 0x0c taking a single WAIT_FLAG immediate ($simm16).
// NOTE(review): presumably flagged mayLoad/mayStore/hasSideEffects so that an
// explicitly requested wait is neither dropped nor reordered relative to
// memory operations -- confirm against the scheduler's expectations.
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
|
||||
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
|
||||
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
|
||||
|
||||
@ -2452,6 +2454,11 @@ def : Pat <
|
||||
// SOPP Patterns
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def : Pat <
|
||||
(int_amdgcn_s_waitcnt i32:$simm16),
|
||||
(S_WAITCNT (as_i16imm $simm16))
|
||||
>;
|
||||
|
||||
// FIXME: These should be removed eventually
|
||||
def : Pat <
|
||||
(int_AMDGPU_barrier_global),
|
||||
|
38
test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
Normal file
38
test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
Normal file
@ -0,0 +1,38 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
|
||||
|
||||
; CHECK-LABEL: {{^}}test1:
|
||||
; CHECK: image_store
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}}
|
||||
; CHECK-NEXT: image_store
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> %d1, i32 %c0, i32 %c1) {
|
||||
call void @llvm.amdgcn.image.store.i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
|
||||
call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00
|
||||
call void @llvm.amdgcn.image.store.i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test that the intrinsic is merged with automatically generated waits and
|
||||
; emitted as late as possible.
|
||||
;
|
||||
; CHECK-LABEL: {{^}}test2:
|
||||
; CHECK: image_load
|
||||
; CHECK-NOT: s_waitcnt vmcnt(0){{$}}
|
||||
; CHECK: s_waitcnt
|
||||
; CHECK-NEXT: image_store
|
||||
define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) {
|
||||
%t = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
|
||||
call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00
|
||||
%c.1 = mul i32 %c, 2
|
||||
call void @llvm.amdgcn.image.store.i32(<4 x float> %t, i32 %c.1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.s.waitcnt(i32) #0
|
||||
|
||||
declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1
|
||||
declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readonly }
|
Loading…
Reference in New Issue
Block a user