mirror of
https://github.com/RPCS3/llvm.git
synced 2025-04-04 06:12:18 +00:00
AMDGPU/SI: Handle wait states required for DPP instructions
Reviewers: arsenm Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D17543 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@263447 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
d26914da05
commit
f53246799f
@ -119,6 +119,18 @@ private:
|
|||||||
/// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
|
/// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
|
||||||
void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
|
void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
|
||||||
|
|
||||||
|
/// \param DPP The DPP instruction
|
||||||
|
/// \param SearchI The iterator to start looking for hazards.
|
||||||
|
/// \param SearchMBB The basic block we are operating on.
|
||||||
|
/// \param WaitStates The number of wait states that need to be inserted
|
||||||
|
/// when a hazard is detected.
|
||||||
|
void insertDPPWaitStates(MachineBasicBlock::iterator DPP,
|
||||||
|
MachineBasicBlock::reverse_iterator SearchI,
|
||||||
|
MachineBasicBlock *SearchMBB,
|
||||||
|
unsigned WaitStates);
|
||||||
|
|
||||||
|
void insertDPPWaitStates(MachineBasicBlock::iterator DPP);
|
||||||
|
|
||||||
/// Return true if there are LGKM instructions that haven't been waited on
|
/// Return true if there are LGKM instructions that haven't been waited on
|
||||||
/// yet.
|
/// yet.
|
||||||
bool hasOutstandingLGKM() const;
|
bool hasOutstandingLGKM() const;
|
||||||
@ -480,6 +492,45 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Scan backwards for a register definition that \p DPP reads and, if one is
/// found close enough, ask TII to insert the missing wait states before
/// \p DPP.
///
/// Each instruction visited consumes one unit of \p WaitStates, so the value
/// handed to TII->insertWaitStates() is the budget still remaining at the
/// point where the conflicting definition was found.
///
/// \param DPP The DPP instruction whose register reads are being checked.
/// \param SearchI Reverse iterator at which the backwards scan begins.
/// \param SearchMBB The basic block \p SearchI iterates over.
/// \param WaitStates Remaining number of instructions to look back over.
void SIInsertWaits::insertDPPWaitStates(MachineBasicBlock::iterator DPP,
                                        MachineBasicBlock::reverse_iterator SearchI,
                                        MachineBasicBlock *SearchMBB,
                                        unsigned WaitStates) {

  MachineBasicBlock::reverse_iterator BlockStart = SearchMBB->rend();

  while (WaitStates > 0) {

    if (SearchI == BlockStart) {
      // Ran off the top of this block with budget left over: continue the
      // scan in a predecessor. We only need to check fall-through blocks.
      // Branch instructions give us enough wait states.
      for (MachineBasicBlock *Pred : SearchMBB->predecessors()) {
        if (Pred->getFirstTerminator() != Pred->end())
          continue;

        // First predecessor without a terminator: resume there with the
        // remaining budget, then stop looking at further predecessors.
        insertDPPWaitStates(DPP, Pred->rbegin(), Pred, WaitStates);
        break;
      }
      return;
    }

    for (MachineOperand &Op : SearchI->operands()) {
      // Only register definitions can conflict with a read by DPP.
      const bool IsRegDef = Op.isReg() && Op.isDef();
      if (IsRegDef && DPP->readsRegister(Op.getReg(), TRI)) {
        TII->insertWaitStates(DPP, WaitStates);
        return;
      }
    }

    // No conflict at this instruction; step one further back.
    --WaitStates;
    ++SearchI;
  }
}
|
||||||
|
|
||||||
|
/// Check the instructions leading up to \p DPP for register writes that
/// \p DPP reads, using a look-back budget of two wait states, and let the
/// searching overload insert whatever wait states are missing.
///
/// \param DPP The DPP instruction to protect.
void SIInsertWaits::insertDPPWaitStates(MachineBasicBlock::iterator DPP) {
  // Look-back budget handed to the searching overload.
  const unsigned RequiredWaitStates = 2;

  // NOTE(review): presumably the reverse iterator built from DPP first yields
  // the instruction immediately preceding DPP - confirm against the
  // MachineBasicBlock iterator implementation in use.
  MachineBasicBlock::reverse_iterator SearchStart(DPP);
  MachineBasicBlock *ParentMBB = DPP->getParent();

  insertDPPWaitStates(DPP, SearchStart, ParentMBB, RequiredWaitStates);
}
|
||||||
|
|
||||||
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
|
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
|
||||||
// around other non-memory instructions.
|
// around other non-memory instructions.
|
||||||
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
|
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
|
||||||
@ -546,6 +597,10 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (TII->isDPP(*I)) {
|
||||||
|
insertDPPWaitStates(I);
|
||||||
|
}
|
||||||
|
|
||||||
// Wait for everything before a barrier.
|
// Wait for everything before a barrier.
|
||||||
if (I->getOpcode() == AMDGPU::S_BARRIER)
|
if (I->getOpcode() == AMDGPU::S_BARRIER)
|
||||||
Changes |= insertWait(MBB, I, LastIssued);
|
Changes |= insertWait(MBB, I, LastIssued);
|
||||||
|
@ -301,6 +301,14 @@ public:
|
|||||||
return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
|
return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool isDPP(const MachineInstr &MI) {
|
||||||
|
return MI.getDesc().TSFlags & SIInstrFlags::DPP;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isDPP(uint16_t Opcode) const {
|
||||||
|
return get(Opcode).TSFlags & SIInstrFlags::DPP;
|
||||||
|
}
|
||||||
|
|
||||||
bool isInlineConstant(const APInt &Imm) const;
|
bool isInlineConstant(const APInt &Imm) const;
|
||||||
bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
|
bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
|
||||||
bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
|
bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
|
||||||
|
@ -1,6 +1,10 @@
|
|||||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
|
||||||
|
|
||||||
|
; FIXME: The register allocator / scheduler should be able to avoid these hazards.
|
||||||
|
|
||||||
; VI-LABEL: {{^}}dpp_test:
|
; VI-LABEL: {{^}}dpp_test:
|
||||||
|
; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
|
||||||
|
; VI: s_nop 1
|
||||||
; VI: v_mov_b32_dpp v0, v0 quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
|
; VI: v_mov_b32_dpp v0, v0 quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
|
||||||
define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
|
define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
|
||||||
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
|
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
|
||||||
@ -8,6 +12,51 @@ define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; VI-LABEL: {{^}}dpp_wait_states:
|
||||||
|
; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
|
||||||
|
; VI: s_nop 1
|
||||||
|
; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||||
|
; VI: s_nop 1
|
||||||
|
; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||||
|
define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
  ; Two chained DPP moves: the second consumes the result of the first.
  %dpp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
  %dpp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %dpp0, i32 1, i32 1, i32 1, i1 1) #0
  store i32 %dpp1, i32 addrspace(1)* %out
  ret void
}
|
||||||
|
|
||||||
|
; VI-LABEL: {{^}}dpp_first_in_bb:
|
||||||
|
; VI: s_nop 1
|
||||||
|
; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||||
|
; VI: s_nop 1
|
||||||
|
; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||||
|
; VI: s_nop 1
|
||||||
|
; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||||
|
define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
  ; Branch on %cond == 0.0 so the DPP chain below sits in a merge block.
  %is_zero = fcmp oeq float %cond, 0.0
  br i1 %is_zero, label %if, label %else

if:
  %out_loaded = load float, float addrspace(1)* %out
  %if_sum = fadd float %a, %out_loaded
  br label %endif

else:
  %in_loaded = load float, float addrspace(1)* %in
  %else_sum = fadd float %b, %in_loaded
  br label %endif

endif:
  ; The merge block opens with a chain of three DPP moves fed by the phi
  ; (through a bitcast); each move consumes the previous one's result.
  %merged = phi float [%if_sum, %if], [%else_sum, %else]
  %merged_i32 = bitcast float %merged to i32
  %dpp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %merged_i32, i32 1, i32 1, i32 1, i1 1) #0
  %dpp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %dpp0, i32 1, i32 1, i32 1, i1 1) #0
  %dpp2 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %dpp1, i32 1, i32 1, i32 1, i1 1) #0
  %result = bitcast i32 %dpp2 to float
  store float %result, float addrspace(1)* %out
  ret void
}
|
||||||
|
|
||||||
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0
|
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0
|
||||||
|
|
||||||
attributes #0 = { nounwind readnone convergent }
|
attributes #0 = { nounwind readnone convergent }
|
||||||
|
Loading…
x
Reference in New Issue
Block a user