[AMDGPU] Optimize atomic max/min

Summary:
Extend the atomic optimizer to handle signed and unsigned max and min
operations, as well as add and subtract.
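The optimizer's strategy is unchanged: instead of every active lane of a wavefront issuing its own atomic, one lane issues a single combined atomic and each lane then reconstructs the value its own atomic would have returned. What the new operations change is how that combined value is formed. A minimal standalone sketch of the uniform-value case, in plain C++ with an illustrative enum that is not the pass's API:

    #include <cstdint>

    // Illustrative model only (not part of the pass): the single value one
    // wavefront hands to the hardware atomic when every active lane supplies
    // the same 32-bit value V.
    enum class WaveAtomicOp { Add, Sub, Max, Min, UMax, UMin };

    uint32_t wavefrontCombinedValue(WaveAtomicOp Op, uint32_t V,
                                    unsigned ActiveLanes) {
      switch (Op) {
      case WaveAtomicOp::Add:
      case WaveAtomicOp::Sub:
        return V * ActiveLanes; // N lanes adding/subtracting V == one op with N*V
      default:
        return V; // max/min with a uniform value is idempotent: once is enough
      }
    }

For example, 16 active lanes each doing an atomic add of 7 collapse into a single atomic add of 112, while 16 lanes each doing an atomic max of 7 collapse into a single atomic max of 7. Divergent per-lane values instead go through the DPP-based wavefront scan in the code below.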

Reviewers: arsenm, sheredom, critson, rampitec

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, jfb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64328

llvm-svn: 366235
Author: Jay Foad
Date:   2019-07-16 17:44:54 +00:00
Parent: 4fbfb73f20
Commit: 9e0fb9bdee
2 changed files with 248 additions and 35 deletions


@@ -40,7 +40,7 @@ enum DPP_CTRL {
struct ReplacementInfo {
Instruction *I;
Instruction::BinaryOps Op;
AtomicRMWInst::BinOp Op;
unsigned ValIdx;
bool ValDivergent;
};
@@ -55,8 +55,8 @@ private:
bool HasDPP;
bool IsPixelShader;
void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
unsigned ValIdx, bool ValDivergent) const;
void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
bool ValDivergent) const;
public:
static char ID;
@@ -120,16 +120,17 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
break;
}
Instruction::BinaryOps Op;
AtomicRMWInst::BinOp Op = I.getOperation();
switch (I.getOperation()) {
switch (Op) {
default:
return;
case AtomicRMWInst::Add:
Op = Instruction::Add;
break;
case AtomicRMWInst::Sub:
Op = Instruction::Sub;
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
break;
}
@@ -161,7 +162,7 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
}
void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
Instruction::BinaryOps Op;
AtomicRMWInst::BinOp Op;
switch (I.getIntrinsicID()) {
default:
@@ -169,12 +170,32 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
Op = Instruction::Add;
Op = AtomicRMWInst::Add;
break;
case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
Op = Instruction::Sub;
Op = AtomicRMWInst::Sub;
break;
case Intrinsic::amdgcn_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
Op = AtomicRMWInst::Min;
break;
case Intrinsic::amdgcn_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
Op = AtomicRMWInst::UMin;
break;
case Intrinsic::amdgcn_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
Op = AtomicRMWInst::Max;
break;
case Intrinsic::amdgcn_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
Op = AtomicRMWInst::UMax;
break;
}
@@ -206,8 +227,57 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
ToReplace.push_back(Info);
}
// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
Value *LHS, Value *RHS) {
CmpInst::Predicate Pred;
switch (Op) {
default:
llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:
return B.CreateBinOp(Instruction::Add, LHS, RHS);
case AtomicRMWInst::Sub:
return B.CreateBinOp(Instruction::Sub, LHS, RHS);
case AtomicRMWInst::Max:
Pred = CmpInst::ICMP_SGT;
break;
case AtomicRMWInst::Min:
Pred = CmpInst::ICMP_SLT;
break;
case AtomicRMWInst::UMax:
Pred = CmpInst::ICMP_UGT;
break;
case AtomicRMWInst::UMin:
Pred = CmpInst::ICMP_ULT;
break;
}
Value *Cond = B.CreateICmp(Pred, LHS, RHS);
return B.CreateSelect(Cond, LHS, RHS);
}
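For the four new min/max flavours this helper does not map to a single binary operator; it lowers to an integer compare plus a select. A small usage sketch, assuming an IRBuilder B and two i32 values LHS and RHS are in scope as in the surrounding code:

    // For Op == AtomicRMWInst::Max this emits IR along the lines of:
    //   %cmp = icmp sgt i32 %lhs, %rhs
    //   %max = select i1 %cmp, i32 %lhs, i32 %rhs
    Value *Max = buildNonAtomicBinOp(B, AtomicRMWInst::Max, LHS, RHS);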
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
unsigned BitWidth) {
switch (Op) {
default:
llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
case AtomicRMWInst::UMax:
return APInt::getMinValue(BitWidth);
case AtomicRMWInst::UMin:
return APInt::getMaxValue(BitWidth);
case AtomicRMWInst::Max:
return APInt::getSignedMinValue(BitWidth);
case AtomicRMWInst::Min:
return APInt::getSignedMaxValue(BitWidth);
}
}
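Each identity is the value that leaves any operand unchanged under its operation, which is why inactive lanes preset to it cannot perturb the scan. A quick standalone check of the 32-bit choices, illustrative only and assuming C++14 for constexpr std::max/std::min:

    #include <algorithm>
    #include <cstdint>

    // op(identity, x) == x for every x; checked here for 32 bits only.
    static_assert(std::max<int32_t>(INT32_MIN, 42) == 42, "Max identity");
    static_assert(std::min<int32_t>(INT32_MAX, 42) == 42, "Min identity");
    static_assert(std::max<uint32_t>(0u, 42u) == 42u, "UMax identity");
    static_assert(std::min<uint32_t>(UINT32_MAX, 42u) == 42u, "UMin identity");
    // Add and Sub use 0: x + 0 == x and x - 0 == x.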
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Instruction::BinaryOps Op,
AtomicRMWInst::BinOp Op,
unsigned ValIdx,
bool ValDivergent) const {
// Start building just before the instruction.
@@ -266,16 +336,16 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
Value *LaneOffset = nullptr;
Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
Value *ExclScan = nullptr;
Value *NewV = nullptr;
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
Value *const Identity = B.getIntN(TyBitWidth, 0);
// First we need to set all inactive invocations to 0, so that they can
// correctly contribute to the final result.
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
CallInst *const SetInactive =
B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
@@ -283,7 +353,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
{Identity, SetInactive, B.getInt32(DPP_WF_SR1),
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
NewV = FirstDPP;
ExclScan = FirstDPP;
const unsigned Iters = 7;
const unsigned DPPCtrl[Iters] = {
@@ -295,21 +365,20 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// This loop performs an exclusive scan across the wavefront, with all lanes
// active (by using the WWM intrinsic).
for (unsigned Idx = 0; Idx < Iters; Idx++) {
Value *const UpdateValue = Idx < 3 ? FirstDPP : NewV;
Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
CallInst *const DPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Ty,
{Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
NewV = B.CreateBinOp(Op, NewV, DPP);
ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
}
LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
NewV = B.CreateBinOp(Op, SetInactive, NewV);
NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);
// Read the value from the last lane, which has accumulated the values of
// each active lane in the wavefront. This will be our new value with which
// we will provide to the atomic operation.
// each active lane in the wavefront. This will be our new value which we
// will provide to the atomic operation.
if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
Value *const ExtractHi =
@@ -324,9 +393,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
NewV = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
{}, {NewV, B.getInt32(63)});
NewV = ReadLane;
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
{NewV, B.getInt32(63)});
} else {
llvm_unreachable("Unhandled atomic bit width");
}
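For divergent values, the DPP sequence above performs a wavefront-wide scan with every lane enabled: each lane ends up with the combination of all lanes below it (kept as ExclScan for the per-lane offsets later), and lane 63 ends up with the combination of the entire wavefront, which is what the readlane extracts as the value for the single atomic. A lane-by-lane software model of the values produced, illustrative only; the real code uses cross-lane DPP shuffles, not a loop:

    #include <cstdint>
    #include <vector>

    // Software model of the wavefront scan, not of DPP itself. Identity and
    // op play the roles of getIdentityValueForAtomicOp and buildNonAtomicBinOp.
    template <typename BinOp>
    void modelWavefrontScan(const std::vector<uint32_t> &Lane, // identity for inactive lanes
                            uint32_t Identity, BinOp op,
                            std::vector<uint32_t> &ExclScan,   // per-lane exclusive scan
                            uint32_t &Reduction) {             // value read from lane 63
      uint32_t Acc = Identity;
      ExclScan.assign(Lane.size(), Identity);
      for (size_t I = 0; I != Lane.size(); ++I) {
        ExclScan[I] = Acc;      // combination of lanes 0..I-1
        Acc = op(Acc, Lane[I]); // running inclusive scan
      }
      Reduction = Acc;
    }

With op performing a max, Reduction is the largest value any active lane supplied, so one ds_max_rtn with it updates memory exactly as the individual per-lane atomics would have.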
@@ -334,14 +402,32 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
} else {
// Get the total number of active lanes we have by using popcount.
Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
switch (Op) {
default:
llvm_unreachable("Unhandled atomic op");
// Calculate the new value we will be contributing to the atomic operation
// for the entire wavefront.
NewV = B.CreateMul(V, CtpopCast);
LaneOffset = B.CreateMul(V, MbcntCast);
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub: {
// Get the total number of active lanes we have by using popcount.
Instruction *const Ctpop =
B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
// Calculate the new value we will be contributing to the atomic operation
// for the entire wavefront.
NewV = B.CreateMul(V, CtpopCast);
break;
}
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
// Max/min with a uniform value is idempotent: doing the atomic operation
// multiple times has the same effect as doing it once.
NewV = V;
break;
}
}
// We only want a single lane to enter our new control flow, and we do this
@@ -407,7 +493,26 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// get our individual lane's slice into the result. We use the lane offset we
// previously calculated combined with the atomic result value we got from the
// first lane, to get our lane's index into the atomic result.
Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
Value *LaneOffset = nullptr;
if (ValDivergent) {
LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
} else {
switch (Op) {
default:
llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
LaneOffset = B.CreateMul(V, MbcntCast);
break;
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
LaneOffset = B.CreateSelect(Cond, Identity, V);
break;
}
}
Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
if (IsPixelShader) {
// Need a final PHI to reconverge to above the helper lane branch mask.

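Once the single atomic has run, its result (the old memory value) is broadcast to every lane, and each lane combines it with its own LaneOffset to recover what its original atomic would have returned: the wwm'd exclusive scan in the divergent case, mbcnt * V for uniform add/sub, and for uniform min/max either the identity (the first active lane, selected by the same Cond used for the single-lane branch, saw memory untouched) or V itself (every later lane observes memory after the first lane's update). A compact model of the uniform-value reconstruction, illustrative only and reusing the enum from the sketch near the top of the commit message:

    #include <algorithm>
    #include <cstdint>

    enum class WaveAtomicOp { Add, Sub, Max, Min, UMax, UMin }; // as in the earlier sketch

    // Illustrative model only: the value atomicrmw would have returned to the
    // K-th active lane (K == mbcnt of lower active lanes), given the old value
    // Old read back by the single combined atomic and a uniform input V.
    uint32_t laneResultUniform(WaveAtomicOp Op, uint32_t Old, uint32_t V,
                               unsigned K) {
      switch (Op) {
      case WaveAtomicOp::Add:
        return Old + K * V; // K earlier lanes had already added V
      case WaveAtomicOp::Sub:
        return Old - K * V;
      case WaveAtomicOp::UMax:
        return K == 0 ? Old : std::max(Old, V);
      case WaveAtomicOp::UMin:
        return K == 0 ? Old : std::min(Old, V);
      case WaveAtomicOp::Max:
        return K == 0 ? Old : uint32_t(std::max(int32_t(Old), int32_t(V)));
      case WaveAtomicOp::Min:
        return K == 0 ? Old : uint32_t(std::min(int32_t(Old), int32_t(V)));
      }
      return Old; // unreachable
    }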

@@ -194,3 +194,111 @@ entry:
store i64 %old, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: max_i32_varying:
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
store i32 %old, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: max_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_max_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
entry:
%old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
store i64 %old, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: min_i32_varying:
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
store i32 %old, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: min_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_min_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
entry:
%old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
store i64 %old, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: umax_i32_varying:
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
store i32 %old, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: umax_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_max_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
entry:
%old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
store i64 %old, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: umin_i32_varying:
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
store i32 %old, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: umin_i64_constant:
; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
; GCN: ds_min_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
entry:
%old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
store i64 %old, i64 addrspace(1)* %out
ret void
}