[AMDGPU] Optimize atomic AND/OR/XOR
Summary:
Extend the atomic optimizer to handle AND, OR and XOR.

Reviewers: arsenm, sheredom

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, jfb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64809

llvm-svn: 366323
This commit is contained in:
parent cdfc2797e3
commit b05f60662f
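The idea behind the pass, and behind this extension, is that when every active lane of a wavefront performs an atomic AND/OR/XOR with a uniform value on the same address, the lanes' contributions can be combined in registers and issued as a single atomic by one lane. Below is a minimal scalar sketch of that reduction, for illustration only: the helper name, the enum and the scalar types are made up here, and the real pass emits LLVM IR through IRBuilder rather than computing anything itself.

// Illustration only: the combined value one wavefront contributes for a
// uniform operand V, given the 64-bit execution mask Exec. ADD/SUB scale by
// the active-lane count, AND/OR/MIN/MAX are idempotent, and XOR depends only
// on the parity of the active-lane count.
#include <bitset>
#include <cstdint>

enum class Op { Add, Sub, And, Or, Xor, Max, Min, UMax, UMin };

static uint32_t WavefrontContribution(Op O, uint32_t V, uint64_t Exec) {
  const uint32_t ActiveLanes =
      static_cast<uint32_t>(std::bitset<64>(Exec).count());
  switch (O) {
  case Op::Add:
  case Op::Sub:
    return V * ActiveLanes;        // each lane adds/subtracts V once
  case Op::Xor:
    return V * (ActiveLanes & 1);  // pairs of identical xors cancel out
  default:
    return V;                      // and/or/min/max: applying V once is enough
  }
}

Add and sub were already handled before this change; the new cases are and/or/xor, where the uniform-value contribution is either idempotent or depends only on the parity of the active-lane count.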
@@ -127,6 +127,9 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
     return;
   case AtomicRMWInst::Add:
   case AtomicRMWInst::Sub:
+  case AtomicRMWInst::And:
+  case AtomicRMWInst::Or:
+  case AtomicRMWInst::Xor:
   case AtomicRMWInst::Max:
   case AtomicRMWInst::Min:
   case AtomicRMWInst::UMax:
@@ -177,6 +180,21 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
     Op = AtomicRMWInst::Sub;
     break;
+  case Intrinsic::amdgcn_buffer_atomic_and:
+  case Intrinsic::amdgcn_struct_buffer_atomic_and:
+  case Intrinsic::amdgcn_raw_buffer_atomic_and:
+    Op = AtomicRMWInst::And;
+    break;
+  case Intrinsic::amdgcn_buffer_atomic_or:
+  case Intrinsic::amdgcn_struct_buffer_atomic_or:
+  case Intrinsic::amdgcn_raw_buffer_atomic_or:
+    Op = AtomicRMWInst::Or;
+    break;
+  case Intrinsic::amdgcn_buffer_atomic_xor:
+  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+    Op = AtomicRMWInst::Xor;
+    break;
   case Intrinsic::amdgcn_buffer_atomic_smin:
   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
@@ -240,6 +258,12 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
     return B.CreateBinOp(Instruction::Add, LHS, RHS);
   case AtomicRMWInst::Sub:
     return B.CreateBinOp(Instruction::Sub, LHS, RHS);
+  case AtomicRMWInst::And:
+    return B.CreateBinOp(Instruction::And, LHS, RHS);
+  case AtomicRMWInst::Or:
+    return B.CreateBinOp(Instruction::Or, LHS, RHS);
+  case AtomicRMWInst::Xor:
+    return B.CreateBinOp(Instruction::Xor, LHS, RHS);
 
   case AtomicRMWInst::Max:
     Pred = CmpInst::ICMP_SGT;
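buildNonAtomicBinOp produces the plain (non-atomic) IR form of the atomic op; it is what later combines values in registers, for example the broadcast result with each lane's offset at the end of optimizeAtomic. A scalar model of that kind of register-side fold, for illustration only (the reduction loop and every name in it are made up, not from the patch):

// Illustration only: folding per-lane operands with the non-atomic form of
// the op, starting from the op's identity value, the way the pass combines
// values in registers instead of through memory.
#include <cstdint>
#include <vector>

static uint32_t ReduceLanes(const std::vector<uint32_t> &LaneValues,
                            uint32_t Identity,
                            uint32_t (*BinOp)(uint32_t, uint32_t)) {
  uint32_t Acc = Identity;
  for (uint32_t V : LaneValues)
    Acc = BinOp(Acc, V);
  return Acc;
}

// Example use: a wavefront-wide OR of the active lanes' values.
static uint32_t OrOp(uint32_t A, uint32_t B) { return A | B; }
// uint32_t Combined = ReduceLanes(Values, /*Identity=*/0u, OrOp);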
@@ -265,8 +289,11 @@ static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
     llvm_unreachable("Unhandled atomic op");
   case AtomicRMWInst::Add:
   case AtomicRMWInst::Sub:
+  case AtomicRMWInst::Or:
+  case AtomicRMWInst::Xor:
   case AtomicRMWInst::UMax:
     return APInt::getMinValue(BitWidth);
+  case AtomicRMWInst::And:
+  case AtomicRMWInst::UMin:
     return APInt::getMaxValue(BitWidth);
   case AtomicRMWInst::Max:
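The identity value is what inactive lanes are plugged with so they drop out of the combined result: 0 for or/xor (and add/sub/umax), all ones for and (and umin). A couple of standalone compile-time checks of those identities, for illustration only and not part of the patch:

// Standalone checks (illustration only) of the identities the pass relies on.
static_assert((0xDEADBEEFu | 0u) == 0xDEADBEEFu, "0 is the identity for or");
static_assert((0xDEADBEEFu ^ 0u) == 0xDEADBEEFu, "0 is the identity for xor");
static_assert((0xDEADBEEFu & ~0u) == 0xDEADBEEFu, "~0 is the identity for and");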
@@ -331,10 +358,10 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
   CallInst *const PartialMbcnt = B.CreateIntrinsic(
       Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
-  CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
-                                            {ExtractHi, PartialMbcnt});
-
-  Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
+  Value *const Mbcnt =
+      B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
+                                        {ExtractHi, PartialMbcnt}),
+                      Ty, false);
 
   Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
 
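After this change Mbcnt directly holds, in the result type, the number of active lanes whose index is below the current lane: mbcnt_lo counts the low 32 bits of the ballot mask, mbcnt_hi adds the high 32. A rough scalar model of that value, for illustration only with made-up names:

// Illustration only: what the mbcnt_lo/mbcnt_hi pair computes for one lane --
// the count of active lanes strictly below it in the 64-bit ballot mask.
#include <bitset>
#include <cstdint>

static unsigned LanesBelow(uint64_t Ballot, unsigned Lane /* 0..63 */) {
  const uint64_t BelowMask = (uint64_t(1) << Lane) - 1;  // bits 0..Lane-1
  return static_cast<unsigned>(std::bitset<64>(Ballot & BelowMask).count());
}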
@@ -408,32 +435,39 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
 
     case AtomicRMWInst::Add:
     case AtomicRMWInst::Sub: {
-      // Get the total number of active lanes we have by using popcount.
-      Instruction *const Ctpop =
-          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
-      Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
-
-      // Calculate the new value we will be contributing to the atomic operation
-      // for the entire wavefront.
-      NewV = B.CreateMul(V, CtpopCast);
+      // The new value we will be contributing to the atomic operation is the
+      // old value times the number of active lanes.
+      Value *const Ctpop = B.CreateIntCast(
+          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+      NewV = B.CreateMul(V, Ctpop);
       break;
     }
 
+    case AtomicRMWInst::And:
+    case AtomicRMWInst::Or:
     case AtomicRMWInst::Max:
     case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
     case AtomicRMWInst::UMin:
-      // Max/min with a uniform value is idempotent: doing the atomic operation
-      // multiple times has the same effect as doing it once.
+      // These operations with a uniform value are idempotent: doing the atomic
+      // operation multiple times has the same effect as doing it once.
      NewV = V;
       break;
+
+    case AtomicRMWInst::Xor:
+      // The new value we will be contributing to the atomic operation is the
+      // old value times the parity of the number of active lanes.
+      Value *const Ctpop = B.CreateIntCast(
+          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+      NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
+      break;
     }
   }
 
   // We only want a single lane to enter our new control flow, and we do this
   // by checking if there are any active lanes below us. Only one lane will
   // have 0 active lanes below us, so that will be the only one to progress.
-  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));
+  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
 
   // Store I's original basic block before we split the block.
   BasicBlock *const EntryBB = I.getParent();
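For a uniform operand the whole wavefront's contribution is computed up front: V times the number of active lanes for add/sub, V itself for the idempotent operations, and V times the parity of the active-lane count for xor, since pairs of identical xors cancel. A small standalone check of that parity argument, for illustration only:

// Illustration only: xor-ing the same value N times equals xor-ing it
// (N & 1) times, which is why NewV for xor is V * parity(active lanes).
#include <cassert>
#include <cstdint>

static uint32_t XorNTimes(uint32_t Init, uint32_t V, unsigned N) {
  uint32_t Acc = Init;
  for (unsigned I = 0; I != N; ++I)
    Acc ^= V;
  return Acc;
}

int main() {
  const uint32_t Init = 0x12345678u, V = 0x0000ABCDu;
  for (unsigned N = 0; N != 16; ++N)
    assert(XorNTimes(Init, V, N) == (Init ^ (V * (N & 1))));
  return 0;
}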
@@ -502,14 +536,19 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
         llvm_unreachable("Unhandled atomic op");
       case AtomicRMWInst::Add:
       case AtomicRMWInst::Sub:
-        LaneOffset = B.CreateMul(V, MbcntCast);
+        LaneOffset = B.CreateMul(V, Mbcnt);
         break;
+      case AtomicRMWInst::And:
+      case AtomicRMWInst::Or:
       case AtomicRMWInst::Max:
       case AtomicRMWInst::Min:
       case AtomicRMWInst::UMax:
       case AtomicRMWInst::UMin:
         LaneOffset = B.CreateSelect(Cond, Identity, V);
         break;
+      case AtomicRMWInst::Xor:
+        LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
+        break;
       }
     }
     Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
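LaneOffset is what each lane combines with the broadcast result to recover the value it would have read had the lanes really done their atomics one after another in lane order: V scaled by the count of active lanes below us for add/sub, a select between the identity and V for the idempotent operations, and V from only an odd number of lanes below us for xor. A scalar walk-through of the xor case, for illustration only and with made-up names:

// Illustration only: with a uniform operand V and lanes notionally executing
// in lane order, lane K's old value is the pre-op memory value xor-ed with V
// once per active lane below K, i.e. Old ^ (V * (K & 1)) when all lanes are
// active -- exactly the xor LaneOffset above.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Old = 0x11110000u;  // memory value before the wavefront
  const uint32_t V = 0x0F0F0F0Fu;    // uniform atomicrmw xor operand
  uint32_t Mem = Old;
  for (unsigned Lane = 0; Lane != 64; ++Lane) {  // all 64 lanes active
    const uint32_t Observed = Mem;  // what this lane's atomicrmw would return
    Mem ^= V;
    const uint32_t LaneOffset = V * (Lane & 1);
    assert(Observed == (Old ^ LaneOffset));
  }
  return 0;
}

The remaining hunk extends the test coverage with varying-operand and/or/xor kernels.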
@@ -195,6 +195,42 @@ entry:
   ret void
 }
 
+; GCN-LABEL: and_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: or_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: xor_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: max_i32_varying:
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]