AMDGPU: Define raw/struct variants of buffer atomic fadd

Somehow the new FP atomic buffer intrinsics ended up using the legacy style for buffer intrinsics.

Parent: 2f97a6b627
Commit: f305dea485
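For context, here is a minimal LLVM IR sketch of how the two intrinsic forms added by this patch are called. The declarations are taken from the new tests in this commit; the function names and argument values are only illustrative.

; Raw form: vdata, rsrc, voffset, soffset, cachepolicy (no vindex, no return value).
define amdgpu_ps void @raw_fadd_example(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

; Struct form: the same operands plus a vindex after the resource descriptor.
define amdgpu_ps void @struct_fadd_example(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
  call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg)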
@@ -973,9 +973,9 @@ class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
 def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
 def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
 
-class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
-  [data_ty],
-  [LLVMMatchType<0>,  // vdata(VGPR)
+class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = 0> : Intrinsic <
+  !if(NoRtn, [], [data_ty]),
+  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
    llvm_v4i32_ty,  // rsrc(SGPR)
    llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
@@ -1005,9 +1005,12 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
   [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
 
-class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
-  [data_ty],
-  [LLVMMatchType<0>,  // vdata(VGPR)
+// gfx908 intrinsic
+def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty, /*NoRtn*/1>;
+
+class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = 0> : Intrinsic <
+  !if(NoRtn, [], [data_ty]),
+  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
    llvm_v4i32_ty,  // rsrc(SGPR)
    llvm_i32_ty,    // vindex(VGPR)
    llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
@@ -1039,6 +1042,10 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
   [ImmArg<ArgIndex<6>>, IntrWillReturn], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
 
+// gfx908 intrinsic
+def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty, /*NoRtn*/1>;
+
+
 // Obsolescent tbuffer intrinsics.
 def int_amdgcn_tbuffer_load : Intrinsic <
   [llvm_any_ty],  // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
@@ -1804,9 +1811,11 @@ class AMDGPUGlobalAtomicNoRtn : Intrinsic <
   [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>], "",
   [SDNPMemOperand]>;
 
-def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn;
 def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn;
 
+// Legacy form of the intrinsic. raw and struct forms should be preferred.
+def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn;
+
 // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
 def int_amdgcn_mfma_f32_32x32x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x1f32">,
   Intrinsic<[llvm_v32f32_ty],
@@ -204,6 +204,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_OR, SIbuffer_atomic_or>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
 def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
@@ -3647,6 +3647,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
+  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
   default:
     llvm_unreachable("unhandled atomic opcode");
   }
@@ -3657,12 +3660,20 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                                Intrinsic::ID IID) const {
   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
+  const bool HasReturn = MI.getNumExplicitDefs() != 0;
 
-  Register Dst = MI.getOperand(0).getReg();
-  Register VData = MI.getOperand(2).getReg();
+  Register Dst;
 
-  Register CmpVal;
   int OpOffset = 0;
+  if (HasReturn) {
+    // A few FP atomics do not support return values.
+    Dst = MI.getOperand(0).getReg();
+  } else {
+    OpOffset = -1;
+  }
+
+  Register VData = MI.getOperand(2 + OpOffset).getReg();
+  Register CmpVal;
 
   if (IsCmpSwap) {
     CmpVal = MI.getOperand(3 + OpOffset).getReg();
@@ -3670,7 +3681,7 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
   }
 
   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
-  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
+  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
 
   // The struct intrinsic variants add one additional operand over raw.
   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
@@ -3695,9 +3706,12 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
   if (!VIndex)
     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
 
-  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
-    .addDef(Dst)
-    .addUse(VData); // vdata
+  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
+
+  if (HasReturn)
+    MIB.addDef(Dst);
+
+  MIB.addUse(VData); // vdata
 
   if (IsCmpSwap)
     MIB.addReg(CmpVal);
@@ -4462,6 +4476,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
     return legalizeBufferAtomic(MI, B, IntrID);
@@ -2957,6 +2957,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     executeInWaterfallLoop(MI, MRI, {2, 5});
     return;
   }
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
+    applyDefaultMapping(OpdMapper);
+    executeInWaterfallLoop(MI, MRI, {1, 4});
+    return;
+  }
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
     applyDefaultMapping(OpdMapper);
     executeInWaterfallLoop(MI, MRI, {3, 6});
@@ -3933,6 +3938,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     // initialized.
     break;
   }
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
+    // vdata_in
+    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+    // rsrc
+    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+    // vindex
+    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+    // voffset
+    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+    // soffset
+    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+    break;
+  }
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
     // vdata_out
     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -225,6 +225,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_or>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
@@ -238,6 +239,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_or>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
 def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
 def : SourceOfDivergence<int_amdgcn_ps_live>;
@@ -1400,24 +1400,24 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
     (name vt:$vdata_in, v4i32:$rsrc, 0,
           0, i32:$soffset, timm:$offset,
           timm:$cachepolicy, 0),
-    (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset,
-                                           (as_i16imm $offset), (extract_slc $cachepolicy))
+    (!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
+                                           (as_i16timm $offset), (extract_slc $cachepolicy))
   >;
 
   def : GCNPat<
     (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
           0, i32:$soffset, timm:$offset,
           timm:$cachepolicy, timm),
-    (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
-                                          (as_i16imm $offset), (extract_slc $cachepolicy))
+    (!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+                                          (as_i16timm $offset), (extract_slc $cachepolicy))
   >;
 
   def : GCNPat<
     (name vt:$vdata_in, v4i32:$rsrc, 0,
           i32:$voffset, i32:$soffset, timm:$offset,
           timm:$cachepolicy, 0),
-    (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
-                                          (as_i16imm $offset), (extract_slc $cachepolicy))
+    (!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+                                          (as_i16timm $offset), (extract_slc $cachepolicy))
   >;
 
   def : GCNPat<
@@ -1425,9 +1425,9 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
           i32:$voffset, i32:$soffset, timm:$offset,
           timm:$cachepolicy, timm),
     (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
-      $vdata_in,
-      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-      $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
+      getVregSrcForVT<vt>.ret:$vdata_in,
+      (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+      SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy))
   >;
 }
@@ -1082,8 +1082,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       Info.flags |= MachineMemOperand::MOStore;
     } else {
       // Atomic
-      Info.opc = ISD::INTRINSIC_W_CHAIN;
-      Info.memVT = MVT::getVT(CI.getType());
+      Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
+                                            ISD::INTRINSIC_W_CHAIN;
+      Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
       Info.flags = MachineMemOperand::MOLoad |
                    MachineMemOperand::MOStore |
                    MachineMemOperand::MODereferenceable;
@@ -7062,7 +7063,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
-
   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
     return lowerStructBufferAtomicIntrin(Op, DAG,
                                          AMDGPUISD::BUFFER_ATOMIC_SWAP);
@@ -7485,7 +7485,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
-
+  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
+    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
   case Intrinsic::amdgcn_buffer_atomic_fadd: {
     unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
     unsigned IdxEn = 1;
@@ -2410,8 +2410,8 @@ def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
 def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
 }
 
-class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
-  let OutOperandList = (outs type0:$dst);
+class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction {
+  let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst));
   let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                            type2:$soffset, untyped_imm_0:$offset,
                            untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
@@ -2432,6 +2432,7 @@ def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction<1/*NoRtn*/>;
 
 def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
   let OutOperandList = (outs type0:$dst);
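The G_AMDGPU_BUFFER_ATOMIC_FADD generic instruction above is marked NoRtn, matching the intrinsic definitions: the new fadd buffer atomics return void. A small IR sketch of the contrast, using the pre-existing integer raw buffer atomic add intrinsic (which is not part of this diff) only for comparison:

; The fadd form added by this patch has no result value.
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)
; The existing integer form returns the old memory value (shown only for contrast).
declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32 immarg)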
@ -0,0 +1,245 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
|
||||
|
||||
; Natural mapping
|
||||
define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
%voffset.add = add i32 %voffset, 4095
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Natural mapping, no voffset
|
||||
define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; All operands need regbank legalization
|
||||
define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) {
|
||||
; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: successors: %bb.2(0x80000000)
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
|
||||
; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
|
||||
; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
|
||||
; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
|
||||
; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
|
||||
; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
|
||||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
|
||||
; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
|
||||
; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
|
||||
; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
|
||||
; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
|
||||
; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
|
||||
; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
|
||||
; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
|
||||
; CHECK: bb.3:
|
||||
; CHECK: successors: %bb.4(0x80000000)
|
||||
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
|
||||
; CHECK: bb.4:
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; All operands need regbank legalization, no voffset
|
||||
define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 %soffset) {
|
||||
; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: successors: %bb.2(0x80000000)
|
||||
; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
|
||||
; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
|
||||
; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
|
||||
; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec
|
||||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
|
||||
; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec
|
||||
; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
|
||||
; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
|
||||
; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
|
||||
; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
|
||||
; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
|
||||
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
|
||||
; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
|
||||
; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
|
||||
; CHECK: bb.3:
|
||||
; CHECK: successors: %bb.4(0x80000000)
|
||||
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
|
||||
; CHECK: bb.4:
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
%voffset = add i32 %voffset.base, 4095
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Natural mapping + slc
|
||||
define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
|
||||
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
@ -0,0 +1,260 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
|
||||
|
||||
; Natural mapping
|
||||
define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
%voffset.add = add i32 %voffset, 4095
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Natural mapping, no voffset
|
||||
define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; All register operands need legalization
|
||||
define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) {
|
||||
; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: successors: %bb.2(0x80000000)
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
|
||||
; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
|
||||
; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
|
||||
; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
|
||||
; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
|
||||
; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
|
||||
; CHECK: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec
|
||||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
|
||||
; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec
|
||||
; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
|
||||
; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec
|
||||
; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
|
||||
; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
|
||||
; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
|
||||
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
|
||||
; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
|
||||
; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
|
||||
; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
|
||||
; CHECK: bb.3:
|
||||
; CHECK: successors: %bb.4(0x80000000)
|
||||
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
|
||||
; CHECK: bb.4:
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; All register operands need legalization, no voffset
|
||||
define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 %soffset) {
|
||||
; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: successors: %bb.2(0x80000000)
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
|
||||
; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
|
||||
; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
|
||||
; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
|
||||
; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
|
||||
; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
|
||||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
|
||||
; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
|
||||
; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
|
||||
; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
|
||||
; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
|
||||
; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
|
||||
; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
|
||||
; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
|
||||
; CHECK: bb.3:
|
||||
; CHECK: successors: %bb.4(0x80000000)
|
||||
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
|
||||
; CHECK: bb.4:
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Natural mapping + slc
|
||||
define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
|
||||
; CHECK-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
|
||||
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll (new file, 72 lines)
@@ -0,0 +1,72 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck %s -check-prefix=CHECK

define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s11, s5
; CHECK-NEXT: s_mov_b32 s10, s4
; CHECK-NEXT: s_mov_b32 s9, s3
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24)
  ret void
}

define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s11, s5
; CHECK-NEXT: s_mov_b32 s10, s4
; CHECK-NEXT: s_mov_b32 s9, s3
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s6
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
  ret void
}

define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s11, s5
; CHECK-NEXT: s_mov_b32 s10, s4
; CHECK-NEXT: s_mov_b32 s9, s3
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s6 offen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s11, s5
; CHECK-NEXT: s_mov_b32 s10, s4
; CHECK-NEXT: s_mov_b32 s9, s3
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s6 offset:92
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
  ret void
}

define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s11, s5
; CHECK-NEXT: s_mov_b32 s10, s4
; CHECK-NEXT: s_mov_b32 s9, s3
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen slc
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
  ret void
}

declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0

attributes #0 = { nounwind }
test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll (new file, 61 lines)
@@ -0,0 +1,61 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck %s -check-prefix=CHECK

define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s11, s5
; CHECK-NEXT: s_mov_b32 s10, s4
; CHECK-NEXT: s_mov_b32 s9, s3
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

; Natural mapping, no voffset
define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s11, s5
; CHECK-NEXT: s_mov_b32 s10, s4
; CHECK-NEXT: s_mov_b32 s9, s3
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 idxen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
  ret void
}

define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s11, s5
; CHECK-NEXT: s_mov_b32 s10, s4
; CHECK-NEXT: s_mov_b32 s9, s3
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen slc
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
  ret void
}

define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s11, s5
; CHECK-NEXT: s_mov_b32 s10, s4
; CHECK-NEXT: s_mov_b32 s9, s3
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s6 idxen offen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0

attributes #0 = { nounwind }