mirror of
https://github.com/RPCS3/llvm.git
synced 2024-12-13 23:18:58 +00:00
AMDGPU: Select d16 loads into low component of register
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318005 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
19f503cb02
commit
d8f8d0c326
@ -201,6 +201,8 @@ private:
|
||||
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
|
||||
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||
|
||||
bool SelectHi16Elt(SDValue In, SDValue &Src) const;
|
||||
|
||||
void SelectADD_SUB_I64(SDNode *N);
|
||||
void SelectUADDO_USUBO(SDNode *N);
|
||||
void SelectDIV_SCALE(SDNode *N);
|
||||
@ -1134,8 +1136,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
|
||||
|
||||
if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
|
||||
unsigned Imm = CAddr->getZExtValue();
|
||||
assert(!SIInstrInfo::isLegalMUBUFImmOffset(Imm) &&
|
||||
"should have been selected by other pattern");
|
||||
|
||||
SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
|
||||
MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
|
||||
@ -2024,6 +2024,35 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
|
||||
return true;
|
||||
}
|
||||
|
||||
// TODO: Can we identify things like v_mad_mixhi_f16?
|
||||
/// Selector behind the Hi16Elt complex pattern.
///
/// Succeeds and stores the selected operand in \p Src when \p In is one of:
///  - undef, which is forwarded unchanged;
///  - an integer or floating-point constant, whose bit pattern is shifted
///    left by 16 and materialized through V_MOV_B32_e32;
///  - a value accepted by isExtractHiElt(), which then fills in \p Src.
///
/// \returns true if \p In was matched, false otherwise.
bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
  // Builds a 32-bit move of the immediate (Bits << 16) and returns its value.
  auto MoveIntoHiHalf = [this, &In](uint64_t Bits) {
    SDLoc DL(In);
    SDValue Imm = CurDAG->getTargetConstant(Bits << 16, DL, MVT::i32);
    MachineSDNode *Mov =
        CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Imm);
    return SDValue(Mov, 0);
  };

  // Undef needs no materialization; hand it back as-is.
  if (In.isUndef()) {
    Src = In;
    return true;
  }

  if (const auto *IntC = dyn_cast<ConstantSDNode>(In)) {
    Src = MoveIntoHiHalf(IntC->getZExtValue());
    return true;
  }

  if (const auto *FPC = dyn_cast<ConstantFPSDNode>(In)) {
    // Use the raw bit pattern of the floating-point constant.
    Src = MoveIntoHiHalf(FPC->getValueAPF().bitcastToAPInt().getZExtValue());
    return true;
  }

  // Everything else is left to the high-element extraction matcher.
  return isExtractHiElt(In, Src);
}
|
||||
|
||||
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
|
||||
const AMDGPUTargetLowering& Lowering =
|
||||
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
|
||||
|
@ -133,6 +133,29 @@ def shl_oneuse : HasOneUseBinOp<shl>;
|
||||
|
||||
def select_oneuse : HasOneUseTernaryOp<select>;
|
||||
|
||||
// Fragment: $src0 shifted right by the constant 16 through srl_oneuse.
def srl_16 : PatFrag<(ops node:$src0), (srl_oneuse node:$src0, (i32 16))>;
|
||||
|
||||
|
||||
// Fragment: the i32 result of srl_16 applied to $src0, truncated to i16.
def hi_i16_elt : PatFrag<(ops node:$src0),
                         (i16 (trunc (i32 (srl_16 node:$src0))))>;
|
||||
|
||||
|
||||
// Leaf predicate: accepts a node N only when N is an ISD::BITCAST whose
// operand is an ISD::SRL with a constant shift amount equal to 16.
def hi_f16_elt : PatLeaf<
  (vt), [{
  if (N->getOpcode() != ISD::BITCAST)
    return false;

  SDValue Tmp = N->getOperand(0);
  if (Tmp.getOpcode() != ISD::SRL)
    return false;

  // The shift amount must be a constant, and that constant must be 16.
  if (const auto *RHS = dyn_cast<ConstantSDNode>(Tmp.getOperand(1)))
    return RHS->getZExtValue() == 16;
  return false;
}]>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// PatLeafs for floating-point comparisons
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -811,7 +811,7 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
|
||||
let SubtargetPredicate = HasD16LoadStore in {
|
||||
|
||||
defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads <
|
||||
"buffer_load_ubyte_d16", VGPR_32, i32
|
||||
"buffer_load_ubyte_d16", VGPR_32, i32, null_frag, 1
|
||||
>;
|
||||
|
||||
defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads <
|
||||
@ -819,7 +819,7 @@ defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads <
|
||||
>;
|
||||
|
||||
defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads <
|
||||
"buffer_load_sbyte_d16", VGPR_32, i32
|
||||
"buffer_load_sbyte_d16", VGPR_32, i32, null_frag, 1
|
||||
>;
|
||||
|
||||
defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads <
|
||||
@ -827,7 +827,7 @@ defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads <
|
||||
>;
|
||||
|
||||
defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads <
|
||||
"buffer_load_short_d16", VGPR_32, i32
|
||||
"buffer_load_short_d16", VGPR_32, i32, null_frag, 1
|
||||
>;
|
||||
|
||||
defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads <
|
||||
@ -1169,6 +1169,36 @@ multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen,
|
||||
>;
|
||||
}
|
||||
|
||||
// Patterns for a two-element build_vector whose element 0 is a scratch load
// and whose element 1 is matched by Hi16Elt. The load is selected to the given
// instruction and the Hi16Elt operand is passed as its trailing $hi operand.
// Both the MUBUFScratchOffen and MUBUFScratchOffset address forms are covered,
// each for a v2i16 result and for a v2f16 result (load bitconverted to f16).
multiclass MUBUFScratchLoadPat_Lo16 <MUBUF_Pseudo OffenInst,
                                     MUBUF_Pseudo OffsetInst,
                                     ValueType Ty, PatFrag LoadOp> {
  // MUBUFScratchOffen address, v2i16 result.
  def : GCNPat <
    (build_vector
      (Ty (LoadOp (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
                                     i32:$soffset, u16imm:$offset))),
      (Ty (Hi16Elt Ty:$hi))),
    (v2i16 (OffenInst $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
  >;

  // MUBUFScratchOffen address, v2f16 result.
  def : GCNPat <
    (build_vector
      (f16 (bitconvert (Ty (LoadOp (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
                                                      i32:$soffset, u16imm:$offset))))),
      (f16 (Hi16Elt f16:$hi))),
    (v2f16 (OffenInst $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
  >;

  // MUBUFScratchOffset address, v2i16 result.
  def : GCNPat <
    (build_vector
      (Ty (LoadOp (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
      (Ty (Hi16Elt Ty:$hi))),
    (v2i16 (OffsetInst $srsrc, $soffset, $offset, 0, 0, 0, $hi))
  >;

  // MUBUFScratchOffset address, v2f16 result.
  def : GCNPat <
    (build_vector
      (f16 (bitconvert (Ty (LoadOp (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))),
      (f16 (Hi16Elt f16:$hi))),
    (v2f16 (OffsetInst $srsrc, $soffset, $offset, 0, 0, 0, $hi))
  >;
}
|
||||
|
||||
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>;
|
||||
defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, az_extloadi8_private>;
|
||||
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>;
|
||||
@ -1184,6 +1214,10 @@ let OtherPredicates = [HasD16LoadStore] in {
|
||||
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
|
||||
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
|
||||
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
|
||||
|
||||
defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, i16, load_private>;
|
||||
defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
|
||||
defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
|
||||
}
|
||||
|
||||
// BUFFER_LOAD_DWORD*, addr64=0
|
||||
|
@ -559,6 +559,19 @@ multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
|
||||
>;
|
||||
}
|
||||
|
||||
// Patterns for a two-element build_vector whose element 0 is a load addressed
// through DS1Addr1Offset and whose element 1 is matched by Hi16Elt. The load
// is selected to the given instruction with the Hi16Elt operand passed as its
// trailing $hi operand. One pattern yields v2i16, the other v2f16 (load
// bitconverted to f16).
multiclass DSReadPat_Lo16 <DS_Pseudo Inst, PatFrag LoadOp, ValueType Ty = i16> {
  // v2i16 result.
  def : GCNPat <
    (build_vector
      (Ty (LoadOp (DS1Addr1Offset i32:$ptr, i32:$offset))),
      (Ty (Hi16Elt Ty:$hi))),
    (v2i16 (Inst $ptr, (as_i16imm $offset), 0, $hi))
  >;

  // v2f16 result.
  def : GCNPat <
    (build_vector
      (f16 (bitconvert (Ty (LoadOp (DS1Addr1Offset i32:$ptr, i32:$offset))))),
      (f16 (Hi16Elt f16:$hi))),
    (v2f16 (Inst $ptr, (as_i16imm $offset), 0, $hi))
  >;
}
|
||||
|
||||
|
||||
def : DSReadPat <DS_READ_I8, i32, sextloadi8_local_m0>;
|
||||
def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local_m0>;
|
||||
def : DSReadPat <DS_READ_I8, i16, sextloadi8_local_m0>;
|
||||
@ -587,6 +600,11 @@ let AddedComplexity = 100 in {
|
||||
defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
|
||||
defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
|
||||
defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
|
||||
|
||||
defm : DSReadPat_Lo16<DS_READ_U16_D16, load_local>;
|
||||
defm : DSReadPat_Lo16<DS_READ_U8_D16, az_extloadi8_local>;
|
||||
defm : DSReadPat_Lo16<DS_READ_I8_D16, sextloadi8_local>;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -654,6 +654,30 @@ multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, Val
|
||||
>;
|
||||
}
|
||||
|
||||
// Patterns for a two-element build_vector whose element 0 is a load addressed
// through FLATOffset and whose element 1 is matched by Hi16Elt. The load is
// selected to the given instruction with the Hi16Elt operand passed as its
// trailing $hi operand. One pattern yields v2i16, the other v2f16 (load
// bitconverted to f16).
multiclass FlatLoadPat_Lo16 <FLAT_Pseudo Inst, SDPatternOperator LoadOp,
                             ValueType Ty = i16> {
  // v2i16 result.
  def : GCNPat <
    (build_vector
      (Ty (LoadOp (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))),
      (Ty (Hi16Elt Ty:$hi))),
    (v2i16 (Inst $vaddr, $offset, 0, $slc, $hi))
  >;

  // v2f16 result.
  def : GCNPat <
    (build_vector
      (f16 (bitconvert (Ty (LoadOp (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))),
      (f16 (Hi16Elt f16:$hi))),
    (v2f16 (Inst $vaddr, $offset, 0, $slc, $hi))
  >;
}
|
||||
|
||||
// Patterns for a two-element build_vector whose element 0 is a load addressed
// through FLATOffsetSigned and whose element 1 is matched by Hi16Elt. The load
// is selected to the given instruction with the Hi16Elt operand passed as its
// trailing $hi operand. One pattern yields v2i16, the other v2f16 (load
// bitconverted to f16).
multiclass FlatSignedLoadPat_Lo16 <FLAT_Pseudo Inst, SDPatternOperator LoadOp,
                                   ValueType Ty = i16> {
  // v2i16 result.
  def : GCNPat <
    (build_vector
      (Ty (LoadOp (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))),
      (Ty (Hi16Elt Ty:$hi))),
    (v2i16 (Inst $vaddr, $offset, 0, $slc, $hi))
  >;

  // v2f16 result.
  def : GCNPat <
    (build_vector
      (f16 (bitconvert (Ty (LoadOp (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))),
      (f16 (Hi16Elt f16:$hi))),
    (v2f16 (Inst $vaddr, $offset, 0, $slc, $hi))
  >;
}
|
||||
|
||||
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
|
||||
(vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
|
||||
(inst $vaddr, $offset, 0, $slc)
|
||||
@ -765,6 +789,12 @@ defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>;
|
||||
defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>;
|
||||
defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>;
|
||||
}
|
||||
|
||||
let AddedComplexity = 9 in {
|
||||
defm : FlatLoadPat_Lo16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_flat>;
|
||||
defm : FlatLoadPat_Lo16 <FLAT_LOAD_SBYTE_D16, sextloadi8_flat>;
|
||||
defm : FlatLoadPat_Lo16 <FLAT_LOAD_SHORT_D16, load_flat>;
|
||||
}
|
||||
}
|
||||
|
||||
} // End OtherPredicates = [HasFlatAddressSpace]
|
||||
@ -801,6 +831,11 @@ def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i3
|
||||
defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>;
|
||||
defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>;
|
||||
defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>;
|
||||
|
||||
defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_global>;
|
||||
defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_global>;
|
||||
defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SHORT_D16, load_global>;
|
||||
|
||||
}
|
||||
|
||||
def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
|
||||
|
@ -843,6 +843,9 @@ def VOP3OpSelMods0 : ComplexPattern<untyped, 3, "SelectVOP3OpSelMods0">;
|
||||
|
||||
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
|
||||
|
||||
|
||||
// Complex pattern producing one operand; matching is delegated to the C++
// selector function named "SelectHi16Elt".
def Hi16Elt : ComplexPattern<untyped, 1, "SelectHi16Elt">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SI assembler operands
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -100,15 +100,16 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
|
||||
; SICIVI: buffer_store_short
|
||||
; SICIVI: buffer_store_short
|
||||
|
||||
; GFX9: buffer_load_ushort
|
||||
; SICIVI: buffer_load_ushort
|
||||
; SICIVI: buffer_store_short
|
||||
|
||||
; GFX9: buffer_load_ushort
|
||||
; GFX9: global_load_short_d16_hi
|
||||
|
||||
; GFX9: global_load_short_d16 v
|
||||
; GFX9: buffer_store_dword
|
||||
; GFX9: buffer_store_dword
|
||||
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_store_short
|
||||
; GFX9: buffer_load_ushort
|
||||
; GFX9: buffer_store_short
|
||||
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
|
||||
%p0 = extractelement <3 x i16> %foo, i32 %idx
|
||||
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
|
||||
|
@ -503,4 +503,102 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Remove m0 init and waitcnt between reads
|
||||
; FIXME: Is there a cost to using the extload over not?
|
||||
; GCN-LABEL: {{^}}load_local_v2i16_split:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: s_mov_b32 m0, -1
|
||||
; GFX9-NEXT: ds_read_u16 v1, v0
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define <2 x i16> @load_local_v2i16_split(i16 addrspace(3)* %in) #0 {
|
||||
entry:
|
||||
%gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
|
||||
%load0 = load volatile i16, i16 addrspace(3)* %in
|
||||
%load1 = load volatile i16, i16 addrspace(3)* %gep
|
||||
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
|
||||
%build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
|
||||
ret <2 x i16> %build1
|
||||
}
|
||||
|
||||
; FIXME: Remove waitcnt between reads
|
||||
; GCN-LABEL: {{^}}load_global_v2i16_split:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: global_load_ushort v2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_load_short_d16_hi v2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
|
||||
entry:
|
||||
%gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
|
||||
%load0 = load volatile i16, i16 addrspace(1)* %in
|
||||
%load1 = load volatile i16, i16 addrspace(1)* %gep
|
||||
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
|
||||
%build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
|
||||
ret <2 x i16> %build1
|
||||
}
|
||||
|
||||
; FIXME: Remove waitcnt between reads
|
||||
; GCN-LABEL: {{^}}load_flat_v2i16_split:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: flat_load_ushort v2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: flat_load_short_d16_hi v2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define <2 x i16> @load_flat_v2i16_split(i16 addrspace(4)* %in) #0 {
|
||||
entry:
|
||||
%gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
|
||||
%load0 = load volatile i16, i16 addrspace(4)* %in
|
||||
%load1 = load volatile i16, i16 addrspace(4)* %gep
|
||||
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
|
||||
%build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
|
||||
ret <2 x i16> %build1
|
||||
}
|
||||
|
||||
; FIXME: Remove waitcnt between reads
|
||||
; GCN-LABEL: {{^}}load_constant_v2i16_split:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: global_load_ushort v2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_load_short_d16_hi v2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define <2 x i16> @load_constant_v2i16_split(i16 addrspace(2)* %in) #0 {
|
||||
entry:
|
||||
%gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 1
|
||||
%load0 = load volatile i16, i16 addrspace(2)* %in
|
||||
%load1 = load volatile i16, i16 addrspace(2)* %gep
|
||||
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
|
||||
%build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
|
||||
ret <2 x i16> %build1
|
||||
}
|
||||
|
||||
; FIXME: Remove waitcnt between reads
|
||||
; FIXME: Is there a cost to using the extload over not?
|
||||
; GCN-LABEL: {{^}}load_private_v2i16_split:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], s4 offen{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define <2 x i16> @load_private_v2i16_split(i16* %in) #0 {
|
||||
entry:
|
||||
%gep = getelementptr inbounds i16, i16* %in, i32 1
|
||||
%load0 = load volatile i16, i16* %in
|
||||
%load1 = load volatile i16, i16* %gep
|
||||
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
|
||||
%build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
|
||||
ret <2 x i16> %build1
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
591
test/CodeGen/AMDGPU/load-lo16.ll
Normal file
591
test/CodeGen/AMDGPU/load-lo16.ll
Normal file
@ -0,0 +1,591 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
|
||||
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: ds_read_u16_d16 v0, v0
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: ds_read_u16
|
||||
define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
|
||||
entry:
|
||||
%load = load i16, i16 addrspace(3)* %in
|
||||
%build = insertelement <2 x i16> undef, i16 %load, i32 0
|
||||
ret <2 x i16> %build
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: ds_read_u16_d16 v0, v0
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: ds_read_u16
|
||||
define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%load = load i16, i16 addrspace(3)* %in
|
||||
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
|
||||
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
|
||||
ret <2 x i16> %build1
|
||||
}
|
||||
|
||||
; Show that we get reasonable regalloc without physreg constraints.
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: ds_read_u16_d16 v0, v0
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: ds_read_u16
|
||||
define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%load = load i16, i16 addrspace(3)* %in
|
||||
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
|
||||
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2i16_zerolo:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: ds_read_u16_d16 v1, v0
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: ds_read_u16 v
|
||||
define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
|
||||
entry:
|
||||
%load = load i16, i16 addrspace(3)* %in
|
||||
%build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
|
||||
ret <2 x i16> %build
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2f16_fpimm:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GFX9-NEXT: ds_read_u16_d16 v1, v0
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: ds_read_u16 v
|
||||
define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
|
||||
entry:
|
||||
%load = load half, half addrspace(3)* %in
|
||||
%build = insertelement <2 x half> <half 0.0, half 2.0>, half %load, i32 0
|
||||
ret <2 x half> %build
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2f16_reghi_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: ds_read_u16_d16 v1, v0
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: ds_read_u16 v
|
||||
define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x half>
|
||||
%load = load half, half addrspace(3)* %in
|
||||
%build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
|
||||
store <2 x half> %build1, <2 x half> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg:
|
||||
|
||||
; GFX9: ds_read_u16 v
|
||||
; GFX9: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
|
||||
; GFX9: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
|
||||
; GFX9: global_store_dword
|
||||
|
||||
; VI: ds_read_u16 v
|
||||
define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
|
||||
entry:
|
||||
%load = load half, half addrspace(3)* %in
|
||||
%build0 = insertelement <2 x half> undef, half %reg, i32 1
|
||||
%build1 = insertelement <2 x half> %build0, half %load, i32 0
|
||||
store <2 x half> %build1, <2 x half> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: ds_read_u8_d16 v1, v0
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: ds_read_u8 v
|
||||
define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
%load = load i8, i8 addrspace(3)* %in
|
||||
%ext = zext i8 %load to i16
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9: ds_read_u8 v
|
||||
; GFX9: global_store_dword
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: ds_read_u8 v
|
||||
define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%load = load i8, i8 addrspace(3)* %in
|
||||
%ext = zext i8 %load to i16
|
||||
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
|
||||
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_sexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: ds_read_i8_d16 v1, v0
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: ds_read_i8 v
|
||||
define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
%load = load i8, i8 addrspace(3)* %in
|
||||
%ext = sext i8 %load to i16
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_sexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9: ds_read_i8 v
|
||||
; GFX9: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
|
||||
; GFX9: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
|
||||
|
||||
; VI: ds_read_i8 v
|
||||
define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%load = load i8, i8 addrspace(3)* %in
|
||||
%ext = sext i8 %load to i16
|
||||
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
|
||||
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
%gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
|
||||
%load = load i16, i16 addrspace(1)* %gep
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x half>
|
||||
%gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
|
||||
%load = load half, half addrspace(1)* %gep
|
||||
%build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
|
||||
store <2 x half> %build1, <2 x half> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
%gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
|
||||
%load = load i8, i8 addrspace(1)* %gep
|
||||
%ext = zext i8 %load to i16
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_sexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
%gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
|
||||
%load = load i8, i8 addrspace(1)* %gep
|
||||
%ext = sext i8 %load to i16
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_flat_lo_v2i16_reghi_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: flat_load_short_d16 v2, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: flat_load_ushort v{{[0-9]+}}
|
||||
; VI: v_or_b32_e32
|
||||
define void @load_flat_lo_v2i16_reghi_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
%load = load i16, i16 addrspace(4)* %in
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_flat_lo_v2f16_reghi_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: flat_load_short_d16 v2, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: flat_load_ushort v{{[0-9]+}}
|
||||
; VI: v_or_b32_e32
|
||||
define void @load_flat_lo_v2f16_reghi_vreg(half addrspace(4)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x half>
|
||||
%load = load half, half addrspace(4)* %in
|
||||
%build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
|
||||
store <2 x half> %build1, <2 x half> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: flat_load_ubyte_d16 v2, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: flat_load_ubyte v{{[0-9]+}}
|
||||
; VI: v_or_b32_e32
|
||||
define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
%load = load i8, i8 addrspace(4)* %in
|
||||
%ext = zext i8 %load to i16
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_sexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: flat_load_sbyte_d16 v2, v[0:1]
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: flat_load_sbyte v{{[0-9]+}}
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
%load = load i8, i8 addrspace(4)* %in
|
||||
%ext = sext i8 %load to i16
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:4094{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
|
||||
define void @load_private_lo_v2i16_reglo_vreg(i16* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
%gep = getelementptr inbounds i16, i16* %in, i64 2047
|
||||
%load = load i16, i16* %gep
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen offset:4094{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9: v_and_b32
|
||||
; GFX9: v_lshl_or_b32
|
||||
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
|
||||
define void @load_private_lo_v2i16_reghi_vreg(i16* %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%gep = getelementptr inbounds i16, i16* %in, i64 2047
|
||||
%load = load i16, i16* %gep
|
||||
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
|
||||
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: half-precision variant. A half loaded through %in (advanced by
; 2047 elements) is inserted into element 0 of the <2 x half> obtained by
; bitcasting the i32 %reg, and the vector is stored.
; The GFX9 checks pin the sequence to buffer_load_short_d16 into v1, a wait,
; global_store_dword of v1, a wait, then the return. The VI check only
; requires a plain buffer_load_ushort.
; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:4094{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
|
||||
define void @load_private_lo_v2f16_reglo_vreg(half* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x half>
|
||||
; 2047 half elements is a byte offset of 4094, the immediate offset checked above.
%gep = getelementptr inbounds half, half* %in, i64 2047
|
||||
%load = load half, half* %gep
|
||||
; Only element 0 is replaced; element 1 still comes from %reg.
%build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
|
||||
store <2 x half> %build1, <2 x half> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: a volatile i16 load from the constant address 4094 (formed
; with inttoptr; the %in argument is not used) is inserted into element 0 of
; the <2 x i16> obtained by bitcasting the i32 %reg, and the vector is stored.
; The checks expect the load to use "off" in place of an address register,
; with the whole address carried by the immediate offset 4094. On GFX9 the
; load must be buffer_load_short_d16 into v1.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
|
||||
define void @load_private_lo_v2i16_reglo_vreg_nooff(i16* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
; Constant address 4094 matches the immediate offset checked above.
%load = load volatile i16, i16* inttoptr (i32 4094 to i16*)
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: a volatile i16 load from the constant address 4094 (formed
; with inttoptr; the %in argument is not used) is inserted into element 0 of
; the <2 x i16> obtained by bitcasting the i32 %reg, and the vector is stored.
; On GFX9 the load must be buffer_load_short_d16 into v1 using "off" and the
; immediate offset 4094.
; NOTE(review): despite "reghi" in the name, the signature, body and checks
; are the same as load_private_lo_v2i16_reglo_vreg_nooff (an i32 %reg that is
; bitcast), not the separate-i16-in-element-1 shape used by
; load_private_lo_v2i16_reghi_vreg. Confirm whether a distinct reghi input
; was intended here.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg_nooff:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
|
||||
define void @load_private_lo_v2i16_reghi_vreg_nooff(i16* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
; Constant address 4094 matches the immediate offset checked above.
%load = load volatile i16, i16* inttoptr (i32 4094 to i16*)
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: half-precision variant of the constant-address case. A
; volatile half load from address 4094 (formed with inttoptr; %in is not
; used) is inserted into element 0 of the <2 x half> obtained by bitcasting
; the i32 %reg, and the vector is stored.
; On GFX9 the load must be buffer_load_short_d16 into v1 using "off" and the
; immediate offset 4094; the VI check requires buffer_load_ushort.
; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
|
||||
define void @load_private_lo_v2f16_reglo_vreg_nooff(half* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x half>
|
||||
; Constant address 4094 matches the immediate offset checked above.
%load = load volatile half, half* inttoptr (i32 4094 to half*)
|
||||
%build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
|
||||
store <2 x half> %build1, <2 x half> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: an i8 loaded through %in (advanced by 2047 bytes) is
; zero-extended to i16 and inserted into element 0 of the <2 x i16> obtained
; by bitcasting the i32 %reg; the vector is then stored.
; On GFX9 the load plus zero-extension must be a single buffer_load_ubyte_d16
; into v1; the VI check requires buffer_load_ubyte.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_ubyte_d16 v1, v0, s[0:3], s4 offen offset:2047{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_ubyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}}
|
||||
define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
; Element type is i8, so index 2047 is also the byte offset checked above.
%gep = getelementptr inbounds i8, i8* %in, i64 2047
|
||||
%load = load i8, i8* %gep
|
||||
%ext = zext i8 %load to i16
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: an i8 loaded through %in (advanced by 2047 bytes) is
; sign-extended to i16 and inserted into element 0 of the <2 x i16> obtained
; by bitcasting the i32 %reg; the vector is then stored.
; On GFX9 the load plus sign-extension must be a single buffer_load_sbyte_d16
; into v1; the VI check requires buffer_load_sbyte.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_sbyte_d16 v1, v0, s[0:3], s4 offen offset:2047{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_sbyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}}
|
||||
define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
; Element type is i8, so index 2047 is also the byte offset checked above.
%gep = getelementptr inbounds i8, i8* %in, i64 2047
|
||||
%load = load i8, i8* %gep
|
||||
%ext = sext i8 %load to i16
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: a volatile i8 load from the constant address 4094 (formed
; with inttoptr; %in is not used) is zero-extended to i16 and inserted into
; element 0 of the <2 x i16> obtained by bitcasting the i32 %reg; the vector
; is then stored.
; On GFX9 this must be a single buffer_load_ubyte_d16 into v1 using "off" and
; the immediate offset 4094; the VI check requires buffer_load_ubyte into v0.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
|
||||
define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
; Constant address 4094 matches the immediate offset checked above.
%load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
|
||||
%ext = zext i8 %load to i16
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: a volatile i8 load from the constant address 4094 (formed
; with inttoptr; %in is not used) is sign-extended to i16 and inserted into
; element 0 of the <2 x i16> obtained by bitcasting the i32 %reg; the vector
; is then stored.
; On GFX9 this must be a single buffer_load_sbyte_d16 into v1 using "off" and
; the immediate offset 4094; the VI check requires buffer_load_sbyte into v0.
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s4 offset:4094{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}}
|
||||
define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
; Constant address 4094 matches the immediate offset checked above.
%load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
|
||||
%ext = sext i8 %load to i16
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: a volatile i8 load from the constant address 4094 (formed
; with inttoptr; %in is not used) is zero-extended to i16, bitcast to half,
; and inserted into element 0 of the <2 x half> obtained by bitcasting the
; i32 %reg; the vector is then stored.
; On GFX9 this must still be a single buffer_load_ubyte_d16 into v1 even
; though the vector element type is half; the VI check requires
; buffer_load_ubyte into v0.
; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}}
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
|
||||
define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x half>
|
||||
; Constant address 4094 matches the immediate offset checked above.
%load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
|
||||
%ext = zext i8 %load to i16
|
||||
; Reinterpret the extended integer bits as a half so it can enter the half vector.
%bc.ext = bitcast i16 %ext to half
|
||||
%build1 = insertelement <2 x half> %reg.bc, half %bc.ext, i32 0
|
||||
store <2 x half> %build1, <2 x half> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: an i16 loaded through an addrspace(2) pointer (moved back by
; 2047 elements) is inserted into element 0 of the <2 x i16> obtained by
; bitcasting the i32 %reg, and the vector is stored.
; On GFX9 the load must be global_load_short_d16 into v2 with the negative
; immediate offset -4094; the VI check requires flat_load_ushort.
; GCN-LABEL: {{^}}load_constant_lo_v2i16_reglo_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: flat_load_ushort
|
||||
define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(2)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||
; -2047 i16 elements is a byte offset of -4094, the immediate offset checked above.
%gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 -2047
|
||||
%load = load i16, i16 addrspace(2)* %gep
|
||||
%build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
|
||||
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Codegen test: half-precision variant of the addrspace(2) case. A half loaded
; through an addrspace(2) pointer (moved back by 2047 elements) is inserted
; into element 0 of the <2 x half> obtained by bitcasting the i32 %reg, and
; the vector is stored.
; On GFX9 the load must be global_load_short_d16 into v2 with the negative
; immediate offset -4094; the VI check requires flat_load_ushort.
; The label below is anchored to the start of the line and ends with the
; colon so it only matches the function's own assembly label, like the other
; labels in this file.
; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: global_store_dword
|
||||
; GFX9-NEXT: s_waitcnt
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: flat_load_ushort
|
||||
define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(2)* %in, i32 %reg) #0 {
|
||||
entry:
|
||||
%reg.bc = bitcast i32 %reg to <2 x half>
|
||||
; -2047 half elements is a byte offset of -4094, the immediate offset checked above.
%gep = getelementptr inbounds half, half addrspace(2)* %in, i64 -2047
|
||||
%load = load half, half addrspace(2)* %gep
|
||||
%build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
|
||||
store <2 x half> %build1, <2 x half> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; Attribute group #0, attached to the test functions above via the trailing
; "#0" on each define line; it marks them nounwind.
attributes #0 = { nounwind }
|
Loading…
Reference in New Issue
Block a user