[AMDGPU] Add buffer/load 8/16 bit overloaded intrinsics
Summary:
Add buffer store/load 8/16 overloaded intrinsics for buffer, raw_buffer and struct_buffer

Change-Id: I166a29f071b2ff4e4683fb0392564b1f223ac61d

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D59265

llvm-svn: 356465
parent 52358ec746
commit dd846d3872
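The new overloads are reached by calling the existing overloaded buffer
intrinsics at i8/i16 instead of the float types. A minimal IR sketch (the
function name is illustrative; the declarations are the ones added to the
tests below):

    define amdgpu_ps float @load_store_byte(<4 x i32> inreg %rsrc) {
      %b = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
      call void @llvm.amdgcn.raw.buffer.store.i8(i8 %b, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
      %e = zext i8 %b to i32
      %f = uitofp i32 %e to float
      ret float %f
    }

    declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32)
    declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32)

This selects to buffer_load_ubyte / buffer_store_byte; the updated tests below
cover the buffer, raw.buffer and struct.buffer variants.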
include/llvm/IR/IntrinsicsAMDGPU.td
@@ -840,7 +840,7 @@ let TargetPrefix = "amdgcn" in {
 defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {

 class AMDGPUBufferLoad : Intrinsic <
-  [llvm_anyfloat_ty],
+  [llvm_any_ty],
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
@@ -861,7 +861,7 @@ def int_amdgcn_s_buffer_load : Intrinsic <

 class AMDGPUBufferStore : Intrinsic <
   [],
-  [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
+  [llvm_any_ty,       // vdata(VGPR)
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4208,10 +4208,16 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
   NODE_NAME_CASE(BUFFER_LOAD)
+  NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
+  NODE_NAME_CASE(BUFFER_LOAD_USHORT)
+  NODE_NAME_CASE(BUFFER_LOAD_BYTE)
+  NODE_NAME_CASE(BUFFER_LOAD_SHORT)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(SBUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_STORE)
+  NODE_NAME_CASE(BUFFER_STORE_BYTE)
+  NODE_NAME_CASE(BUFFER_STORE_SHORT)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
@@ -4376,6 +4382,14 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
     }
     break;
   }
+  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
+    Known.Zero.setHighBits(24);
+    break;
+  }
+  case AMDGPUISD::BUFFER_LOAD_USHORT: {
+    Known.Zero.setHighBits(16);
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     switch (IID) {
@@ -4421,6 +4435,14 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
   case AMDGPUISD::CARRY:
   case AMDGPUISD::BORROW:
     return 31;
+  case AMDGPUISD::BUFFER_LOAD_BYTE:
+    return 25;
+  case AMDGPUISD::BUFFER_LOAD_SHORT:
+    return 17;
+  case AMDGPUISD::BUFFER_LOAD_UBYTE:
+    return 24;
+  case AMDGPUISD::BUFFER_LOAD_USHORT:
+    return 16;
   case AMDGPUISD::FP_TO_FP16:
   case AMDGPUISD::FP16_ZEXT:
     return 16;
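These computeKnownBitsForTargetNode / ComputeNumSignBitsForTargetNode hooks
are what let the DAG fold away redundant masks and extensions around the
widened loads. A minimal IR sketch of the effect (illustrative function name;
the declaration matches the tests below) - the `and` is dead because the high
24 bits of the widened ubyte load are known zero:

    define amdgpu_ps float @known_zero_sketch(<4 x i32> inreg %rsrc) {
      %b = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
      %z = zext i8 %b to i32   ; widened result already has 24 known-zero high bits
      %m = and i32 %z, 255     ; folded away via computeKnownBitsForTargetNode
      %f = bitcast i32 %m to float
      ret float %f
    }

    declare i8 @llvm.amdgcn.buffer.load.i8(<4 x i32>, i32, i32, i1, i1)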
lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -491,10 +491,16 @@ enum NodeType : unsigned {
   ATOMIC_LOAD_FMIN,
   ATOMIC_LOAD_FMAX,
   BUFFER_LOAD,
+  BUFFER_LOAD_UBYTE,
+  BUFFER_LOAD_USHORT,
+  BUFFER_LOAD_BYTE,
+  BUFFER_LOAD_SHORT,
   BUFFER_LOAD_FORMAT,
   BUFFER_LOAD_FORMAT_D16,
   SBUFFER_LOAD,
   BUFFER_STORE,
+  BUFFER_STORE_BYTE,
+  BUFFER_STORE_SHORT,
   BUFFER_STORE_FORMAT,
   BUFFER_STORE_FORMAT_D16,
   BUFFER_ATOMIC_SWAP,
lib/Target/AMDGPU/BUFInstructions.td
@@ -1132,6 +1132,10 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;

 multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode> {
@@ -1196,6 +1200,8 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;

 //===----------------------------------------------------------------------===//
 // buffer_atomic patterns
lib/Target/AMDGPU/SIISelLowering.cpp
@@ -216,11 +216,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);

   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -677,6 +681,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FCANONICALIZE);
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

@@ -5581,6 +5586,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     if (LoadVT.getScalarType() == MVT::f16)
       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                  M, DAG, Ops);
+
+    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+    if (LoadVT.getScalarType() == MVT::i8 ||
+        LoadVT.getScalarType() == MVT::i16)
+      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
@@ -5609,6 +5620,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     if (LoadVT.getScalarType() == MVT::f16)
       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                  M, DAG, Ops);
+
+    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+    if (LoadVT.getScalarType() == MVT::i8 ||
+        LoadVT.getScalarType() == MVT::i16)
+      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
@@ -5637,6 +5654,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     if (LoadVT.getScalarType() == MVT::f16)
       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                  M, DAG, Ops);
+
+    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+    if (LoadVT.getScalarType() == MVT::i8 ||
+        LoadVT.getScalarType() == MVT::i16)
+      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
@@ -6207,6 +6230,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+
+    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+    EVT VDataType = VData.getValueType().getScalarType();
+    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
@@ -6233,6 +6262,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+
+    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+    EVT VDataType = VData.getValueType().getScalarType();
+    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
@@ -6259,6 +6294,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+
+    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+    EVT VDataType = VData.getValueType().getScalarType();
+    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
@@ -6361,6 +6402,38 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
   Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
 }

+// Handle 8 bit and 16 bit buffer loads
+SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
+                                                     EVT LoadVT, SDLoc DL,
+                                                     ArrayRef<SDValue> Ops,
+                                                     MemSDNode *M) const {
+  EVT IntVT = LoadVT.changeTypeToInteger();
+  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
+         AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
+
+  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
+  SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
+                                               Ops, IntVT,
+                                               M->getMemOperand());
+  SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
+                                        LoadVT.getScalarType(), BufferLoad);
+  return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+}
+
+// Handle 8 bit and 16 bit buffer stores
+SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
+                                                      EVT VDataType, SDLoc DL,
+                                                      SDValue Ops[],
+                                                      MemSDNode *M) const {
+  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
+  Ops[1] = BufferStoreExt;
+  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
+                 AMDGPUISD::BUFFER_STORE_SHORT;
+  ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
+  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
+                                 M->getMemOperand());
+}
+
 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                  ISD::LoadExtType ExtType, SDValue Op,
                                  const SDLoc &SL, EVT VT) {
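Both helpers keep the MUBUF instructions' natural 32-bit register width: loads
are emitted as an i32-valued BUFFER_LOAD_UBYTE/USHORT node and truncated back
to i8/i16, and stores any-extend the vdata operand to i32 before selection. At
the IR level the store side behaves like this sketch (illustrative name; the
declaration is the one added to the tests) - the trunc is absorbed into
buffer_store_byte rather than costing an instruction:

    define amdgpu_ps void @store_byte_sketch(<4 x i32> inreg %rsrc, i32 %v) {
      %b = trunc i32 %v to i8      ; any-extended back to 32 bits in the DAG
      call void @llvm.amdgcn.raw.buffer.store.i8(i8 %b, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
      ret void
    }

    declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32)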
@@ -7692,6 +7765,43 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
   return SDValue();
 }

+SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
+                                                        DAGCombinerInfo &DCI)
+                                                        const {
+  SDValue Src = N->getOperand(0);
+  auto *VTSign = cast<VTSDNode>(N->getOperand(1));
+
+  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
+      VTSign->getVT() == MVT::i8) ||
+      (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
+      VTSign->getVT() == MVT::i16)) &&
+      Src.hasOneUse()) {
+    auto *M = cast<MemSDNode>(Src);
+    SDValue Ops[] = {
+      Src.getOperand(0), // Chain
+      Src.getOperand(1), // rsrc
+      Src.getOperand(2), // vindex
+      Src.getOperand(3), // voffset
+      Src.getOperand(4), // soffset
+      Src.getOperand(5), // offset
+      Src.getOperand(6),
+      Src.getOperand(7)
+    };
+    // replace with BUFFER_LOAD_BYTE/SHORT
+    SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
+                                         Src.getOperand(0).getValueType());
+    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
+                   AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
+    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
+                                                            ResList,
+                                                            Ops, M->getMemoryVT(),
+                                                            M->getMemOperand());
+    return DCI.DAG.getMergeValues({BufferLoadSignExt,
+                                  BufferLoadSignExt.getValue(1)}, SDLoc(N));
+  }
+  return SDValue();
+}
+
 SDValue SITargetLowering::performClassCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
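A sext of the i8/i16 intrinsic result reaches the DAG as sign_extend_inreg
wrapped around the zero-extending load node, which is exactly what this
combine matches; the pair is rewritten into a single BUFFER_LOAD_BYTE/SHORT.
A minimal IR sketch (illustrative name; declaration matches the tests):

    define amdgpu_ps float @sext_fold_sketch(<4 x i32> inreg %rsrc) {
      %b = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
      %s = sext i8 %b to i32       ; sign_extend_inreg(BUFFER_LOAD_UBYTE) in the DAG
      %f = sitofp i32 %s to float  ; selects to a single buffer_load_sbyte
      ret float %f
    }

    declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32)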
@@ -8940,7 +9050,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
     return SDValue();
-
   switch (N->getOpcode()) {
   default:
     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -9007,6 +9116,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performXorCombine(N, DCI);
   case ISD::ZERO_EXTEND:
     return performZeroExtendCombine(N, DCI);
+  case ISD::SIGN_EXTEND_INREG:
+    return performSignExtendInRegCombine(N, DCI);
   case AMDGPUISD::FP_CLASS:
     return performClassCombine(N, DCI);
   case ISD::FCANONICALIZE:
lib/Target/AMDGPU/SIISelLowering.h
@@ -140,6 +140,7 @@ private:
   SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performSignExtendInRegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
                                  const APFloat &C) const;
@@ -192,6 +193,15 @@ private:
   void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
                         SDValue *Offsets, unsigned Align = 4) const;

+  // Handle 8 bit and 16 bit buffer loads
+  SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
+                                     ArrayRef<SDValue> Ops, MemSDNode *M) const;
+
+  // Handle 8 bit and 16 bit buffer stores
+  SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType,
+                                      SDLoc DL, SDValue Ops[],
+                                      MemSDNode *M) const;
+
 public:
   SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);

lib/Target/AMDGPU/SIInstrInfo.td
@@ -127,6 +127,14 @@ def SDTBufferLoad : SDTypeProfile<1, 7,

 def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ubyte : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ushort : SDNode <"AMDGPUISD::BUFFER_LOAD_USHORT", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_byte : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_short : SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
@@ -145,6 +153,12 @@ def SDTBufferStore : SDTypeProfile<0, 8,

 def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
                              [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_byte : SDNode <"AMDGPUISD::BUFFER_STORE_BYTE",
+                                  SDTBufferStore,
+                                  [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_short : SDNode <"AMDGPUISD::BUFFER_STORE_SHORT",
+                                   SDTBufferStore,
+                                   [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
 def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT",
                                     SDTBufferStore,
                                     [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -257,9 +257,194 @@ main_body:
   ret void
 }

+;CHECK-LABEL: {{^}}buffer_load_ubyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %val = uitofp i8 %tmp to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ushort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:16
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+  %tmp2 = zext i16 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sbyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = sext i8 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sshort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:16
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+  %tmp2 = sext i16 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ubyte_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = zext i8 %tmp to i32
+  %val = bitcast i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ushort_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = zext i16 %tmp to i32
+  %val = bitcast i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sbyte_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = sext i8 %tmp to i32
+  %val = bitcast i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sshort_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = sext i16 %tmp to i32
+  %val = bitcast i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ubyte_mul_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = zext i8 %tmp to i32
+  %tmp3 = mul i32 %tmp2, 255
+  %val = bitcast i32 %tmp3 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ushort_mul_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = zext i16 %tmp to i32
+  %tmp3 = mul i32 %tmp2, 255
+  %val = bitcast i32 %tmp3 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sbyte_mul_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = sext i8 %tmp to i32
+  %tmp3 = mul i32 %tmp2, 255
+  %val = bitcast i32 %tmp3 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sshort_mul_bitcast:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = sext i16 %tmp to i32
+  %tmp3 = mul i32 %tmp2, 255
+  %val = bitcast i32 %tmp3 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_sbyte_type_check:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_bfe_i32 v{{[0-9]}}, v{{[0-9]}}, 0, 5
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @buffer_load_sbyte_type_check(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  %tmp2 = zext i8 %tmp to i32
+  %tmp3 = shl i32 %tmp2, 27
+  %tmp4 = ashr i32 %tmp3, 27
+  %val = bitcast i32 %tmp4 to float
+  ret float %val
+}
+
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+declare i8 @llvm.amdgcn.buffer.load.i8(<4 x i32>, i32, i32, i1, i1) #0
+declare i16 @llvm.amdgcn.buffer.load.i16(<4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0

 attributes #0 = { nounwind readonly }
test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -233,9 +233,35 @@ define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2
   ret void
 }

+;CHECK-LABEL: {{^}}buffer_store_byte:
+;CHECK-NOT: s_waitcnt
+;CHECK-NEXT: %bb.
+;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8
+define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i8
+  call void @llvm.amdgcn.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_short:
+;CHECK-NOT: s_waitcnt
+;CHECK-NEXT: %bb.
+;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16
+define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i16
+  call void @llvm.amdgcn.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.i8(i8, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.i16(i16, <4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1

 attributes #0 = { nounwind }
test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -263,6 +263,62 @@ main_body:
   ret {<4 x float>, <2 x float>, float} %r2
 }

+;CHECK-LABEL: {{^}}raw_buffer_load_ubyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = zext i8 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_ushort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_ushort(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = zext i16 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_sbyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = sext i8 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_sshort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = sext i16 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
@@ -270,5 +326,7 @@ declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #0
 declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32) #0
 declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #0
+declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) #0

 attributes #0 = { nounwind readonly }
test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -189,6 +189,32 @@ main_body:
   ret void
 }

+;CHECK-LABEL: {{^}}raw_buffer_store_byte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i8
+  call void @llvm.amdgcn.raw.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_store_short:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i16
+  call void @llvm.amdgcn.raw.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
@@ -196,6 +222,8 @@ declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1
+declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32) #0

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -144,6 +144,62 @@ main_body:
   ret {<4 x float>, <2 x float>, float} %r2
 }

+;CHECK-LABEL: {{^}}struct_buffer_load_ubyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = zext i8 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_ushort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = zext i16 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_sbyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = sext i8 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_sshort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = sext i16 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0
@@ -151,5 +207,7 @@ declare i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32>, i32, i32, i32, i32) #
 declare <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) #0
 declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #0
+declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32) #0

 attributes #0 = { nounwind readonly }
test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -108,6 +108,32 @@ main_body:
   ret void
 }

+;CHECK-LABEL: {{^}}struct_buffer_store_byte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i8
+  call void @llvm.amdgcn.struct.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_store_short:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @struct_buffer_store_short(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i16
+  call void @llvm.amdgcn.struct.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
@@ -115,6 +141,8 @@ declare void @llvm.amdgcn.struct.buffer.store.i32(i32, <4 x i32>, i32, i32, i32,
 declare void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.struct.buffer.store.i16(i16, <4 x i32>, i32, i32, i32, i32) #0

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }