mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-13 16:03:58 +00:00
R600: Add support for v4i32 and v2i32 local stores
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189222 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
da25cd3e6d
commit
7a0282daeb
@ -67,6 +67,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
|
||||
setOperationAction(ISD::STORE, MVT::f64, Promote);
|
||||
AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
|
||||
|
||||
// Custom lowering of vector stores is required for local address space
|
||||
// stores.
|
||||
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
|
||||
// XXX: Native v2i32 local address space stores are possible, but not
|
||||
// currently implemented.
|
||||
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
|
||||
|
||||
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
|
||||
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
|
||||
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
|
||||
@ -221,7 +228,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
|
||||
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
|
||||
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
|
||||
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
|
||||
case ISD::STORE: return LowerVectorStore(Op, DAG);
|
||||
case ISD::STORE: return LowerSTORE(Op, DAG);
|
||||
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
|
||||
}
|
||||
return Op;
|
||||
@ -417,7 +424,98 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
|
||||
return Op;
|
||||
}
|
||||
|
||||
/// \brief Lower a small vector store by packing the vector elements into one
/// integer of the same total bitwidth and storing that integer instead.
///
/// \param Op  The store to lower; must be a StoreSDNode.
/// \param DAG The DAG in which the replacement nodes are created.
/// \returns The packed scalar store, or a null SDValue when the memory type is
///          not a vector or is wider than 32 bits (the caller then falls back
///          to another lowering).
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
                                               SelectionDAG &DAG) const {
  // The node is dereferenced unconditionally below, so use cast<> (which
  // asserts on a type mismatch) rather than dyn_cast<> (which would hand back
  // a null pointer that is never checked).
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemVT = Store->getMemoryVT();
  unsigned MemBits = MemVT.getSizeInBits();

  // Byte stores are really expensive, so if possible, try to pack a
  // 32-bit vector truncating store into an i32 store.
  // XXX: We could also optimize other vector bitwidths.
  if (!MemVT.isVector() || MemBits > 32) {
    return SDValue();
  }

  SDLoc DL(Op);
  const SDValue &Value = Store->getValue();
  EVT VT = Value.getValueType();
  // Loop-invariant: the register type of each extracted element.
  EVT ElemVT = VT.getVectorElementType();
  const SDValue &Ptr = Store->getBasePtr();
  EVT MemEltVT = MemVT.getVectorElementType();
  unsigned MemEltBits = MemEltVT.getSizeInBits();
  unsigned MemNumElements = MemVT.getVectorNumElements();
  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemBits);

  // Mask that keeps only the bits of one in-memory element.
  SDValue Mask;
  switch(MemEltBits) {
  case 8:
    Mask = DAG.getConstant(0xFF, PackedVT);
    break;
  case 16:
    Mask = DAG.getConstant(0xFFFF, PackedVT);
    break;
  default:
    llvm_unreachable("Cannot lower this vector store");
  }

  // PackedValue = OR over i of ((Elt[i] & Mask) << (MemEltBits * i)).
  SDValue PackedValue;
  for (unsigned i = 0; i < MemNumElements; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
                              DAG.getConstant(i, MVT::i32));
    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
    if (i == 0) {
      PackedValue = Elt;
    } else {
      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
    }
  }

  // Reuse the original pointer info as-is so that any offset recorded in the
  // memory operand is preserved on the replacement store.
  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
                      Store->getPointerInfo(),
                      Store->isVolatile(), Store->isNonTemporal(),
                      Store->getAlignment());
}
|
||||
|
||||
/// \brief Split a vector store into one scalar store per vector element.
///
/// Each element is extracted from the stored value and written at
/// BasePtr + i * (store size of the in-memory element type). When the memory
/// element type is narrower than the register element type, the per-element
/// store is a truncating store to the memory element type.
///
/// \param Op  The store to split; must be a StoreSDNode with a vector memory
///            type.
/// \param DAG The DAG in which the replacement nodes are created.
/// \returns A TokenFactor joining the chains of all element stores.
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
  EVT PtrVT = Store->getBasePtr().getValueType();
  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
  // Bytes occupied in memory by one element; getStoreSize() rounds up to a
  // whole byte where a plain bits / 8 division would truncate.
  unsigned MemEltSize = MemEltVT.getStoreSize();
  unsigned BaseAlign = Store->getAlignment();
  SDLoc SL(Op);

  SmallVector<SDValue, 8> Chains;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    unsigned Offset = i * MemEltSize;
    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                              Store->getValue(), DAG.getConstant(i, MVT::i32));
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
                              Store->getBasePtr(),
                              DAG.getConstant(Offset, PtrVT));
    // - getTruncStore stores only MemEltVT bits per element; it degenerates to
    //   a plain store when EltVT == MemEltVT.
    // - The pointer info is advanced by the element's byte offset so each
    //   store describes the location it really writes.
    // - Only element 0 inherits the full vector alignment; later elements are
    //   limited by their offset from the base.
    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
                         Store->getPointerInfo().getWithOffset(Offset),
                         MemEltVT,
                         Store->isVolatile(), Store->isNonTemporal(),
                         MinAlign(BaseAlign, Offset)));
  }
  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0],
                     Chains.size());
}
|
||||
|
||||
/// \brief Common custom lowering for stores.
///
/// First tries to pack the store into a single integer store via
/// MergeVectorStore. If that does not apply and the store writes a vector
/// value to the local address space, the store is split into per-element
/// stores. Otherwise a null SDValue is returned and the caller decides what
/// to do with the node.
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDValue Packed = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
  if (Packed.getNode())
    return Packed;

  StoreSDNode *ST = cast<StoreSDNode>(Op);

  // Anything that is not a vector store to local memory is left alone.
  if (ST->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    return SDValue();
  if (!ST->getValue().getValueType().isVector())
    return SDValue();

  return SplitVectorStore(Op, DAG);
}
|
||||
|
||||
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
|
||||
SelectionDAG &DAG) const {
|
||||
@ -524,58 +622,6 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
|
||||
return DAG.getMergeValues(Ops, 2, DL);
|
||||
}
|
||||
|
||||
/// \brief Lower vector stores by merging the vector elements into an integer
/// of the same bitwidth.
///
/// \param Op  The store to lower; must be a StoreSDNode.
/// \param DAG The DAG in which the replacement nodes are created.
/// \returns The packed scalar store, or a null SDValue when the memory type is
///          not a vector or is wider than 32 bits.
SDValue AMDGPUTargetLowering::LowerVectorStore(const SDValue &Op,
                                               SelectionDAG &DAG) const {
  // The node is dereferenced unconditionally below, so use cast<> (which
  // asserts on a type mismatch) rather than dyn_cast<> (which would hand back
  // a null pointer that is never checked).
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemVT = Store->getMemoryVT();
  unsigned MemBits = MemVT.getSizeInBits();

  // Byte stores are really expensive, so if possible, try to pack a
  // 32-bit vector truncating store into an i32 store.
  // XXX: We could also optimize other vector bitwidths.
  if (!MemVT.isVector() || MemBits > 32) {
    return SDValue();
  }

  SDLoc DL(Op);
  const SDValue &Value = Store->getValue();
  EVT VT = Value.getValueType();
  // Loop-invariant: the register type of each extracted element.
  EVT ElemVT = VT.getVectorElementType();
  const SDValue &Ptr = Store->getBasePtr();
  EVT MemEltVT = MemVT.getVectorElementType();
  unsigned MemEltBits = MemEltVT.getSizeInBits();
  unsigned MemNumElements = MemVT.getVectorNumElements();
  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemBits);

  // Mask that keeps only the bits of one in-memory element.
  SDValue Mask;
  switch(MemEltBits) {
  case 8:
    Mask = DAG.getConstant(0xFF, PackedVT);
    break;
  case 16:
    Mask = DAG.getConstant(0xFFFF, PackedVT);
    break;
  default:
    llvm_unreachable("Cannot lower this vector store");
  }

  // PackedValue = OR over i of ((Elt[i] & Mask) << (MemEltBits * i)).
  SDValue PackedValue;
  for (unsigned i = 0; i < MemNumElements; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
                              DAG.getConstant(i, MVT::i32));
    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
    if (i == 0) {
      PackedValue = Elt;
    } else {
      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
    }
  }

  // Reuse the original pointer info as-is so that any offset recorded in the
  // memory operand is preserved on the replacement store.
  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
                      Store->getPointerInfo(),
                      Store->isVolatile(), Store->isNonTemporal(),
                      Store->getAlignment());
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Helper functions
|
||||
|
@ -31,6 +31,12 @@ private:
|
||||
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
|
||||
/// \brief Lower vector stores by merging the vector elements into an integer
|
||||
/// of the same bitwidth.
|
||||
SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
|
||||
/// \brief Split a vector store into multiple scalar stores.
|
||||
/// \returns The resulting chain.
|
||||
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
protected:
|
||||
@ -44,17 +50,13 @@ protected:
|
||||
unsigned Reg, EVT VT) const;
|
||||
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
|
||||
SelectionDAG &DAG) const;
|
||||
|
||||
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
|
||||
bool isHWTrueValue(SDValue Op) const;
|
||||
bool isHWFalseValue(SDValue Op) const;
|
||||
|
||||
void AnalyzeFormalArguments(CCState &State,
|
||||
const SmallVectorImpl<ISD::InputArg> &Ins) const;
|
||||
|
||||
/// \brief Lower vector stores by merging the vector elements into an integer
|
||||
/// of the same bitwidth.
|
||||
SDValue LowerVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
|
||||
|
||||
public:
|
||||
AMDGPUTargetLowering(TargetMachine &TM);
|
||||
|
||||
|
@ -1002,7 +1002,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDValue Value = Op.getOperand(1);
|
||||
SDValue Ptr = Op.getOperand(2);
|
||||
|
||||
SDValue Result = AMDGPUTargetLowering::LowerVectorStore(Op, DAG);
|
||||
SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
|
||||
if (Result.getNode()) {
|
||||
return Result;
|
||||
}
|
||||
|
@ -168,6 +168,58 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
;===------------------------------------------------------------------------===;
|
||||
; Local Address Space
|
||||
;===------------------------------------------------------------------------===;
|
||||
|
||||
; EG-CHECK: @store_local_v2i16
|
||||
; EG-CHECK: LDS_WRITE
|
||||
; CM-CHECK: @store_local_v2i16
|
||||
; CM-CHECK: LDS_WRITE
|
||||
; SI-CHECK: @store_local_v2i16
|
||||
; SI-CHECK: DS_WRITE_B32
|
||||
; Stores a <2 x i16> value through an addrspace(3) pointer. The CHECK lines
; above expect a single write instruction on each target.
define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
|
||||
entry:
|
||||
store <2 x i16> %in, <2 x i16> addrspace(3)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; EG-CHECK: @store_local_v2i32
|
||||
; EG-CHECK: LDS_WRITE
|
||||
; EG-CHECK: LDS_WRITE
|
||||
; CM-CHECK: @store_local_v2i32
|
||||
; CM-CHECK: LDS_WRITE
|
||||
; CM-CHECK: LDS_WRITE
|
||||
; SI-CHECK: @store_local_v2i32
|
||||
; SI-CHECK: DS_WRITE_B32
|
||||
; SI-CHECK: DS_WRITE_B32
|
||||
; Stores a <2 x i32> value through an addrspace(3) pointer. The CHECK lines
; above expect two write instructions on each target.
define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
|
||||
entry:
|
||||
store <2 x i32> %in, <2 x i32> addrspace(3)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; EG-CHECK: @store_local_v4i32
|
||||
; EG-CHECK: LDS_WRITE
|
||||
; EG-CHECK: LDS_WRITE
|
||||
; EG-CHECK: LDS_WRITE
|
||||
; EG-CHECK: LDS_WRITE
|
||||
; CM-CHECK: @store_local_v4i32
|
||||
; CM-CHECK: LDS_WRITE
|
||||
; CM-CHECK: LDS_WRITE
|
||||
; CM-CHECK: LDS_WRITE
|
||||
; CM-CHECK: LDS_WRITE
|
||||
; SI-CHECK: @store_local_v4i32
|
||||
; SI-CHECK: DS_WRITE_B32
|
||||
; SI-CHECK: DS_WRITE_B32
|
||||
; SI-CHECK: DS_WRITE_B32
|
||||
; SI-CHECK: DS_WRITE_B32
|
||||
; Stores a <4 x i32> value through an addrspace(3) pointer. The CHECK lines
; above expect four write instructions on each target.
define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
|
||||
entry:
|
||||
store <4 x i32> %in, <4 x i32> addrspace(3)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; The stores in this function are combined by the optimizer to create a
|
||||
; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer
|
||||
; should not try to split the 64-bit store back into 2 32-bit stores.
|
||||
|
Loading…
x
Reference in New Issue
Block a user