mirror of
https://github.com/RPCSX/llvm.git
synced 2025-02-03 19:15:30 +00:00
R600/SI: Remove explicit m0 operand from DS instructions
Instead add m0 as an implicit operand. This helps avoid spills of the m0 register in some cases. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237141 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
a89c1839c7
commit
6ecd744594
@ -78,6 +78,8 @@ private:
|
||||
bool isLocalLoad(const LoadSDNode *N) const;
|
||||
bool isRegionLoad(const LoadSDNode *N) const;
|
||||
|
||||
SDNode *glueCopyToM0(SDNode *N) const;
|
||||
|
||||
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
|
||||
bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
|
||||
bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
|
||||
@ -242,6 +244,32 @@ bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Attach a glued "copy -1 to m0" to the memory node \p N.
///
/// \p N is returned unchanged when the subtarget generation is older than
/// SOUTHERN_ISLANDS, or when checkType() reports that the node's memory
/// operand is not in AMDGPUAS::LOCAL_ADDRESS. Otherwise \p N is morphed in
/// place: it keeps its opcode, value types and existing operands, and gains
/// one trailing operand, the second result of the m0 copy.
///
/// \param N node to update; it is cast to MemSDNode once the generation
///          check has passed.
/// \returns \p N itself in every case.
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
  // Test the generation first so the MemSDNode cast below is only reached on
  // SOUTHERN_ISLANDS and newer.
  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return N;

  if (!checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(),
                 AMDGPUAS::LOCAL_ADDRESS))
    return N;

  const SITargetLowering &SILowering =
      *static_cast<const SITargetLowering *>(getTargetLowering());

  // Write max value to m0 before each load operation
  SDLoc DL(N);
  SDValue AllOnes = CurDAG->getTargetConstant(-1, DL, MVT::i32);
  SDValue CopyToM0 =
      SILowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), DL, AllOnes);

  // Rebuild the operand list: the original operands in order, followed by
  // result #1 of the copy.
  SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
  NewOps.push_back(CopyToM0.getValue(1));

  CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
  return N;
}
|
||||
|
||||
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
||||
unsigned int Opc = N->getOpcode();
|
||||
if (N->isMachineOpcode()) {
|
||||
@ -249,6 +277,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
||||
return nullptr; // Already selected.
|
||||
}
|
||||
|
||||
if (isa<AtomicSDNode>(N))
|
||||
N = glueCopyToM0(N);
|
||||
|
||||
switch (Opc) {
|
||||
default: break;
|
||||
// We are selecting i64 ADD here instead of custom lower it during
|
||||
@ -423,23 +454,29 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
||||
}
|
||||
|
||||
case ISD::LOAD: {
|
||||
LoadSDNode *LD = cast<LoadSDNode>(N);
|
||||
SDLoc SL(N);
|
||||
EVT VT = N->getValueType(0);
|
||||
|
||||
if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
|
||||
N = glueCopyToM0(N);
|
||||
break;
|
||||
}
|
||||
|
||||
// To simplify the TableGen patterns, we replace all i64 loads with
|
||||
// v2i32 loads. Alternatively, we could promote i64 loads to v2i32
|
||||
// during DAG legalization; however, some places (ExpandUnalignedLoad)
|
||||
// in the DAG legalizer assume that if i64 is legal, so doing this
|
||||
// promotion early can cause problems.
|
||||
EVT VT = N->getValueType(0);
|
||||
LoadSDNode *LD = cast<LoadSDNode>(N);
|
||||
if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
|
||||
break;
|
||||
|
||||
SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
|
||||
LD->getBasePtr(), LD->getMemOperand());
|
||||
SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
|
||||
LD->getBasePtr(), LD->getMemOperand());
|
||||
SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
|
||||
MVT::i64, NewLoad);
|
||||
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
|
||||
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
|
||||
SelectCode(NewLoad.getNode());
|
||||
SDNode *Load = glueCopyToM0(NewLoad.getNode());
|
||||
SelectCode(Load);
|
||||
N = BitCast.getNode();
|
||||
break;
|
||||
}
|
||||
@ -448,24 +485,26 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
||||
// Handle i64 stores here for the same reason mentioned above for loads.
|
||||
StoreSDNode *ST = cast<StoreSDNode>(N);
|
||||
SDValue Value = ST->getValue();
|
||||
if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
|
||||
break;
|
||||
if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
|
||||
|
||||
SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
|
||||
MVT::v2i32, Value);
|
||||
SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
|
||||
ST->getBasePtr(), ST->getMemOperand());
|
||||
SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
|
||||
MVT::v2i32, Value);
|
||||
SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
|
||||
ST->getBasePtr(), ST->getMemOperand());
|
||||
|
||||
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
|
||||
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
|
||||
|
||||
if (NewValue.getOpcode() == ISD::BITCAST) {
|
||||
Select(NewStore.getNode());
|
||||
return SelectCode(NewValue.getNode());
|
||||
if (NewValue.getOpcode() == ISD::BITCAST) {
|
||||
Select(NewStore.getNode());
|
||||
return SelectCode(NewValue.getNode());
|
||||
}
|
||||
|
||||
// getNode() may fold the bitcast if its input was another bitcast. If that
|
||||
// happens we should only select the new store.
|
||||
N = NewStore.getNode();
|
||||
}
|
||||
|
||||
// getNode() may fold the bitcast if its input was another bitcast. If that
|
||||
// happens we should only select the new store.
|
||||
N = NewStore.getNode();
|
||||
N = glueCopyToM0(N);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -183,12 +183,15 @@ def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
|
||||
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
|
||||
}]>;
|
||||
|
||||
def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
|
||||
class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
|
||||
(ld_node node:$ptr), [{
|
||||
LoadSDNode *L = cast<LoadSDNode>(N);
|
||||
return L->getExtensionType() == ISD::ZEXTLOAD ||
|
||||
L->getExtensionType() == ISD::EXTLOAD;
|
||||
}]>;
|
||||
|
||||
def az_extload : AZExtLoadBase <unindexedload>;
|
||||
|
||||
def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
|
||||
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
|
||||
}]>;
|
||||
@ -361,22 +364,26 @@ def mskor_global : PatFrag<(ops node:$val, node:$ptr),
|
||||
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
|
||||
}]>;
|
||||
|
||||
multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
|
||||
|
||||
def atomic_cmp_swap_32_local :
|
||||
PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
|
||||
(atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
|
||||
AtomicSDNode *AN = cast<AtomicSDNode>(N);
|
||||
return AN->getMemoryVT() == MVT::i32 &&
|
||||
AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
|
||||
}]>;
|
||||
def _32_local : PatFrag <
|
||||
(ops node:$ptr, node:$cmp, node:$swap),
|
||||
(cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
|
||||
AtomicSDNode *AN = cast<AtomicSDNode>(N);
|
||||
return AN->getMemoryVT() == MVT::i32 &&
|
||||
AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
|
||||
}]>;
|
||||
|
||||
def atomic_cmp_swap_64_local :
|
||||
PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
|
||||
(atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
|
||||
AtomicSDNode *AN = cast<AtomicSDNode>(N);
|
||||
return AN->getMemoryVT() == MVT::i64 &&
|
||||
AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
|
||||
}]>;
|
||||
def _64_local : PatFrag<
|
||||
(ops node:$ptr, node:$cmp, node:$swap),
|
||||
(cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
|
||||
AtomicSDNode *AN = cast<AtomicSDNode>(N);
|
||||
return AN->getMemoryVT() == MVT::i64 &&
|
||||
AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
|
||||
}]>;
|
||||
}
|
||||
|
||||
defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>;
|
||||
|
||||
def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
|
||||
return isFlatLoad(dyn_cast<LoadSDNode>(N));
|
||||
|
@ -604,7 +604,7 @@ class DS <dag outs, dag ins, string asm, list<dag> pattern> :
|
||||
let LGKM_CNT = 1;
|
||||
let DS = 1;
|
||||
let UseNamedOperandTable = 1;
|
||||
let DisableEncoding = "$m0";
|
||||
let Uses = [M0];
|
||||
|
||||
// Most instruction load and store data, so set this as the default.
|
||||
let mayLoad = 1;
|
||||
|
@ -124,6 +124,107 @@ def SIconstdata_ptr : SDNode<
|
||||
"AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
|
||||
>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
|
||||
// to be glued to the memory instructions.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// ISD::LOAD node whose property list includes SDNPInGlue.
def SIld_local : SDNode <
  "ISD::LOAD", SDTLoad,
  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;

// Any SIld_local for which isLocalLoad() returns true.
def si_ld_local : PatFrag <
  (ops node:$ptr),
  (SIld_local node:$ptr), [{
  return isLocalLoad(cast<LoadSDNode>(N));
}]>;

// si_ld_local restricted to unindexed, non-extending loads.
def si_load_local : PatFrag <
  (ops node:$ptr),
  (si_ld_local node:$ptr), [{
  const LoadSDNode *Ld = cast<LoadSDNode>(N);
  return Ld->getAddressingMode() == ISD::UNINDEXED &&
         Ld->getExtensionType() == ISD::NON_EXTLOAD;
}]>;

// si_load_local wrapped in Aligned8Bytes (defined elsewhere; presumably an
// 8-byte alignment predicate -- confirm against its definition).
def si_load_local_align8 : Aligned8Bytes <
  (ops node:$ptr),
  (si_load_local node:$ptr)
>;

// si_ld_local restricted to sign-extending loads.
def si_sextload_local : PatFrag <
  (ops node:$ptr),
  (si_ld_local node:$ptr), [{
  return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
}]>;

// si_ld_local passed through AZExtLoadBase.
def si_az_extload_local : AZExtLoadBase <si_ld_local>;
|
||||
|
||||
// For an extending-load fragment ld_node, defines <NAME>_i8 and <NAME>_i16:
// the same fragment further restricted to a memory type of i8 or i16.
multiclass SIExtLoadLocal <PatFrag ld_node> {

  def _i8 : PatFrag <
    (ops node:$ptr),
    (ld_node node:$ptr), [{
    const LoadSDNode *Ld = cast<LoadSDNode>(N);
    return Ld->getMemoryVT() == MVT::i8;
  }]>;

  def _i16 : PatFrag <
    (ops node:$ptr),
    (ld_node node:$ptr), [{
    const LoadSDNode *Ld = cast<LoadSDNode>(N);
    return Ld->getMemoryVT() == MVT::i16;
  }]>;
}

// Instantiates si_sextload_local_{i8,i16} and si_az_extload_local_{i8,i16}.
defm si_sextload_local   : SIExtLoadLocal <si_sextload_local>;
defm si_az_extload_local : SIExtLoadLocal <si_az_extload_local>;
|
||||
|
||||
// ISD::STORE node whose property list includes SDNPInGlue.
def SIst_local : SDNode <
  "ISD::STORE", SDTStore,
  [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
>;

// Any SIst_local for which isLocalStore() returns true.
def si_st_local : PatFrag <
  (ops node:$val, node:$ptr),
  (SIst_local node:$val, node:$ptr), [{
  return isLocalStore(cast<StoreSDNode>(N));
}]>;

// si_st_local restricted to unindexed, non-truncating stores.
def si_store_local : PatFrag <
  (ops node:$val, node:$ptr),
  (si_st_local node:$val, node:$ptr), [{
  const StoreSDNode *St = cast<StoreSDNode>(N);
  return St->getAddressingMode() == ISD::UNINDEXED &&
         !St->isTruncatingStore();
}]>;

// si_store_local wrapped in Aligned8Bytes (defined elsewhere; presumably an
// 8-byte alignment predicate -- confirm against its definition).
def si_store_local_align8 : Aligned8Bytes <
  (ops node:$val, node:$ptr),
  (si_store_local node:$val, node:$ptr)
>;

// si_st_local restricted to truncating stores.
def si_truncstore_local : PatFrag <
  (ops node:$val, node:$ptr),
  (si_st_local node:$val, node:$ptr), [{
  return cast<StoreSDNode>(N)->isTruncatingStore();
}]>;

// Truncating local store whose memory type is i8.
def si_truncstore_local_i8 : PatFrag <
  (ops node:$val, node:$ptr),
  (si_truncstore_local node:$val, node:$ptr), [{
  const StoreSDNode *St = cast<StoreSDNode>(N);
  return St->getMemoryVT() == MVT::i8;
}]>;

// Truncating local store whose memory type is i16.
def si_truncstore_local_i16 : PatFrag <
  (ops node:$val, node:$ptr),
  (si_truncstore_local node:$val, node:$ptr), [{
  const StoreSDNode *St = cast<StoreSDNode>(N);
  return St->getMemoryVT() == MVT::i16;
}]>;
|
||||
|
||||
// For a two-operand atomic named op_name, defines:
//   <NAME>_glue  - an "ISD::ATOMIC_<op_name>" node whose property list
//                  includes SDNPInGlue.
//   <NAME>_local - local_binary_atomic_op applied to that glued node.
multiclass SIAtomicM0Glue2 <string op_name> {

  def _glue : SDNode <
    "ISD::ATOMIC_"#op_name, SDTAtomic2,
    [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
  >;

  def _local : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
}

// One instantiation per binary atomic operation.
defm si_atomic_load_add  : SIAtomicM0Glue2 <"LOAD_ADD">;
defm si_atomic_load_and  : SIAtomicM0Glue2 <"LOAD_AND">;
defm si_atomic_load_min  : SIAtomicM0Glue2 <"LOAD_MIN">;
defm si_atomic_load_max  : SIAtomicM0Glue2 <"LOAD_MAX">;
defm si_atomic_load_or   : SIAtomicM0Glue2 <"LOAD_OR">;
defm si_atomic_load_sub  : SIAtomicM0Glue2 <"LOAD_SUB">;
defm si_atomic_load_xor  : SIAtomicM0Glue2 <"LOAD_XOR">;
defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm si_atomic_swap      : SIAtomicM0Glue2 <"SWAP">;

// ISD::ATOMIC_CMP_SWAP node (three-operand profile) whose property list
// includes SDNPInGlue.
def si_atomic_cmp_swap_glue : SDNode <
  "ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
  [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;

// Instantiates AtomicCmpSwapLocal on the glued compare-and-swap node.
defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>;
|
||||
|
||||
// Transformation function, extract the lower 32bit of a 64bit immediate
|
||||
def LO32 : SDNodeXForm<imm, [{
|
||||
return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N),
|
||||
@ -1726,7 +1827,7 @@ class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm
|
||||
|
||||
multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
|
||||
dag outs = (outs rc:$vdst),
|
||||
dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds, M0Reg:$m0),
|
||||
dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
|
||||
string asm = opName#" $vdst, $addr"#"$offset$gds"> {
|
||||
|
||||
def "" : DS_Pseudo <opName, outs, ins, []>;
|
||||
@ -1740,7 +1841,7 @@ multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
|
||||
multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
|
||||
dag outs = (outs rc:$vdst),
|
||||
dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
|
||||
gds01:$gds, M0Reg:$m0),
|
||||
gds01:$gds),
|
||||
string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> {
|
||||
|
||||
def "" : DS_Pseudo <opName, outs, ins, []>;
|
||||
@ -1753,8 +1854,7 @@ multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
|
||||
|
||||
multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
|
||||
dag outs = (outs),
|
||||
dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds,
|
||||
M0Reg:$m0),
|
||||
dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
|
||||
string asm = opName#" $addr, $data0"#"$offset$gds"> {
|
||||
|
||||
def "" : DS_Pseudo <opName, outs, ins, []>,
|
||||
@ -1769,7 +1869,7 @@ multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
|
||||
multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
|
||||
dag outs = (outs),
|
||||
dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
|
||||
ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds, M0Reg:$m0),
|
||||
ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds),
|
||||
string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> {
|
||||
|
||||
def "" : DS_Pseudo <opName, outs, ins, []>;
|
||||
@ -1783,8 +1883,7 @@ multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
|
||||
multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
|
||||
string noRetOp = "",
|
||||
dag outs = (outs rc:$vdst),
|
||||
dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds,
|
||||
M0Reg:$m0),
|
||||
dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
|
||||
string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
|
||||
|
||||
def "" : DS_Pseudo <opName, outs, ins, []>,
|
||||
@ -1812,14 +1911,14 @@ multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
|
||||
string noRetOp = "", RegisterClass src = rc> :
|
||||
DS_1A2D_RET_m <op, asm, rc, noRetOp,
|
||||
(ins VGPR_32:$addr, src:$data0, src:$data1,
|
||||
ds_offset:$offset, gds:$gds, M0Reg:$m0)
|
||||
ds_offset:$offset, gds:$gds)
|
||||
>;
|
||||
|
||||
multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
|
||||
string noRetOp = opName,
|
||||
dag outs = (outs),
|
||||
dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
|
||||
ds_offset:$offset, gds:$gds, M0Reg:$m0),
|
||||
ds_offset:$offset, gds:$gds),
|
||||
string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> {
|
||||
|
||||
def "" : DS_Pseudo <opName, outs, ins, []>,
|
||||
@ -1833,7 +1932,7 @@ multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
|
||||
|
||||
multiclass DS_0A_RET <bits<8> op, string opName,
|
||||
dag outs = (outs VGPR_32:$vdst),
|
||||
dag ins = (ins ds_offset:$offset, gds:$gds, M0Reg:$m0),
|
||||
dag ins = (ins ds_offset:$offset, gds:$gds),
|
||||
string asm = opName#" $vdst"#"$offset"#"$gds"> {
|
||||
|
||||
let mayLoad = 1, mayStore = 1 in {
|
||||
@ -1848,7 +1947,7 @@ multiclass DS_0A_RET <bits<8> op, string opName,
|
||||
|
||||
multiclass DS_1A_RET_GDS <bits<8> op, string opName,
|
||||
dag outs = (outs VGPR_32:$vdst),
|
||||
dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset, M0Reg:$m0),
|
||||
dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset),
|
||||
string asm = opName#" $vdst, $addr"#"$offset gds"> {
|
||||
|
||||
def "" : DS_Pseudo <opName, outs, ins, []>;
|
||||
@ -1861,7 +1960,7 @@ multiclass DS_1A_RET_GDS <bits<8> op, string opName,
|
||||
|
||||
multiclass DS_1A_GDS <bits<8> op, string opName,
|
||||
dag outs = (outs),
|
||||
dag ins = (ins VGPR_32:$addr, M0Reg:$m0),
|
||||
dag ins = (ins VGPR_32:$addr),
|
||||
string asm = opName#" $addr gds"> {
|
||||
|
||||
def "" : DS_Pseudo <opName, outs, ins, []>;
|
||||
@ -1874,7 +1973,7 @@ multiclass DS_1A_GDS <bits<8> op, string opName,
|
||||
|
||||
multiclass DS_1A <bits<8> op, string opName,
|
||||
dag outs = (outs),
|
||||
dag ins = (ins VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0, gds:$gds),
|
||||
dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
|
||||
string asm = opName#" $addr"#"$offset"#"$gds"> {
|
||||
|
||||
let mayLoad = 1, mayStore = 1 in {
|
||||
|
@ -2824,52 +2824,52 @@ def : ROTRPattern <V_ALIGNBIT_B32>;
|
||||
|
||||
class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
|
||||
(vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
|
||||
(inst $ptr, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
|
||||
(inst $ptr, (as_i16imm $offset), (i1 0))
|
||||
>;
|
||||
|
||||
def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>;
|
||||
def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>;
|
||||
def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
|
||||
def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
|
||||
def : DSReadPat <DS_READ_B32, i32, local_load>;
|
||||
def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>;
|
||||
def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>;
|
||||
def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
|
||||
def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>;
|
||||
def : DSReadPat <DS_READ_B32, i32, si_load_local>;
|
||||
|
||||
let AddedComplexity = 100 in {
|
||||
|
||||
def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>;
|
||||
def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>;
|
||||
|
||||
} // End AddedComplexity = 100
|
||||
|
||||
def : Pat <
|
||||
(v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
|
||||
(v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
|
||||
i8:$offset1))),
|
||||
(DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0), (S_MOV_B32 -1))
|
||||
(DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
|
||||
>;
|
||||
|
||||
class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
|
||||
(frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
|
||||
(inst $ptr, $value, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
|
||||
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
|
||||
>;
|
||||
|
||||
def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
|
||||
def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
|
||||
def : DSWritePat <DS_WRITE_B32, i32, local_store>;
|
||||
def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>;
|
||||
def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>;
|
||||
def : DSWritePat <DS_WRITE_B32, i32, si_store_local>;
|
||||
|
||||
let AddedComplexity = 100 in {
|
||||
|
||||
def : DSWritePat <DS_WRITE_B64, v2i32, local_store_aligned8bytes>;
|
||||
def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>;
|
||||
} // End AddedComplexity = 100
|
||||
|
||||
def : Pat <
|
||||
(local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
|
||||
i8:$offset1)),
|
||||
(si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
|
||||
i8:$offset1)),
|
||||
(DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0),
|
||||
(EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
|
||||
(i1 0), (S_MOV_B32 -1))
|
||||
(i1 0))
|
||||
>;
|
||||
|
||||
class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
|
||||
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
|
||||
(inst $ptr, $value, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
|
||||
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
|
||||
>;
|
||||
|
||||
// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
|
||||
@ -2885,53 +2885,53 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
|
||||
class DSAtomicIncRetPat<DS inst, ValueType vt,
|
||||
Instruction LoadImm, PatFrag frag> : Pat <
|
||||
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
|
||||
(inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
|
||||
(inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0))
|
||||
>;
|
||||
|
||||
|
||||
class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
|
||||
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
|
||||
(inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
|
||||
(inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
|
||||
>;
|
||||
|
||||
|
||||
// 32-bit atomics.
|
||||
def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
|
||||
S_MOV_B32, atomic_load_add_local>;
|
||||
S_MOV_B32, si_atomic_load_add_local>;
|
||||
def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
|
||||
S_MOV_B32, atomic_load_sub_local>;
|
||||
S_MOV_B32, si_atomic_load_sub_local>;
|
||||
|
||||
def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
|
||||
def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
|
||||
def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
|
||||
def : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
|
||||
def : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
|
||||
def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
|
||||
def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
|
||||
def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
|
||||
def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
|
||||
def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
|
||||
def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
|
||||
def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
|
||||
|
||||
def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
|
||||
def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
|
||||
|
||||
// 64-bit atomics.
|
||||
def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
|
||||
S_MOV_B64, atomic_load_add_local>;
|
||||
S_MOV_B64, si_atomic_load_add_local>;
|
||||
def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
|
||||
S_MOV_B64, atomic_load_sub_local>;
|
||||
S_MOV_B64, si_atomic_load_sub_local>;
|
||||
|
||||
def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
|
||||
def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
|
||||
def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
|
||||
def : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
|
||||
def : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
|
||||
def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
|
||||
def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
|
||||
def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
|
||||
def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
|
||||
def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
|
||||
def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
|
||||
def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>;
|
||||
def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>;
|
||||
def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>;
|
||||
|
||||
def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
|
||||
def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>;
|
||||
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -213,7 +213,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
|
||||
// Be careful, since the addresses could be subregisters themselves in weird
|
||||
// cases, like vectors of pointers.
|
||||
const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
|
||||
const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
|
||||
|
||||
unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
|
||||
unsigned DestReg1
|
||||
@ -254,37 +253,24 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
|
||||
.addImm(NewOffset0) // offset0
|
||||
.addImm(NewOffset1) // offset1
|
||||
.addImm(0) // gds
|
||||
.addOperand(*M0Reg) // M0
|
||||
.addMemOperand(*I->memoperands_begin())
|
||||
.addMemOperand(*Paired->memoperands_begin());
|
||||
|
||||
LIS->InsertMachineInstrInMaps(Read2);
|
||||
|
||||
unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
|
||||
unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
|
||||
updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
|
||||
updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);
|
||||
|
||||
LIS->RemoveMachineInstrFromMaps(I);
|
||||
LIS->RemoveMachineInstrFromMaps(Paired);
|
||||
// Replacing Paired in the maps with Read2 allows us to avoid updating the
|
||||
// live range for the m0 register.
|
||||
LIS->ReplaceMachineInstrInMaps(Paired, Read2);
|
||||
I->eraseFromParent();
|
||||
Paired->eraseFromParent();
|
||||
|
||||
LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
|
||||
LIS->shrinkToUses(&AddrRegLI);
|
||||
|
||||
LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg());
|
||||
LIS->shrinkToUses(&M0RegLI);
|
||||
|
||||
// Currently m0 is treated as a register class with one member instead of an
|
||||
// implicit physical register. We are using the virtual register for the first
|
||||
// one, but we still need to update the live range of the now unused second m0
|
||||
// virtual register to avoid verifier errors.
|
||||
const MachineOperand *PairedM0Reg
|
||||
= TII->getNamedOperand(*Paired, AMDGPU::OpName::m0);
|
||||
LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg());
|
||||
LIS->shrinkToUses(&PairedM0RegLI);
|
||||
|
||||
LIS->getInterval(DestReg); // Create new LI
|
||||
|
||||
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
|
||||
@ -300,7 +286,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
|
||||
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
|
||||
// sure we preserve the subregister index and any register flags set on them.
|
||||
const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
|
||||
const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
|
||||
const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
|
||||
const MachineOperand *Data1
|
||||
= TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
|
||||
@ -331,6 +316,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
|
||||
const MCInstrDesc &Write2Desc = TII->get(Opc);
|
||||
DebugLoc DL = I->getDebugLoc();
|
||||
|
||||
// repairIntervalsInRange() doesn't handle physical registers, so we have
|
||||
// to update the M0 range manually.
|
||||
SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
|
||||
LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
|
||||
LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
|
||||
bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
|
||||
|
||||
MachineInstrBuilder Write2
|
||||
= BuildMI(*MBB, I, DL, Write2Desc)
|
||||
.addOperand(*Addr) // addr
|
||||
@ -339,21 +331,25 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
|
||||
.addImm(NewOffset0) // offset0
|
||||
.addImm(NewOffset1) // offset1
|
||||
.addImm(0) // gds
|
||||
.addOperand(*M0Reg) // m0
|
||||
.addMemOperand(*I->memoperands_begin())
|
||||
.addMemOperand(*Paired->memoperands_begin());
|
||||
|
||||
// XXX - How do we express subregisters here?
|
||||
unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(),
|
||||
M0Reg->getReg()};
|
||||
unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
|
||||
|
||||
LIS->RemoveMachineInstrFromMaps(I);
|
||||
LIS->RemoveMachineInstrFromMaps(Paired);
|
||||
I->eraseFromParent();
|
||||
Paired->eraseFromParent();
|
||||
|
||||
// This doesn't handle physical registers like M0
|
||||
LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
|
||||
|
||||
if (UpdateM0Range) {
|
||||
SlotIndex Write2Index = LIS->getInstructionIndex(Write2);
|
||||
M0Segment->end = Write2Index.getRegSlot();
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
|
||||
return Write2.getInstr();
|
||||
}
|
||||
|
@ -65,8 +65,8 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
|
||||
|
||||
; SI-LABEL: @simple_read2st64_f32_over_max_offset
|
||||
; SI-NOT: ds_read2st64_b32
|
||||
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
|
||||
; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
|
||||
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
|
||||
; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
|
||||
; SI: s_endpgm
|
||||
define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
|
||||
|
@ -69,8 +69,8 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
|
||||
; pointer can be used with an offset into the second one.
|
||||
|
||||
; SI-LABEL: {{^}}load_shl_base_lds_2:
|
||||
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
|
||||
; SI: s_mov_b32 m0, -1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
|
||||
; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
|
||||
; SI: s_endpgm
|
||||
define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
|
||||
|
Loading…
x
Reference in New Issue
Block a user