[Scheduling] Implement a new way to cluster loads/stores

Before calling the target hook to determine whether two loads/stores can be
clustered, we put them into different groups to avoid fake clustering caused
by dependencies. Currently, loads/stores are placed in the same group if they
have the same predecessor, on the assumption that two loads/stores sharing a
predecessor are unlikely to depend on each other.

However, an SUnit might have several predecessors, and for now we just pick
the first predecessor with a non-data, non-artificial dependency, which is
too arbitrary and hard to fix properly.

This patch proposes a better implementation:
1. First collect all the loads/stores that have memory info, to reduce the
   complexity.
2. Sort these loads/stores so that the search can stop as early as possible.
3. For each load/store, walk forward in the sorted order to the first
   instruction it has no dependency on, and check whether the two can be
   clustered (see the sketch below).

Reviewed By: Jay Foad

Differential Revision: https://reviews.llvm.org/D85517
QingShan Zhang 2020-08-26 12:26:21 +00:00
parent 70f1e167ac
commit 48664955df
6 changed files with 91 additions and 80 deletions


@@ -268,6 +268,11 @@ namespace llvm {
return SU->SchedClass;
}
/// IsReachable - Checks if SU is reachable from TargetSU.
bool IsReachable(SUnit *SU, SUnit *TargetSU) {
return Topo.IsReachable(SU, TargetSU);
}
/// Returns an iterator to the top of the current scheduling region.
MachineBasicBlock::iterator begin() const { return RegionBegin; }


@@ -1530,7 +1530,10 @@ public:
void apply(ScheduleDAGInstrs *DAGInstrs) override;
protected:
void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG);
void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps,
ScheduleDAGInstrs *DAG);
void collectMemOpRecords(std::vector<SUnit> &SUnits,
SmallVectorImpl<MemOpInfo> &MemOpRecords);
};
class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1566,63 +1569,53 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII,
} // end namespace llvm
// Sort all the loads/stores first. Then, for each load/store, check the
// following loads/stores one by one until we reach the first non-dependent
// one, and call the target hook to see whether they can be clustered.
void BaseMemOpClusterMutation::clusterNeighboringMemOps(
ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
SmallVector<MemOpInfo, 32> MemOpRecords;
for (SUnit *SU : MemOps) {
const MachineInstr &MI = *SU->getInstr();
SmallVector<const MachineOperand *, 4> BaseOps;
int64_t Offset;
bool OffsetIsScalable;
unsigned Width;
if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
OffsetIsScalable, Width, TRI)) {
MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
<< Offset << ", OffsetIsScalable: " << OffsetIsScalable
<< ", Width: " << Width << "\n");
}
#ifndef NDEBUG
for (auto *Op : BaseOps)
assert(Op);
#endif
}
if (MemOpRecords.size() < 2)
return;
llvm::sort(MemOpRecords);
ArrayRef<MemOpInfo> MemOpRecords, ScheduleDAGInstrs *DAG) {
// Keep track of the current cluster length and bytes for each SUnit.
DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
// At this point, `MemOpRecords` array must hold at least two mem ops. Try to
// cluster the mem ops collected in the `MemOpRecords` array.
unsigned ClusterLength = 1;
unsigned CurrentClusterBytes = MemOpRecords[0].Width;
for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
// Decision to cluster mem ops is taken based on target dependent logic
auto MemOpa = MemOpRecords[Idx];
auto MemOpb = MemOpRecords[Idx + 1];
++ClusterLength;
CurrentClusterBytes += MemOpb.Width;
if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
CurrentClusterBytes)) {
// Current mem ops pair could not be clustered, reset cluster length, and
// go to next pair
ClusterLength = 1;
CurrentClusterBytes = MemOpb.Width;
// Seek the next load/store with which to form a cluster.
unsigned NextIdx = Idx + 1;
for (; NextIdx < End; ++NextIdx)
// Skip if MemOpb has already been clustered or has a dependency on
// MemOpa.
if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
!DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))
break;
if (NextIdx == End)
continue;
auto MemOpb = MemOpRecords[NextIdx];
unsigned ClusterLength = 2;
unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
CurrentClusterBytes =
SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
}
if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
CurrentClusterBytes))
continue;
SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;
if (SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);
// FIXME: Is this check really required?
if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
ClusterLength = 1;
CurrentClusterBytes = MemOpb.Width;
if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
continue;
}
LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
<< SUb->NodeNum << ")\n");
@@ -1656,42 +1649,57 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
}
}
SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
CurrentClusterBytes};
LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength
<< ", Curr cluster bytes: " << CurrentClusterBytes
<< "\n");
}
}
/// Callback from DAG postProcessing to create cluster edges for loads.
void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
// Map DAG NodeNum to a set of dependent MemOps in store chain.
DenseMap<unsigned, SmallVector<SUnit *, 4>> StoreChains;
for (SUnit &SU : DAG->SUnits) {
void BaseMemOpClusterMutation::collectMemOpRecords(
std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
for (auto &SU : SUnits) {
if ((IsLoad && !SU.getInstr()->mayLoad()) ||
(!IsLoad && !SU.getInstr()->mayStore()))
continue;
unsigned ChainPredID = DAG->SUnits.size();
for (const SDep &Pred : SU.Preds) {
// We only want to cluster the mem ops that have the same ctrl(non-data)
// pred so that they didn't have ctrl dependency for each other. But for
// store instrs, we can still cluster them if the pred is load instr.
if ((Pred.isCtrl() &&
(IsLoad ||
(Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
!Pred.isArtificial()) {
ChainPredID = Pred.getSUnit()->NodeNum;
break;
}
}
// Insert the SU to corresponding store chain.
auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second;
Chain.push_back(&SU);
}
const MachineInstr &MI = *SU.getInstr();
SmallVector<const MachineOperand *, 4> BaseOps;
int64_t Offset;
bool OffsetIsScalable;
unsigned Width;
if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
OffsetIsScalable, Width, TRI)) {
MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
// Iterate over the store chains.
for (auto &SCD : StoreChains)
clusterNeighboringMemOps(SCD.second, DAG);
LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
<< Offset << ", OffsetIsScalable: " << OffsetIsScalable
<< ", Width: " << Width << "\n");
}
#ifndef NDEBUG
for (auto *Op : BaseOps)
assert(Op);
#endif
}
}
/// Callback from DAG postProcessing to create cluster edges for loads/stores.
void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
// Collect all the clusterable loads/stores
SmallVector<MemOpInfo, 32> MemOpRecords;
collectMemOpRecords(DAG->SUnits, MemOpRecords);
if (MemOpRecords.size() < 2)
return;
// Sort the loads/stores so that we can stop clustering as early as
// possible.
llvm::sort(MemOpRecords);
// Try to cluster all the neighboring loads/stores.
clusterNeighboringMemOps(MemOpRecords, DAG);
}
//===----------------------------------------------------------------------===//


@@ -214,11 +214,11 @@ entry:
ret void
}
; FIXME - The SU(4) and SU(7) can be clustered even with
; Verify that the SU(4) and SU(7) can be clustered even with
; different preds
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: cluster_with_different_preds:%bb.0
; CHECK-NOT:Cluster ld/st SU(4) - SU(7)
; CHECK:Cluster ld/st SU(4) - SU(7)
; CHECK:SU(3): STRWui %2:gpr32, %0:gpr64common, 0 ::
; CHECK:SU(4): %3:gpr32 = LDRWui %1:gpr64common, 0 ::
; CHECK:Predecessors:


@@ -624,9 +624,9 @@ define void @too_many_args_use_workitem_id_x_byval(
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
@@ -669,8 +669,8 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
; FIXED-ABI-NOT: v31
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}


@@ -145,14 +145,14 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4
; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: global_load_dword v4, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: global_load_dword v3, v0, s[0:1]
; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4
; GFX9-NEXT: global_load_dword v4, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_pk_max_i16 v1, v2, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_i16 v3, v3, v4
; GFX9-NEXT: v_pk_max_i16 v3, v4, v3
; GFX9-NEXT: global_store_short v0, v1, s[4:5] offset:4
; GFX9-NEXT: global_store_dword v0, v3, s[4:5]
; GFX9-NEXT: s_endpgm


@@ -160,16 +160,14 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3
; GCN: s_mov_b32 s34, s32
; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
; GCN: s_mov_b32 s34, s32
; GCN: v_mov_b32_e32 v32, 0
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
; GCN-NEXT: s_add_u32 s32, s32, 0x30000
; GCN: v_mov_b32_e32 v33, 0
; GCN: buffer_store_dword v33, off, s[0:3], s33 offset:1024
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]