[Scheduling] Implement a new way to cluster loads/stores

Before calling the target hook to determine whether two loads/stores can be
clustered, we put them into different groups to avoid fake clustering caused
by dependencies. Currently, loads/stores are placed in the same group if they
have the same predecessor, on the assumption that two loads/stores sharing a
predecessor are unlikely to depend on each other.

However, an SUnit might have several predecessors, and for now we just pick
the first predecessor with a non-data, non-artificial dependency, which is
too arbitrary and hard to fix properly.

This patch proposes a better implementation:
1. First collect all the loads/stores that have memory info, to reduce the
   complexity.
2. Sort these loads/stores so that the search can stop as early as possible.
3. For each load/store, walk forward in the sorted order to the first
   instruction it has no dependency on, and check whether the two can be
   clustered (see the sketch below).

Reviewed By: Jay Foad

Differential Revision: https://reviews.llvm.org/D85517
QingShan Zhang 2020-08-26 12:26:21 +00:00
parent 70f1e167ac
commit 48664955df
6 changed files with 91 additions and 80 deletions


@@ -268,6 +268,11 @@ namespace llvm {
return SU->SchedClass;
}
/// IsReachable - Checks if SU is reachable from TargetSU.
bool IsReachable(SUnit *SU, SUnit *TargetSU) {
return Topo.IsReachable(SU, TargetSU);
}
/// Returns an iterator to the top of the current scheduling region.
MachineBasicBlock::iterator begin() const { return RegionBegin; }


@@ -1530,7 +1530,10 @@ public:
void apply(ScheduleDAGInstrs *DAGInstrs) override;
protected:
void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG);
void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps,
ScheduleDAGInstrs *DAG);
void collectMemOpRecords(std::vector<SUnit> &SUnits,
SmallVectorImpl<MemOpInfo> &MemOpRecords);
};
class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1566,63 +1569,53 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII,
} // end namespace llvm
// Sort all the loads/stores first. Then, for each load/store, check the
// following loads/stores one by one until we reach the first non-dependent
// one, and call the target hook to see whether they can be clustered.
void BaseMemOpClusterMutation::clusterNeighboringMemOps(
ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
SmallVector<MemOpInfo, 32> MemOpRecords;
for (SUnit *SU : MemOps) {
const MachineInstr &MI = *SU->getInstr();
SmallVector<const MachineOperand *, 4> BaseOps;
int64_t Offset;
bool OffsetIsScalable;
unsigned Width;
if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
OffsetIsScalable, Width, TRI)) {
MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
<< Offset << ", OffsetIsScalable: " << OffsetIsScalable
<< ", Width: " << Width << "\n");
}
#ifndef NDEBUG
for (auto *Op : BaseOps)
assert(Op);
#endif
}
if (MemOpRecords.size() < 2)
return;
llvm::sort(MemOpRecords);
ArrayRef<MemOpInfo> MemOpRecords, ScheduleDAGInstrs *DAG) {
// Keep track of the current cluster length and bytes for each SUnit.
DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
// At this point, `MemOpRecords` array must hold at least two mem ops. Try to
// cluster the mem ops collected in the `MemOpRecords` array.
unsigned ClusterLength = 1;
unsigned CurrentClusterBytes = MemOpRecords[0].Width;
for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
// Decision to cluster mem ops is taken based on target dependent logic
auto MemOpa = MemOpRecords[Idx];
auto MemOpb = MemOpRecords[Idx + 1];
++ClusterLength;
CurrentClusterBytes += MemOpb.Width;
if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
CurrentClusterBytes)) {
// Current mem ops pair could not be clustered, reset cluster length, and
// go to next pair
ClusterLength = 1;
CurrentClusterBytes = MemOpb.Width;
// Seek the next load/store with which to form a cluster.
unsigned NextIdx = Idx + 1;
for (; NextIdx < End; ++NextIdx)
// Skip if MemOpb has already been clustered or has a dependency on
// MemOpa.
if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
!DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))
break;
if (NextIdx == End)
continue;
auto MemOpb = MemOpRecords[NextIdx];
unsigned ClusterLength = 2;
unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
CurrentClusterBytes =
SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
}
if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
CurrentClusterBytes))
continue;
SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;
if (SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);
// FIXME: Is this check really required?
if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
ClusterLength = 1;
CurrentClusterBytes = MemOpb.Width;
if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
continue;
}
LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
<< SUb->NodeNum << ")\n");
@@ -1656,42 +1649,57 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
}
}
SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
CurrentClusterBytes};
LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength
<< ", Curr cluster bytes: " << CurrentClusterBytes
<< "\n");
}
}
/// Callback from DAG postProcessing to create cluster edges for loads.
void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
// Map DAG NodeNum to a set of dependent MemOps in store chain.
DenseMap<unsigned, SmallVector<SUnit *, 4>> StoreChains;
for (SUnit &SU : DAG->SUnits) {
void BaseMemOpClusterMutation::collectMemOpRecords(
std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
for (auto &SU : SUnits) {
if ((IsLoad && !SU.getInstr()->mayLoad()) ||
(!IsLoad && !SU.getInstr()->mayStore()))
continue;
unsigned ChainPredID = DAG->SUnits.size();
for (const SDep &Pred : SU.Preds) {
// We only want to cluster the mem ops that have the same ctrl(non-data)
// pred so that they didn't have ctrl dependency for each other. But for
// store instrs, we can still cluster them if the pred is load instr.
if ((Pred.isCtrl() &&
(IsLoad ||
(Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
!Pred.isArtificial()) {
ChainPredID = Pred.getSUnit()->NodeNum;
break;
}
}
// Insert the SU to corresponding store chain.
auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second;
Chain.push_back(&SU);
}
const MachineInstr &MI = *SU.getInstr();
SmallVector<const MachineOperand *, 4> BaseOps;
int64_t Offset;
bool OffsetIsScalable;
unsigned Width;
if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
OffsetIsScalable, Width, TRI)) {
MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
// Iterate over the store chains.
for (auto &SCD : StoreChains)
clusterNeighboringMemOps(SCD.second, DAG);
LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
<< Offset << ", OffsetIsScalable: " << OffsetIsScalable
<< ", Width: " << Width << "\n");
}
#ifndef NDEBUG
for (auto *Op : BaseOps)
assert(Op);
#endif
}
}
/// Callback from DAG postProcessing to create cluster edges for loads/stores.
void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
// Collect all the clusterable loads/stores
SmallVector<MemOpInfo, 32> MemOpRecords;
collectMemOpRecords(DAG->SUnits, MemOpRecords);
if (MemOpRecords.size() < 2)
return;
// Sort the loads/stores so that we can stop clustering as early as
// possible.
llvm::sort(MemOpRecords);
// Try to cluster all the neighboring loads/stores.
clusterNeighboringMemOps(MemOpRecords, DAG);
}
//===----------------------------------------------------------------------===//


@@ -214,11 +214,11 @@ entry:
ret void
}
; FIXME - The SU(4) and SU(7) can be clustered even with
; Verify that the SU(4) and SU(7) can be clustered even with
; different preds
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: cluster_with_different_preds:%bb.0
; CHECK-NOT:Cluster ld/st SU(4) - SU(7)
; CHECK:Cluster ld/st SU(4) - SU(7)
; CHECK:SU(3): STRWui %2:gpr32, %0:gpr64common, 0 ::
; CHECK:SU(4): %3:gpr32 = LDRWui %1:gpr64common, 0 ::
; CHECK:Predecessors:


@@ -624,9 +624,9 @@ define void @too_many_args_use_workitem_id_x_byval(
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
@@ -669,8 +669,8 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
; FIXED-ABI-NOT: v31
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}


@@ -145,14 +145,14 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4
; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: global_load_dword v4, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: global_load_dword v3, v0, s[0:1]
; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4
; GFX9-NEXT: global_load_dword v4, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_pk_max_i16 v1, v2, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_i16 v3, v3, v4
; GFX9-NEXT: v_pk_max_i16 v3, v4, v3
; GFX9-NEXT: global_store_short v0, v1, s[4:5] offset:4
; GFX9-NEXT: global_store_dword v0, v3, s[4:5]
; GFX9-NEXT: s_endpgm


@@ -160,16 +160,14 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3
; GCN: s_mov_b32 s34, s32
; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
; GCN: s_mov_b32 s34, s32
; GCN: v_mov_b32_e32 v32, 0
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
; GCN-NEXT: s_add_u32 s32, s32, 0x30000
; GCN: v_mov_b32_e32 v33, 0
; GCN: buffer_store_dword v33, off, s[0:3], s33 offset:1024
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]