[Scheduling] Implement a new way to cluster loads/stores

author QingShan Zhang <qshanz@cn.ibm.com>

Wed, 26 Aug 2020 12:26:21 +0000 (12:26 +0000)

committer QingShan Zhang <qshanz@cn.ibm.com>

Wed, 26 Aug 2020 12:33:59 +0000 (12:33 +0000)
author QingShan Zhang <qshanz@cn.ibm.com>
Wed, 26 Aug 2020 12:26:21 +0000 (12:26 +0000)
committer QingShan Zhang <qshanz@cn.ibm.com>
Wed, 26 Aug 2020 12:33:59 +0000 (12:33 +0000)
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h

index 1eb9b9f..d2b9520 100644 (file)
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -268,6 +268,11 @@ namespace llvm {
        return SU->SchedClass;
      }
  
+    /// IsReachable - Checks if SU is reachable from TargetSU.
+    bool IsReachable(SUnit *SU, SUnit *TargetSU) {
+      return Topo.IsReachable(SU, TargetSU);
+    }
+
      /// Returns an iterator to the top of the current scheduling region.
      MachineBasicBlock::iterator begin() const { return RegionBegin; }
  
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp

index a8ccf26..b6d0d9a 100644 (file)
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1530,7 +1530,10 @@ public:
    void apply(ScheduleDAGInstrs *DAGInstrs) override;
  
  protected:
-  void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG);
+  void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps,
+                                ScheduleDAGInstrs *DAG);
+  void collectMemOpRecords(std::vector<SUnit> &SUnits,
+                           SmallVectorImpl<MemOpInfo> &MemOpRecords);
  };
  
  class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1566,63 +1569,53 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII,
  
  } // end namespace llvm
  
+// Sorting all the loads/stores first, then for each load/store, checking the
+// following load/store one by one, until reach the first non-dependent one and
+// call target hook to see if they can cluster.
  void BaseMemOpClusterMutation::clusterNeighboringMemOps(
-    ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
-  SmallVector<MemOpInfo, 32> MemOpRecords;
-  for (SUnit *SU : MemOps) {
-    const MachineInstr &MI = *SU->getInstr();
-    SmallVector<const MachineOperand *, 4> BaseOps;
-    int64_t Offset;
-    bool OffsetIsScalable;
-    unsigned Width;
-    if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
-                                           OffsetIsScalable, Width, TRI)) {
-      MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
-
-      LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
-                        << Offset << ", OffsetIsScalable: " << OffsetIsScalable
-                        << ", Width: " << Width << "\n");
-    }
-#ifndef NDEBUG
-    for (auto *Op : BaseOps)
-      assert(Op);
-#endif
-  }
-  if (MemOpRecords.size() < 2)
-    return;
-
-  llvm::sort(MemOpRecords);
+    ArrayRef<MemOpInfo> MemOpRecords, ScheduleDAGInstrs *DAG) {
+  // Keep track of the current cluster length and bytes for each SUnit.
+  DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
  
    // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to
    // cluster mem ops collected within `MemOpRecords` array.
-  unsigned ClusterLength = 1;
-  unsigned CurrentClusterBytes = MemOpRecords[0].Width;
    for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
      // Decision to cluster mem ops is taken based on target dependent logic
      auto MemOpa = MemOpRecords[Idx];
-    auto MemOpb = MemOpRecords[Idx + 1];
-    ++ClusterLength;
-    CurrentClusterBytes += MemOpb.Width;
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
-                                  CurrentClusterBytes)) {
-      // Current mem ops pair could not be clustered, reset cluster length, and
-      // go to next pair
-      ClusterLength = 1;
-      CurrentClusterBytes = MemOpb.Width;
+
+    // Seek for the next load/store to do the cluster.
+    unsigned NextIdx = Idx + 1;
+    for (; NextIdx < End; ++NextIdx)
+      // Skip if MemOpb has been clustered already or has dependency with
+      // MemOpa.
+      if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
+          !DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
+          !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))
+        break;
+    if (NextIdx == End)
        continue;
+
+    auto MemOpb = MemOpRecords[NextIdx];
+    unsigned ClusterLength = 2;
+    unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
+    if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
+      ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
+      CurrentClusterBytes =
+          SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
      }
  
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+                                  CurrentClusterBytes))
+      continue;
+
      SUnit *SUa = MemOpa.SU;
      SUnit *SUb = MemOpb.SU;
      if (SUa->NodeNum > SUb->NodeNum)
        std::swap(SUa, SUb);
  
      // FIXME: Is this check really required?
-    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
-      ClusterLength = 1;
-      CurrentClusterBytes = MemOpb.Width;
+    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
        continue;
-    }
  
      LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
                        << SUb->NodeNum << ")\n");
@@ -1656,42 +1649,57 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
        }
      }
  
+    SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
+                                             CurrentClusterBytes};
+
      LLVM_DEBUG(dbgs() << "  Curr cluster length: " << ClusterLength
                        << ", Curr cluster bytes: " << CurrentClusterBytes
                        << "\n");
    }
  }
  
-/// Callback from DAG postProcessing to create cluster edges for loads.
-void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
-  // Map DAG NodeNum to a set of dependent MemOps in store chain.
-  DenseMap<unsigned, SmallVector<SUnit *, 4>> StoreChains;
-  for (SUnit &SU : DAG->SUnits) {
+void BaseMemOpClusterMutation::collectMemOpRecords(
+    std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
+  for (auto &SU : SUnits) {
      if ((IsLoad && !SU.getInstr()->mayLoad()) ||
          (!IsLoad && !SU.getInstr()->mayStore()))
        continue;
  
-    unsigned ChainPredID = DAG->SUnits.size();
-    for (const SDep &Pred : SU.Preds) {
-      // We only want to cluster the mem ops that have the same ctrl(non-data)
-      // pred so that they didn't have ctrl dependency for each other. But for
-      // store instrs, we can still cluster them if the pred is load instr.
-      if ((Pred.isCtrl() &&
-           (IsLoad ||
-            (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
-          !Pred.isArtificial()) {
-        ChainPredID = Pred.getSUnit()->NodeNum;
-        break;
-      }
+    const MachineInstr &MI = *SU.getInstr();
+    SmallVector<const MachineOperand *, 4> BaseOps;
+    int64_t Offset;
+    bool OffsetIsScalable;
+    unsigned Width;
+    if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+                                           OffsetIsScalable, Width, TRI)) {
+      MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+
+      LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
+                        << Offset << ", OffsetIsScalable: " << OffsetIsScalable
+                        << ", Width: " << Width << "\n");
      }
-    // Insert the SU to corresponding store chain.
-    auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second;
-    Chain.push_back(&SU);
+#ifndef NDEBUG
+    for (auto *Op : BaseOps)
+      assert(Op);
+#endif
    }
+}
+
+/// Callback from DAG postProcessing to create cluster edges for loads/stores.
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
+  // Collect all the clusterable loads/stores
+  SmallVector<MemOpInfo, 32> MemOpRecords;
+  collectMemOpRecords(DAG->SUnits, MemOpRecords);
+
+  if (MemOpRecords.size() < 2)
+    return;
+
+  // Sorting the loads/stores, so that, we can stop the cluster as early as
+  // possible.
+  llvm::sort(MemOpRecords);
  
-  // Iterate over the store chains.
-  for (auto &SCD : StoreChains)
-    clusterNeighboringMemOps(SCD.second, DAG);
+  // Trying to cluster all the neighboring loads/stores.
+  clusterNeighboringMemOps(MemOpRecords, DAG);
  }
  
  //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll

index b0ed3d0..e953215 100644 (file)
--- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -214,11 +214,11 @@ entry:
    ret void
  }
  
-; FIXME - The SU(4) and SU(7) can be clustered even with
+; Verify that the SU(4) and SU(7) can be clustered even with
  ; different preds
  ; CHECK: ********** MI Scheduling **********
  ; CHECK-LABEL: cluster_with_different_preds:%bb.0
-; CHECK-NOT:Cluster ld/st SU(4) - SU(7)
+; CHECK:Cluster ld/st SU(4) - SU(7)
  ; CHECK:SU(3):   STRWui %2:gpr32, %0:gpr64common, 0 ::
  ; CHECK:SU(4):   %3:gpr32 = LDRWui %1:gpr64common, 0 ::
  ; CHECK:Predecessors:
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll

index 2dc47ca..d23538c 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -624,9 +624,9 @@ define void @too_many_args_use_workitem_id_x_byval(
  
  
  ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
  ; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
  ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
  
  ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
  
@@ -669,8 +669,8 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
  
  ; FIXED-ABI-NOT: v31
  ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
-; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
  ; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
+; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
  ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
  ; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
  
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll

index c64e400..7c4ce5d 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -145,14 +145,14 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
  ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
  ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
  ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_short_d16 v2, v0, s[6:7] offset:4
  ; GFX9-NEXT:    global_load_short_d16 v1, v0, s[0:1] offset:4
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX9-NEXT:    global_load_dword v4, v0, s[0:1]
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-NEXT:    global_load_short_d16 v2, v0, s[6:7] offset:4
+; GFX9-NEXT:    global_load_dword v4, v0, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
  ; GFX9-NEXT:    v_pk_max_i16 v1, v2, v1
  ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_max_i16 v3, v3, v4
+; GFX9-NEXT:    v_pk_max_i16 v3, v4, v3
  ; GFX9-NEXT:    global_store_short v0, v1, s[4:5] offset:4
  ; GFX9-NEXT:    global_store_dword v0, v3, s[4:5]
  ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll

index 74b5380..e8e3518 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -160,16 +160,14 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
  ; GCN-NEXT: s_mov_b64 exec, s[4:5]
  ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
  ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3
-; GCN: s_mov_b32 s34, s32
  ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
  ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
+; GCN: s_mov_b32 s34, s32
+; GCN: v_mov_b32_e32 v32, 0
+; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
  ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
  ; GCN-NEXT: s_add_u32 s32, s32, 0x30000
  
-; GCN: v_mov_b32_e32 v33, 0
-
-; GCN: buffer_store_dword v33, off, s[0:3], s33 offset:1024
-
  ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
  ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
author	QingShan Zhang <qshanz@cn.ibm.com>
	Wed, 26 Aug 2020 12:26:21 +0000 (12:26 +0000)
committer	QingShan Zhang <qshanz@cn.ibm.com>
	Wed, 26 Aug 2020 12:33:59 +0000 (12:33 +0000)
llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h		patch \| blob \| history
llvm/lib/CodeGen/MachineScheduler.cpp		patch \| blob \| history
llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/max.i16.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/stack-realign.ll		patch \| blob \| history