[AMDGPU] Add GCNMaxILPSchedStrategy

author Austin Kerbow <Austin.Kerbow@amd.com>

Sat, 30 Jul 2022 14:40:11 +0000 (07:40 -0700)

committer Austin Kerbow <Austin.Kerbow@amd.com>

Tue, 2 Aug 2022 20:21:24 +0000 (13:21 -0700)
author Austin Kerbow <Austin.Kerbow@amd.com>
Sat, 30 Jul 2022 14:40:11 +0000 (07:40 -0700)
committer Austin Kerbow <Austin.Kerbow@amd.com>
Tue, 2 Aug 2022 20:21:24 +0000 (13:21 -0700)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

index 1faf910..643eb0b 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -427,6 +427,15 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  }
  
  static ScheduleDAGInstrs *
+createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
+  ScheduleDAGMILive *DAG =
+      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
+  DAG->addMutation(createIGroupLPDAGMutation());
+  DAG->addMutation(createSchedBarrierDAGMutation());
+  return DAG;
+}
+
+static ScheduleDAGInstrs *
  createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    auto DAG = new GCNIterativeScheduler(C,
@@ -464,19 +473,23 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                               createGCNMaxOccupancyMachineScheduler);
  
  static MachineSchedRegistry
-IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
-  "Run GCN scheduler to maximize occupancy (experimental)",
-  createIterativeGCNMaxOccupancyMachineScheduler);
-
-static MachineSchedRegistry
-GCNMinRegSchedRegistry("gcn-minreg",
-  "Run GCN iterative scheduler for minimal register usage (experimental)",
-  createMinRegScheduler);
-
-static MachineSchedRegistry
-GCNILPSchedRegistry("gcn-ilp",
-  "Run GCN iterative scheduler for ILP scheduling (experimental)",
-  createIterativeILPMachineScheduler);
+    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
+                           createGCNMaxILPMachineScheduler);
+
+static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
+    "gcn-iterative-max-occupancy-experimental",
+    "Run GCN scheduler to maximize occupancy (experimental)",
+    createIterativeGCNMaxOccupancyMachineScheduler);
+
+static MachineSchedRegistry GCNMinRegSchedRegistry(
+    "gcn-iterative-minreg",
+    "Run GCN iterative scheduler for minimal register usage (experimental)",
+    createMinRegScheduler);
+
+static MachineSchedRegistry GCNILPSchedRegistry(
+    "gcn-iterative-ilp",
+    "Run GCN iterative scheduler for ILP scheduling (experimental)",
+    createIterativeILPMachineScheduler);
  
  static StringRef computeDataLayout(const Triple &TT) {
    if (TT.getArch() == Triple::r600) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

index 4b90513..1d83f2e 100644 (file)
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -38,12 +38,11 @@ cl::opt<bool>
                                      "reduction scheduling stage."),
                             cl::init(false));
  
-GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
-    const MachineSchedContext *C)
+GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
      : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
        HasHighPressure(false) {}
  
-void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
+void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
    GenericScheduler::initialize(DAG);
  
    MF = &DAG->MF;
@@ -74,8 +73,9 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
    VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
  }
  
-void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
-                                     bool AtTop, const RegPressureTracker &RPTracker,
+void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
+                                     bool AtTop,
+                                     const RegPressureTracker &RPTracker,
                                       const SIRegisterInfo *SRI,
                                       unsigned SGPRPressure,
                                       unsigned VGPRPressure) {
@@ -161,7 +161,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
  
  // This function is mostly cut and pasted from
  // GenericScheduler::pickNodeFromQueue()
-void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
+void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                           const CandPolicy &ZonePolicy,
                                           const RegPressureTracker &RPTracker,
                                           SchedCandidate &Cand) {
@@ -181,7 +181,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                    SGPRPressure, VGPRPressure);
      // Pass SchedBoundary only when comparing nodes from the same boundary.
      SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
-    GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
+    tryCandidate(Cand, TryCand, ZoneArg);
      if (TryCand.Reason != NoCand) {
        // Initialize resource delta if needed in case future heuristics query it.
        if (TryCand.ResDelta == SchedResourceDelta())
@@ -194,7 +194,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
  
  // This function is mostly cut and pasted from
  // GenericScheduler::pickNodeBidirectional()
-SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
    // Schedule as far as possible in the direction of no choice. This is most
    // efficient, but also provides the best heuristics for CriticalPSets.
    if (SUnit *SU = Bot.pickOnlyChoice()) {
@@ -259,7 +259,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
               dbgs() << "Bot Cand: "; traceCandidate(BotCand););
    SchedCandidate Cand = BotCand;
    TopCand.Reason = NoCand;
-  GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+  tryCandidate(Cand, TopCand, nullptr);
    if (TopCand.Reason != NoCand) {
      Cand.setBest(TopCand);
    }
@@ -271,7 +271,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  
  // This function is mostly cut and pasted from
  // GenericScheduler::pickNode()
-SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
    if (DAG->top() == DAG->bottom()) {
      assert(Top.Available.empty() && Top.Pending.empty() &&
             Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
@@ -314,6 +314,129 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
    return SU;
  }
  
+GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
+  assert(CurrentStage && CurrentStage != SchedStages.end());
+  return *CurrentStage;
+}
+
+bool GCNSchedStrategy::advanceStage() {
+  assert(CurrentStage != SchedStages.end());
+  // A null iterator means no stage has been entered yet, so start at the
+  // first one; otherwise step to the following entry (which may be end()).
+  CurrentStage = CurrentStage ? std::next(CurrentStage) : SchedStages.begin();
+
+  // False once the strategy's list of stages is exhausted.
+  return CurrentStage != SchedStages.end();
+}
+
+bool GCNSchedStrategy::hasNextStage() const {
+  assert(CurrentStage);
+  return std::next(CurrentStage) != SchedStages.end();
+}
+
+GCNSchedStageID GCNSchedStrategy::getNextStage() const {
+  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
+  return *std::next(CurrentStage);
+}
+
+GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
+    const MachineSchedContext *C)
+    : GCNSchedStrategy(C) {
+  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
+  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
+  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+}
+
+GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
+    : GCNSchedStrategy(C) {
+  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
+}
+
+bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
+                                          SchedCandidate &TryCand,
+                                          SchedBoundary *Zone) const {
+  // No valid incumbent yet: accept TryCand outright, tagged as NodeOrder.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  // Avoid spilling: compare the excess-pressure deltas first (RegExcess).
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+                  RegExcess, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  // Bias physreg defs toward their uses, and copies toward their defs.
+  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+    return TryCand.Reason != NoCand;
+
+  bool SameBoundary = Zone != nullptr; // Null when Cand/TryCand zones differ.
+  if (SameBoundary) {
+    // Prioritize instructions that read unbuffered resources by stall cycles.
+    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+      return TryCand.Reason != NoCand;
+
+    // Avoid critical resource consumption and balance the schedule.
+    TryCand.initResourceDelta(DAG, SchedModel);
+    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+                TryCand, Cand, ResourceReduce))
+      return TryCand.Reason != NoCand;
+    if (tryGreater(TryCand.ResDelta.DemandedResources,
+                   Cand.ResDelta.DemandedResources, TryCand, Cand,
+                   ResourceDemand))
+      return TryCand.Reason != NoCand;
+
+    // Unconditionally try to reduce latency.
+    if (tryLatency(TryCand, Cand, *Zone))
+      return TryCand.Reason != NoCand;
+
+    // Weak edges are for clustering and other constraints.
+    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+      return TryCand.Reason != NoCand;
+  }
+
+  // Keep clustered nodes together to encourage downstream peephole
+  // optimizations which may reduce resource requirements.
+  //
+  // This is a best effort to set things up for a post-RA pass. Optimizations
+  // like generating loads of multiple registers should ideally be done within
+  // the scheduler pass by combining the loads during DAG postprocessing.
+  const SUnit *CandNextClusterSU =
+      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  const SUnit *TryCandNextClusterSU =
+      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
+    return TryCand.Reason != NoCand;
+
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+                  TryCand, Cand, RegCritical, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  // Avoid increasing the max pressure of the entire region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+                  Cand, RegMax, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  if (SameBoundary) {
+    // Last resort: original order (low NodeNum at top, high at bottom).
+    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+      TryCand.Reason = NodeOrder;
+      return true;
+    }
+  }
+  return false;
+}
+
  GCNScheduleDAGMILive::GCNScheduleDAGMILive(
      MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
      : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
@@ -323,6 +446,28 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(
    LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
  }
  
+/// Create the stage object that implements \p SchedStageID for this DAG.
+///
+/// Every GCNSchedStageID enumerator has a case below, so the trailing
+/// llvm_unreachable is only hit if an out-of-range value is passed in.
+std::unique_ptr<GCNSchedStage>
+GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
+  switch (SchedStageID) {
+  case GCNSchedStageID::OccInitialSchedule:
+    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::UnclusteredHighRPReschedule:
+    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
+  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
+    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
+  case GCNSchedStageID::PreRARematerialize:
+    return std::make_unique<PreRARematStage>(SchedStageID, *this);
+  case GCNSchedStageID::ILPInitialSchedule:
+    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
+  }
+
+  llvm_unreachable("Unknown SchedStageID.");
+}
+
  void GCNScheduleDAGMILive::schedule() {
    // Collect all scheduling regions. The actual scheduling is performed in
    // GCNScheduleDAGMILive::finalizeSchedule.
@@ -439,18 +578,13 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
  
  void GCNScheduleDAGMILive::runSchedStages() {
    LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
-  InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this);
-  UnclusteredHighRPStage S1(GCNSchedStageID::UnclusteredHighRPReschedule,
-                            *this);
-  ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule,
-                          *this);
-  PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this);
-  GCNSchedStage *SchedStages[] = {&S0, &S1, &S2, &S3};
  
    if (!Regions.empty())
      BBLiveInMap = getBBLiveInMap();
  
-  for (auto *Stage : SchedStages) {
+  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
+  while (S.advanceStage()) {
+    auto Stage = createSchedStage(S.getCurrentStage());
      if (!Stage->initGCNSchedStage())
        continue;
  
@@ -475,8 +609,8 @@ void GCNScheduleDAGMILive::runSchedStages() {
  #ifndef NDEBUG
  raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
    switch (StageID) {
-  case GCNSchedStageID::InitialSchedule:
-    OS << "Initial Schedule";
+  case GCNSchedStageID::OccInitialSchedule:
+    OS << "Max Occupancy Initial Schedule";
      break;
    case GCNSchedStageID::UnclusteredHighRPReschedule:
      OS << "Unclustered High Register Pressure Reschedule";
@@ -487,14 +621,18 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
    case GCNSchedStageID::PreRARematerialize:
      OS << "Pre-RA Rematerialize";
      break;
+  case GCNSchedStageID::ILPInitialSchedule:
+    OS << "Max ILP Initial Schedule";
+    break;
    }
+
    return OS;
  }
  #endif
  
  GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
-    : DAG(DAG), S(static_cast<GCNMaxOccupancySchedStrategy &>(*DAG.SchedImpl)),
-      MF(DAG.MF), MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
+    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
+      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
  
  bool GCNSchedStage::initGCNSchedStage() {
    if (!DAG.LIS)
@@ -564,6 +702,7 @@ bool PreRARematStage::initGCNSchedStage() {
    // inbetween the defs and region we sinked the def to. Cached pressure
    // for regions where a def is sinked from will also be invalidated. Will
    // need to be fixed if there is another pass after this pass.
+  assert(!S.hasNextStage());
  
    collectRematerializableInstructions();
    if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
@@ -674,7 +813,7 @@ void GCNSchedStage::setupNewBlock() {
    DAG.startBlock(CurrentMBB);
    // Get real RP for the region if it hasn't be calculated before. After the
    // initial schedule stage real RP will be collected after scheduling.
-  if (StageID == GCNSchedStageID::InitialSchedule)
+  if (StageID == GCNSchedStageID::OccInitialSchedule)
      DAG.computeBlockPressure(RegionIdx, CurrentMBB);
  }
  
@@ -767,7 +906,7 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
    return false;
  }
  
-bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
    if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
      return true;
  
@@ -810,6 +949,13 @@ bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
    return false;
  }
  
+bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+  // The ILP initial stage reverts a region only when mayCauseSpilling()
+  // flags it. GCNSchedStage::shouldRevertScheduling() is not consulted
+  // here, unlike in OccInitialScheduleStage.
+  return mayCauseSpilling(WavesAfter);
+}
+
  bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
    if (WavesAfter <= MFI.getMinWavesPerEU() &&
        !PressureAfter.less(ST, PressureBefore) &&
@@ -826,7 +972,8 @@ void GCNSchedStage::revertScheduling() {
        PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
    LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
    DAG.RescheduleRegions[RegionIdx] =
-      (nextStage(StageID)) != GCNSchedStageID::UnclusteredHighRPReschedule;
+      S.hasNextStage() &&
+      S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
    DAG.RegionEnd = DAG.RegionBegin;
    int SkippedDebugInstr = 0;
    for (MachineInstr *MI : Unsched) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

index ffa68ba..94d1431 100644 (file)
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -22,12 +22,25 @@ namespace llvm {
  class SIMachineFunctionInfo;
  class SIRegisterInfo;
  class GCNSubtarget;
+class GCNSchedStage;
+
+enum class GCNSchedStageID : unsigned {
+  OccInitialSchedule = 0,              // max-occupancy strategy, stage 1
+  UnclusteredHighRPReschedule = 1,     // max-occupancy strategy, stage 2
+  ClusteredLowOccupancyReschedule = 2, // max-occupancy strategy, stage 3
+  PreRARematerialize = 3,              // max-occupancy strategy, stage 4
+  ILPInitialSchedule = 4               // max-ILP strategy, its only stage
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
+#endif
  
  /// This is a minimal scheduler strategy.  The main difference between this
  /// and the GenericScheduler is that GCNSchedStrategy uses different
-/// heuristics to determine excess/critical pressure sets.  Its goal is to
-/// maximize kernel occupancy (i.e. maximum number of waves per simd).
-class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
+/// heuristics to determine excess/critical pressure sets.
+class GCNSchedStrategy : public GenericScheduler {
+protected:
    SUnit *pickNodeBidirectional(bool &IsTopNode);
  
    void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
@@ -51,6 +64,12 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
  
    MachineFunction *MF;
  
+  // Scheduling stages for this strategy.
+  SmallVector<GCNSchedStageID, 4> SchedStages;
+
+  // Pointer to the current SchedStageID.
+  SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
+
  public:
    // schedule() have seen register pressure over the critical limits and had to
    // track register pressure for actual scheduling heuristics.
@@ -69,7 +88,7 @@ public:
  
    unsigned VGPRCriticalLimit;
  
-  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+  GCNSchedStrategy(const MachineSchedContext *C);
  
    SUnit *pickNode(bool &IsTopNode) override;
  
@@ -78,40 +97,42 @@ public:
    unsigned getTargetOccupancy() { return TargetOccupancy; }
  
    void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
-};
  
-enum class GCNSchedStageID : unsigned {
-  InitialSchedule = 0,
-  UnclusteredHighRPReschedule = 1,
-  ClusteredLowOccupancyReschedule = 2,
-  PreRARematerialize = 3,
-  LastStage = PreRARematerialize
-};
+  GCNSchedStageID getCurrentStage();
  
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
-#endif
+  // Advances stage. Returns true if there are remaining stages.
+  bool advanceStage();
  
-inline GCNSchedStageID &operator++(GCNSchedStageID &Stage, int) {
-  assert(Stage != GCNSchedStageID::PreRARematerialize);
-  Stage = static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
-  return Stage;
-}
+  bool hasNextStage() const;
+
+  GCNSchedStageID getNextStage() const;
+};
  
-inline GCNSchedStageID nextStage(const GCNSchedStageID Stage) {
-  return static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
-}
+/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
+/// maximum number of waves per simd).
+class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
+public:
+  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+};
  
-inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) {
-  return static_cast<unsigned>(LHS) > static_cast<unsigned>(RHS);
-}
+/// The goal of this scheduling strategy is to maximize ILP for a single wave
+/// (i.e. latency hiding).
+class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
+protected:
+  bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                    SchedBoundary *Zone) const override;
+
+public:
+  GCNMaxILPSchedStrategy(const MachineSchedContext *C);
+};
  
  class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
    friend class GCNSchedStage;
-  friend class InitialScheduleStage;
+  friend class OccInitialScheduleStage;
    friend class UnclusteredHighRPStage;
    friend class ClusteredLowOccStage;
    friend class PreRARematStage;
+  friend class ILPInitialScheduleStage;
  
    const GCNSubtarget &ST;
  
@@ -169,6 +190,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
  
    void runSchedStages();
  
+  std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
+
  public:
    GCNScheduleDAGMILive(MachineSchedContext *C,
                         std::unique_ptr<MachineSchedStrategy> S);
@@ -183,7 +206,7 @@ class GCNSchedStage {
  protected:
    GCNScheduleDAGMILive &DAG;
  
-  GCNMaxOccupancySchedStrategy &S;
+  GCNSchedStrategy &S;
  
    MachineFunction &MF;
  
@@ -245,11 +268,11 @@ public:
    virtual ~GCNSchedStage() = default;
  };
  
-class InitialScheduleStage : public GCNSchedStage {
+class OccInitialScheduleStage : public GCNSchedStage {
  public:
    bool shouldRevertScheduling(unsigned WavesAfter) override;
  
-  InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+  OccInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
        : GCNSchedStage(StageID, DAG) {}
  };
  
@@ -324,6 +347,14 @@ public:
        : GCNSchedStage(StageID, DAG) {}
  };
  
+class ILPInitialScheduleStage : public GCNSchedStage {
+public:
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
  } // End namespace llvm
  
  #endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll

index 437e3a7..ea5062c 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-ilp -verify-machineinstrs < %s | FileCheck %s
  
  ; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}
  
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll

index eaa30bb..a9e3951 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
@@ -1,6 +1,6 @@
  ; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
  
  ; We expect a two digit VGPR usage here, not a three digit.
  ; CHECK: NumVgprs: {{[0-9][0-9]$}}
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll

index e209f9e..87b2d29 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s
-; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
  
  ; SI-MINREG: NumSgprs: {{[1-9]$}}
  ; SI-MINREG: NumVgprs: {{[1-9]$}}
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll

index e5f08db..d567e15 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
@@ -1,5 +1,5 @@
  ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=MISCHED %s
-; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
  
  ; Test the scheduler when only one wave is requested. The result should be high register usage and max ILP.
author	Austin Kerbow <Austin.Kerbow@amd.com>
	Sat, 30 Jul 2022 14:40:11 +0000 (07:40 -0700)
committer	Austin Kerbow <Austin.Kerbow@amd.com>
	Tue, 2 Aug 2022 20:21:24 +0000 (13:21 -0700)
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/schedule-ilp.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll		patch \| blob \| history