"reduction scheduling stage."),
cl::init(false));
-GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
- const MachineSchedContext *C)
+GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
: GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
HasHighPressure(false) {}
-void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
+void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
MF = &DAG->MF;
VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
}
-void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
- bool AtTop, const RegPressureTracker &RPTracker,
+void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
+ bool AtTop,
+ const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
unsigned SGPRPressure,
unsigned VGPRPressure) {
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
-void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
+void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand) {
SGPRPressure, VGPRPressure);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
- GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
+ tryCandidate(Cand, TryCand, ZoneArg);
if (TryCand.Reason != NoCand) {
// Initialize resource delta if needed in case future heuristics query it.
if (TryCand.ResDelta == SchedResourceDelta())
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
-SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = Bot.pickOnlyChoice()) {
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
SchedCandidate Cand = BotCand;
TopCand.Reason = NoCand;
- GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+ tryCandidate(Cand, TopCand, nullptr);
if (TopCand.Reason != NoCand) {
Cand.setBest(TopCand);
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
-SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
if (DAG->top() == DAG->bottom()) {
assert(Top.Available.empty() && Top.Pending.empty() &&
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return SU;
}
+/// Returns the stage ID that CurrentStage points at. Asserts that CurrentStage
+/// is non-null and is not SchedStages.end(), i.e. that advanceStage() has
+/// already selected a stage.
+GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
+ assert(CurrentStage && CurrentStage != SchedStages.end());
+ return *CurrentStage;
+}
+
+/// Steps CurrentStage forward by one entry of SchedStages. While CurrentStage
+/// is still null (no stage selected yet) the step selects SchedStages.begin().
+/// \returns true if CurrentStage designates a stage after the step, false once
+/// it has reached SchedStages.end().
+bool GCNSchedStrategy::advanceStage() {
+  assert(CurrentStage != SchedStages.end());
+  CurrentStage = CurrentStage ? std::next(CurrentStage) : SchedStages.begin();
+  return CurrentStage != SchedStages.end();
+}
+
+/// \returns true if at least one more stage follows the current one in
+/// SchedStages.
+///
+/// CurrentStage must designate a valid stage: it has to be non-null and must
+/// not already be SchedStages.end(), because std::next() on the end iterator
+/// would step outside the vector.
+bool GCNSchedStrategy::hasNextStage() const {
+  assert(CurrentStage && CurrentStage != SchedStages.end());
+  return std::next(CurrentStage) != SchedStages.end();
+}
+
+/// Returns the ID of the stage that follows the current one in SchedStages.
+///
+/// Asserts that a current stage is selected and that a following stage exists.
+/// The explicit end() check comes first so that std::next() is never applied
+/// to the end iterator, which would make the "has a successor" test
+/// meaningless.
+GCNSchedStageID GCNSchedStrategy::getNextStage() const {
+  assert(CurrentStage && CurrentStage != SchedStages.end() &&
+         std::next(CurrentStage) != SchedStages.end());
+  return *std::next(CurrentStage);
+}
+
+/// Builds the max-occupancy strategy and registers its stages in SchedStages.
+/// The stages are appended in this order: initial schedule, unclustered
+/// high-RP reschedule, clustered low-occupancy reschedule, pre-RA
+/// rematerialization.
+GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
+    const MachineSchedContext *C)
+    : GCNSchedStrategy(C) {
+  for (GCNSchedStageID StageID :
+       {GCNSchedStageID::OccInitialSchedule,
+        GCNSchedStageID::UnclusteredHighRPReschedule,
+        GCNSchedStageID::ClusteredLowOccupancyReschedule,
+        GCNSchedStageID::PreRARematerialize})
+    SchedStages.push_back(StageID);
+}
+
+/// Builds the max-ILP strategy. ILPInitialSchedule is the only stage it
+/// registers in SchedStages.
+GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
+ : GCNSchedStrategy(C) {
+ SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
+}
+
+/// Compares \p TryCand against the current best candidate \p Cand by walking
+/// an ordered list of heuristics; the first heuristic that tells the two
+/// apart ends the comparison.
+///
+/// Order of checks: excess register pressure, physical-register bias, then
+/// (only when \p Zone is non-null) stall cycles, critical/demanded resources,
+/// latency and weak edges, followed by clustering, critical-max pressure,
+/// current-max pressure, and finally (again only with a zone) original node
+/// order.
+///
+/// \param Cand    Best candidate found so far; if it is not valid, TryCand is
+///                accepted immediately with reason NodeOrder.
+/// \param TryCand Candidate under test; its Reason field is inspected after
+///                each heuristic and set to NodeOrder by the fallbacks here.
+/// \param Zone    Scheduling boundary, or nullptr; the zone-dependent checks
+///                are skipped when it is null.
+/// \returns true when a heuristic decided and left TryCand.Reason != NoCand,
+///          or when one of the NodeOrder fallbacks picks TryCand; false
+///          otherwise.
+bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ // Avoid spilling by exceeding the register limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Bias PhysReg Defs and copies to their uses and defined respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // The callers in this file pass a null Zone when the two candidates do not
+ // come from the same boundary, so Zone-based queries are guarded by this.
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+
+ // Unconditionally try to reduce latency.
+ if (tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+ getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Keep clustered nodes together to encourage downstream peephole
+ // optimizations which may reduce resource requirements.
+ //
+ // This is a best effort to set things up for a post-RA pass. Optimizations
+ // like generating loads of multiple registers should ideally be done within
+ // the scheduler pass by combining the loads during DAG postprocessing.
+ const SUnit *CandNextClusterSU =
+ Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ const SUnit *TryCandNextClusterSU =
+ TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+ Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max pressure of the entire region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+ Cand, RegMax, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ if (SameBoundary) {
+ // Fall through to original instruction order.
+ if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+ (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+ }
+ return false;
+}
+
GCNScheduleDAGMILive::GCNScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
+/// Creates the stage object that implements \p SchedStageID, bound to this
+/// DAG.
+/// \returns an owning pointer to the newly created stage.
+std::unique_ptr<GCNSchedStage>
+GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
+  switch (SchedStageID) {
+  case GCNSchedStageID::OccInitialSchedule:
+    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::UnclusteredHighRPReschedule:
+    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
+  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
+    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
+  case GCNSchedStageID::PreRARematerialize:
+    return std::make_unique<PreRARematStage>(SchedStageID, *this);
+  case GCNSchedStageID::ILPInitialSchedule:
+    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
+  }
+
+  // The switch covers every enumerator and deliberately has no default label
+  // so that -Wswitch flags newly added stages. Without this marker control
+  // could run off the end of a non-void function (undefined behavior, and a
+  // -Wreturn-type / C4715 warning) for an out-of-range value.
+  llvm_unreachable("Unknown SchedStageID.");
+}
+
void GCNScheduleDAGMILive::schedule() {
// Collect all scheduling regions. The actual scheduling is performed in
// GCNScheduleDAGMILive::finalizeSchedule.
void GCNScheduleDAGMILive::runSchedStages() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
- InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this);
- UnclusteredHighRPStage S1(GCNSchedStageID::UnclusteredHighRPReschedule,
- *this);
- ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule,
- *this);
- PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this);
- GCNSchedStage *SchedStages[] = {&S0, &S1, &S2, &S3};
if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();
- for (auto *Stage : SchedStages) {
+ GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
+ while (S.advanceStage()) {
+ auto Stage = createSchedStage(S.getCurrentStage());
if (!Stage->initGCNSchedStage())
continue;
#ifndef NDEBUG
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
switch (StageID) {
- case GCNSchedStageID::InitialSchedule:
- OS << "Initial Schedule";
+ case GCNSchedStageID::OccInitialSchedule:
+ OS << "Max Occupancy Initial Schedule";
break;
case GCNSchedStageID::UnclusteredHighRPReschedule:
OS << "Unclustered High Register Pressure Reschedule";
case GCNSchedStageID::PreRARematerialize:
OS << "Pre-RA Rematerialize";
break;
+ case GCNSchedStageID::ILPInitialSchedule:
+ OS << "Max ILP Initial Schedule";
+ break;
}
+
return OS;
}
#endif
GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
- : DAG(DAG), S(static_cast<GCNMaxOccupancySchedStrategy &>(*DAG.SchedImpl)),
- MF(DAG.MF), MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
+ : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
+ MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
bool GCNSchedStage::initGCNSchedStage() {
if (!DAG.LIS)
// inbetween the defs and region we sinked the def to. Cached pressure
// for regions where a def is sinked from will also be invalidated. Will
// need to be fixed if there is another pass after this pass.
+ assert(!S.hasNextStage());
collectRematerializableInstructions();
if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
DAG.startBlock(CurrentMBB);
// Get real RP for the region if it hasn't be calculated before. After the
// initial schedule stage real RP will be collected after scheduling.
- if (StageID == GCNSchedStageID::InitialSchedule)
+ if (StageID == GCNSchedStageID::OccInitialSchedule)
DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}
return false;
}
-bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;
return false;
}
+/// Decides whether the ILP initial schedule must be reverted: this is the case
+/// exactly when mayCauseSpilling(\p WavesAfter) returns true.
+bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+  return mayCauseSpilling(WavesAfter);
+}
+
bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
if (WavesAfter <= MFI.getMinWavesPerEU() &&
!PressureAfter.less(ST, PressureBefore) &&
PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
DAG.RescheduleRegions[RegionIdx] =
- (nextStage(StageID)) != GCNSchedStageID::UnclusteredHighRPReschedule;
+ S.hasNextStage() &&
+ S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
DAG.RegionEnd = DAG.RegionBegin;
int SkippedDebugInstr = 0;
for (MachineInstr *MI : Unsched) {
class SIMachineFunctionInfo;
class SIRegisterInfo;
class GCNSubtarget;
+class GCNSchedStage;
+
+/// Identifies one scheduling stage. Each strategy stores the IDs of the stages
+/// it wants to run, and GCNScheduleDAGMILive::createSchedStage() maps an ID to
+/// the stage class implementing it.
+enum class GCNSchedStageID : unsigned {
+ // Stages registered, in this order, by GCNMaxOccupancySchedStrategy.
+ OccInitialSchedule = 0,
+ UnclusteredHighRPReschedule = 1,
+ ClusteredLowOccupancyReschedule = 2,
+ PreRARematerialize = 3,
+ // The only stage registered by GCNMaxILPSchedStrategy.
+ ILPInitialSchedule = 4
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
+#endif
/// This is a minimal scheduler strategy. The main difference between this
/// and the GenericScheduler is that GCNSchedStrategy uses different
-/// heuristics to determine excess/critical pressure sets. Its goal is to
-/// maximize kernel occupancy (i.e. maximum number of waves per simd).
-class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
+/// heuristics to determine excess/critical pressure sets.
+class GCNSchedStrategy : public GenericScheduler {
+protected:
SUnit *pickNodeBidirectional(bool &IsTopNode);
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
MachineFunction *MF;
+ // Scheduling stages for this strategy.
+ SmallVector<GCNSchedStageID, 4> SchedStages;
+
+ // Pointer to the current SchedStageID.
+ SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
+
public:
// schedule() have seen register pressure over the critical limits and had to
// track register pressure for actual scheduling heuristics.
unsigned VGPRCriticalLimit;
- GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+ GCNSchedStrategy(const MachineSchedContext *C);
SUnit *pickNode(bool &IsTopNode) override;
unsigned getTargetOccupancy() { return TargetOccupancy; }
void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
-};
-enum class GCNSchedStageID : unsigned {
- InitialSchedule = 0,
- UnclusteredHighRPReschedule = 1,
- ClusteredLowOccupancyReschedule = 2,
- PreRARematerialize = 3,
- LastStage = PreRARematerialize
-};
+ GCNSchedStageID getCurrentStage();
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
-#endif
+ // Advances stage. Returns true if there are remaining stages.
+ bool advanceStage();
-inline GCNSchedStageID &operator++(GCNSchedStageID &Stage, int) {
- assert(Stage != GCNSchedStageID::PreRARematerialize);
- Stage = static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
- return Stage;
-}
+ bool hasNextStage() const;
+
+ GCNSchedStageID getNextStage() const;
+};
-inline GCNSchedStageID nextStage(const GCNSchedStageID Stage) {
- return static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
-}
+/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
+/// maximum number of waves per simd).
+class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
+public:
+ // Registers the occupancy-oriented stages (initial schedule, unclustered
+ // high-RP reschedule, clustered low-occupancy reschedule, pre-RA
+ // rematerialization) in SchedStages.
+ GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+};
-inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) {
- return static_cast<unsigned>(LHS) > static_cast<unsigned>(RHS);
-}
+/// The goal of this scheduling strategy is to maximize ILP for a single wave
+/// (i.e. latency hiding).
+class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
+protected:
+ // Candidate comparison used by this strategy in place of the inherited one.
+ // Returns true if TryCand should replace Cand; Zone may be null, in which
+ // case the zone-dependent heuristics are skipped.
+ bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const override;
+
+public:
+ // Registers ILPInitialSchedule as the single stage of this strategy.
+ GCNMaxILPSchedStrategy(const MachineSchedContext *C);
+};
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
- friend class InitialScheduleStage;
+ friend class OccInitialScheduleStage;
friend class UnclusteredHighRPStage;
friend class ClusteredLowOccStage;
friend class PreRARematStage;
+ friend class ILPInitialScheduleStage;
const GCNSubtarget &ST;
void runSchedStages();
+ std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
+
public:
GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S);
protected:
GCNScheduleDAGMILive &DAG;
- GCNMaxOccupancySchedStrategy &S;
+ GCNSchedStrategy &S;
MachineFunction &MF;
virtual ~GCNSchedStage() = default;
};
-class InitialScheduleStage : public GCNSchedStage {
+class OccInitialScheduleStage : public GCNSchedStage {
public:
bool shouldRevertScheduling(unsigned WavesAfter) override;
- InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+ OccInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};
: GCNSchedStage(StageID, DAG) {}
};
+/// Stage created for GCNSchedStageID::ILPInitialSchedule, the stage registered
+/// by GCNMaxILPSchedStrategy.
+class ILPInitialScheduleStage : public GCNSchedStage {
+public:
+ // Returns true when the schedule produced by this stage should be reverted.
+ bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+ ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+ : GCNSchedStage(StageID, DAG) {}
+};
+
} // End namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H