[AMDGPU] Small cleanups in wait counter code

author Stephen Thomas <Stephen.Thomas@amd.com>

Fri, 28 Oct 2022 09:59:03 +0000 (10:59 +0100)

committer Jay Foad <jay.foad@amd.com>

Fri, 28 Oct 2022 10:05:02 +0000 (11:05 +0100)
author Stephen Thomas <Stephen.Thomas@amd.com>
Fri, 28 Oct 2022 09:59:03 +0000 (10:59 +0100)
committer Jay Foad <jay.foad@amd.com>
Fri, 28 Oct 2022 10:05:02 +0000 (11:05 +0100)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

index e760575..81013db 100644 (file)
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1332,6 +1332,12 @@ static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
    return false;
  }
  
+static bool isStoreCountWaitZero(const MachineInstr &I) {
+  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+         !I.getOperand(1).getImm();
+}
+
  bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
    if (!RunLdsBranchVmemWARHazardFixup)
      return false;
@@ -1351,9 +1357,7 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
      return false;
  
    auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
-    return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
-                               I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
-                               !I.getOperand(1).getImm());
+    return IsHazardInst(I) || isStoreCountWaitZero(I);
    };
  
    auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
@@ -1370,9 +1374,7 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
        if (InstType == InstType2)
          return true;
  
-      return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
-             I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
-             !I.getOperand(1).getImm();
+      return isStoreCountWaitZero(I);
      };
  
      return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

index 5b352ac..5e0ae4c 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -211,19 +211,20 @@ public:
      return ScoreUBs[T];
    }
  
+  unsigned getScoreRange(InstCounterType T) const {
+    return getScoreUB(T) - getScoreLB(T);
+  }
+
    // Mapping from event to counter.
    InstCounterType eventCounter(WaitEventType E) {
-    if (WaitEventMaskForInst[VM_CNT] & (1 << E))
-      return VM_CNT;
-    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
-      return LGKM_CNT;
-    if (WaitEventMaskForInst[VS_CNT] & (1 << E))
-      return VS_CNT;
-    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
-    return EXP_CNT;
+    for (auto T : inst_counter_types()) {
+      if (WaitEventMaskForInst[T] & (1 << E))
+        return T;
+    }
+    llvm_unreachable("event type has no associated counter");
    }
  
-  unsigned getRegScore(int GprNo, InstCounterType T) {
+  unsigned getRegScore(int GprNo, InstCounterType T) const {
      if (GprNo < NUM_ALL_VGPRS) {
        return VgprScores[T][GprNo];
      }
@@ -240,21 +241,25 @@ public:
    bool counterOutOfOrder(InstCounterType T) const;
    void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
    void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
-  void determineWait(InstCounterType T, unsigned ScoreToWait,
-                     AMDGPU::Waitcnt &Wait) const;
+  void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
    void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
    void applyWaitcnt(InstCounterType T, unsigned Count);
    void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                       const MachineRegisterInfo *MRI, WaitEventType E,
                       MachineInstr &MI);
  
-  bool hasPending() const { return PendingEvents != 0; }
-  bool hasPendingEvent(WaitEventType E) const {
+  unsigned hasPendingEvent() const { return PendingEvents; }
+  unsigned hasPendingEvent(WaitEventType E) const {
      return PendingEvents & (1 << E);
    }
+  unsigned hasPendingEvent(InstCounterType T) const {
+    unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
+    assert((HasPending != 0) == (getScoreRange(T) != 0));
+    return HasPending;
+  }
  
    bool hasMixedPendingEvents(InstCounterType T) const {
-    unsigned Events = PendingEvents & WaitEventMaskForInst[T];
+    unsigned Events = hasPendingEvent(T);
      // Return true if more than one bit is set in Events.
      return Events & (Events - 1);
    }
@@ -304,11 +309,12 @@ private:
    void setScoreUB(InstCounterType T, unsigned Val) {
      assert(T < NUM_INST_CNTS);
      ScoreUBs[T] = Val;
-    if (T == EXP_CNT) {
-      unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
-      if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
-        ScoreLBs[T] = UB;
-    }
+
+    if (T != EXP_CNT)
+      return;
+
+    if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
+      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
    }
  
    void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
@@ -694,29 +700,30 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
  void WaitcntBrackets::print(raw_ostream &OS) {
    OS << '\n';
    for (auto T : inst_counter_types()) {
-    unsigned LB = getScoreLB(T);
-    unsigned UB = getScoreUB(T);
+    unsigned SR = getScoreRange(T);
  
      switch (T) {
      case VM_CNT:
-      OS << "    VM_CNT(" << UB - LB << "): ";
+      OS << "    VM_CNT(" << SR << "): ";
        break;
      case LGKM_CNT:
-      OS << "    LGKM_CNT(" << UB - LB << "): ";
+      OS << "    LGKM_CNT(" << SR << "): ";
        break;
      case EXP_CNT:
-      OS << "    EXP_CNT(" << UB - LB << "): ";
+      OS << "    EXP_CNT(" << SR << "): ";
        break;
      case VS_CNT:
-      OS << "    VS_CNT(" << UB - LB << "): ";
+      OS << "    VS_CNT(" << SR << "): ";
        break;
      default:
-      OS << "    UNKNOWN(" << UB - LB << "): ";
+      OS << "    UNKNOWN(" << SR << "): ";
        break;
      }
  
-    if (LB < UB) {
+    if (SR != 0) {
        // Print vgpr scores.
+      unsigned LB = getScoreLB(T);
+
        for (int J = 0; J <= VgprUB; J++) {
          unsigned RegScore = getRegScore(J, T);
          if (RegScore <= LB)
@@ -755,18 +762,17 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  
  void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                        unsigned &Count) const {
-  const unsigned LB = getScoreLB(T);
-  const unsigned UB = getScoreUB(T);
-
    // The number of outstanding events for this type, T, can be calculated
    // as (UB - LB). If the current Count is greater than or equal to the number
    // of outstanding events, then the wait for this counter is redundant.
-  if (Count >= UB - LB)
+  if (Count >= getScoreRange(T))
      Count = ~0u;
  }
  
-void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
+void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
                                      AMDGPU::Waitcnt &Wait) const {
+  unsigned ScoreToWait = getRegScore(RegNo, T);
+
    // If the score of src_operand falls within the bracket, we need an
    // s_waitcnt instruction.
    const unsigned LB = getScoreLB(T);
@@ -1106,8 +1112,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
  
          for (int RegNo = CallAddrOpInterval.first;
               RegNo < CallAddrOpInterval.second; ++RegNo)
-          ScoreBrackets.determineWait(
-            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+          ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
  
          int RtnAddrOpIdx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
@@ -1117,8 +1122,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
  
            for (int RegNo = RtnAddrOpInterval.first;
                 RegNo < RtnAddrOpInterval.second; ++RegNo)
-            ScoreBrackets.determineWait(
-              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+            ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
          }
        }
      } else {
@@ -1150,11 +1154,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
            continue;
          unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
          // VM_CNT is only relevant to vgpr or LDS.
-        ScoreBrackets.determineWait(
-            VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+        ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
          if (Memop->isStore()) {
-          ScoreBrackets.determineWait(
-              EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+          ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
          }
        }
  
@@ -1176,17 +1178,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
              if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
                  ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
                                                         getVmemType(MI))) {
-              ScoreBrackets.determineWait(
-                  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+              ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
                ScoreBrackets.clearVgprVmemTypes(RegNo);
              }
              if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
-              ScoreBrackets.determineWait(
-                  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+              ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
              }
            }
-          ScoreBrackets.determineWait(
-              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+          ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
          }
        }
      }
@@ -1205,9 +1204,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
    //       after fixing the scheduler. Also, the Shader Compiler code is
    //       independent of target.
    if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
-    if (ScoreBrackets.getScoreLB(LGKM_CNT) <
-            ScoreBrackets.getScoreUB(LGKM_CNT) &&
-        ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+    if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
        Wait.LgkmCnt = 0;
      }
    }
@@ -1228,9 +1225,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
      Wait.VsCnt = 0;
  
    if (FlushVmCnt) {
-    unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
-    unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
-    if (UB - LB != 0)
+    if (ScoreBrackets.hasPendingEvent(VM_CNT))
        Wait.VmCnt = 0;
    }
  
@@ -1245,9 +1240,7 @@ bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
                                                 MachineInstr *OldWaitcntInstr) {
    AMDGPU::Waitcnt Wait;
  
-  unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
-  unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
-  if (UB - LB == 0)
+  if (!ScoreBrackets.hasPendingEvent(VM_CNT))
      return false;
  
    Wait.VmCnt = 0;
@@ -1603,8 +1596,6 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
          // 2. Restore the correct value of vccz by writing the current value
          //    of vcc back to vcc.
          if (ST->hasReadVCCZBug() &&
-            ScoreBrackets.getScoreLB(LGKM_CNT) <
-                ScoreBrackets.getScoreUB(LGKM_CNT) &&
              ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
            // Writes to vcc while there's an outstanding smem read may get
            // clobbered as soon as any read completes.
@@ -1848,7 +1839,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
        Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
        BI.Dirty = false;
  
-      if (Brackets->hasPending()) {
+      if (Brackets->hasPendingEvent()) {
          BlockInfo *MoveBracketsToSucc = nullptr;
          for (MachineBasicBlock *Succ : BI.MBB->successors()) {
            auto SuccBII = BlockInfos.find(Succ);
author	Stephen Thomas <Stephen.Thomas@amd.com>
	Fri, 28 Oct 2022 09:59:03 +0000 (10:59 +0100)
committer	Jay Foad <jay.foad@amd.com>
	Fri, 28 Oct 2022 10:05:02 +0000 (11:05 +0100)
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp		patch \| blob \| history