From: Stephen Thomas Date: Fri, 28 Oct 2022 09:59:03 +0000 (+0100) Subject: [AMDGPU] Small cleanups in wait counter code X-Git-Tag: upstream/17.0.6~29226 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c8a90316fa75f57b3789c9346d482b379f628c09;p=platform%2Fupstream%2Fllvm.git [AMDGPU] Small cleanups in wait counter code A small number of cleanups and refactors intended to enhance readability in two passes that deal with s_waitcnt instructions. Differential Revision: https://reviews.llvm.org/D136677 --- diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index e760575..81013db 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1332,6 +1332,12 @@ static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, return false; } +static bool isStoreCountWaitZero(const MachineInstr &I) { + return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && + I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && + !I.getOperand(1).getImm(); +} + bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { if (!RunLdsBranchVmemWARHazardFixup) return false; @@ -1351,9 +1357,7 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { return false; auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { - return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && - I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && - !I.getOperand(1).getImm()); + return IsHazardInst(I) || isStoreCountWaitZero(I); }; auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { @@ -1370,9 +1374,7 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { if (InstType == InstType2) return true; - return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && - I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && - !I.getOperand(1).getImm(); + return isStoreCountWaitZero(I); }; return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) != diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 5b352ac..5e0ae4c 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -211,19 +211,20 @@ public: return ScoreUBs[T]; } + unsigned getScoreRange(InstCounterType T) const { + return getScoreUB(T) - getScoreLB(T); + } + // Mapping from event to counter. InstCounterType eventCounter(WaitEventType E) { - if (WaitEventMaskForInst[VM_CNT] & (1 << E)) - return VM_CNT; - if (WaitEventMaskForInst[LGKM_CNT] & (1 << E)) - return LGKM_CNT; - if (WaitEventMaskForInst[VS_CNT] & (1 << E)) - return VS_CNT; - assert(WaitEventMaskForInst[EXP_CNT] & (1 << E)); - return EXP_CNT; + for (auto T : inst_counter_types()) { + if (WaitEventMaskForInst[T] & (1 << E)) + return T; + } + llvm_unreachable("event type has no associated counter"); } - unsigned getRegScore(int GprNo, InstCounterType T) { + unsigned getRegScore(int GprNo, InstCounterType T) const { if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; } @@ -240,21 +241,25 @@ public: bool counterOutOfOrder(InstCounterType T) const; void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - void determineWait(InstCounterType T, unsigned ScoreToWait, - AMDGPU::Waitcnt &Wait) const; + void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const; void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &MI); - bool hasPending() const { return PendingEvents != 0; } - bool hasPendingEvent(WaitEventType E) const { + unsigned hasPendingEvent() const { return PendingEvents; } + unsigned hasPendingEvent(WaitEventType E) const { return PendingEvents & (1 << E); } + unsigned hasPendingEvent(InstCounterType T) const { + unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; + assert((HasPending != 0) == (getScoreRange(T) != 0)); + return HasPending; + } bool hasMixedPendingEvents(InstCounterType T) const { - unsigned Events = PendingEvents & WaitEventMaskForInst[T]; + unsigned Events = hasPendingEvent(T); // Return true if more than one bit is set in Events. return Events & (Events - 1); } @@ -304,11 +309,12 @@ private: void setScoreUB(InstCounterType T, unsigned Val) { assert(T < NUM_INST_CNTS); ScoreUBs[T] = Val; - if (T == EXP_CNT) { - unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); - if (ScoreLBs[T] < UB && UB < ScoreUBs[T]) - ScoreLBs[T] = UB; - } + + if (T != EXP_CNT) + return; + + if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); } void setRegScore(int GprNo, InstCounterType T, unsigned Val) { @@ -694,29 +700,30 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, void WaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; for (auto T : inst_counter_types()) { - unsigned LB = getScoreLB(T); - unsigned UB = getScoreUB(T); + unsigned SR = getScoreRange(T); switch (T) { case VM_CNT: - OS << " VM_CNT(" << UB - LB << "): "; + OS << " VM_CNT(" << SR << "): "; break; case LGKM_CNT: - OS << " LGKM_CNT(" << UB - LB << "): "; + OS << " LGKM_CNT(" << SR << "): "; break; case EXP_CNT: - OS << " EXP_CNT(" << UB - LB << "): "; + OS << " EXP_CNT(" << SR << "): "; break; case VS_CNT: - OS << " VS_CNT(" << UB - LB << "): "; + OS << " VS_CNT(" << SR << "): "; break; default: - OS << " UNKNOWN(" << UB - LB << "): "; + OS << " UNKNOWN(" << SR << "): "; break; } - if (LB < UB) { + if (SR != 0) { // Print vgpr scores. + unsigned LB = getScoreLB(T); + for (int J = 0; J <= VgprUB; J++) { unsigned RegScore = getRegScore(J, T); if (RegScore <= LB) @@ -755,18 +762,17 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, unsigned &Count) const { - const unsigned LB = getScoreLB(T); - const unsigned UB = getScoreUB(T); - // The number of outstanding events for this type, T, can be calculated // as (UB - LB). If the current Count is greater than or equal to the number // of outstanding events, then the wait for this counter is redundant. - if (Count >= UB - LB) + if (Count >= getScoreRange(T)) Count = ~0u; } -void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait, +void WaitcntBrackets::determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const { + unsigned ScoreToWait = getRegScore(RegNo, T); + // If the score of src_operand falls within the bracket, we need an // s_waitcnt instruction. const unsigned LB = getScoreLB(T); @@ -1106,8 +1112,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, for (int RegNo = CallAddrOpInterval.first; RegNo < CallAddrOpInterval.second; ++RegNo) - ScoreBrackets.determineWait( - LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); int RtnAddrOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); @@ -1117,8 +1122,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, for (int RegNo = RtnAddrOpInterval.first; RegNo < RtnAddrOpInterval.second; ++RegNo) - ScoreBrackets.determineWait( - LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); } } } else { @@ -1150,11 +1154,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); if (Memop->isStore()) { - ScoreBrackets.determineWait( - EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); + ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } } @@ -1176,17 +1178,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isUse() || !SIInstrInfo::isVMEM(MI) || ScoreBrackets.hasOtherPendingVmemTypes(RegNo, getVmemType(MI))) { - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); ScoreBrackets.clearVgprVmemTypes(RegNo); } if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { - ScoreBrackets.determineWait( - EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); + ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } } - ScoreBrackets.determineWait( - LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); } } } @@ -1205,9 +1204,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { - if (ScoreBrackets.getScoreLB(LGKM_CNT) < - ScoreBrackets.getScoreUB(LGKM_CNT) && - ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { Wait.LgkmCnt = 0; } } @@ -1228,9 +1225,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait.VsCnt = 0; if (FlushVmCnt) { - unsigned UB = ScoreBrackets.getScoreUB(VM_CNT); - unsigned LB = ScoreBrackets.getScoreLB(VM_CNT); - if (UB - LB != 0) + if (ScoreBrackets.hasPendingEvent(VM_CNT)) Wait.VmCnt = 0; } @@ -1245,9 +1240,7 @@ bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block, MachineInstr *OldWaitcntInstr) { AMDGPU::Waitcnt Wait; - unsigned UB = ScoreBrackets.getScoreUB(VM_CNT); - unsigned LB = ScoreBrackets.getScoreLB(VM_CNT); - if (UB - LB == 0) + if (!ScoreBrackets.hasPendingEvent(VM_CNT)) return false; Wait.VmCnt = 0; @@ -1603,8 +1596,6 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // 2. Restore the correct value of vccz by writing the current value // of vcc back to vcc. if (ST->hasReadVCCZBug() && - ScoreBrackets.getScoreLB(LGKM_CNT) < - ScoreBrackets.getScoreUB(LGKM_CNT) && ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { // Writes to vcc while there's an outstanding smem read may get // clobbered as soon as any read completes. @@ -1848,7 +1839,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets); BI.Dirty = false; - if (Brackets->hasPending()) { + if (Brackets->hasPendingEvent()) { BlockInfo *MoveBracketsToSucc = nullptr; for (MachineBasicBlock *Succ : BI.MBB->successors()) { auto SuccBII = BlockInfos.find(Succ);