return ScoreUBs[T];
}
+ unsigned getScoreRange(InstCounterType T) const {
+ return getScoreUB(T) - getScoreLB(T);
+ }
+
// Mapping from event to counter.
InstCounterType eventCounter(WaitEventType E) {
- if (WaitEventMaskForInst[VM_CNT] & (1 << E))
- return VM_CNT;
- if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
- return LGKM_CNT;
- if (WaitEventMaskForInst[VS_CNT] & (1 << E))
- return VS_CNT;
- assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
- return EXP_CNT;
+ for (auto T : inst_counter_types()) {
+ if (WaitEventMaskForInst[T] & (1 << E))
+ return T;
+ }
+ llvm_unreachable("event type has no associated counter");
}
- unsigned getRegScore(int GprNo, InstCounterType T) {
+ unsigned getRegScore(int GprNo, InstCounterType T) const {
if (GprNo < NUM_ALL_VGPRS) {
return VgprScores[T][GprNo];
}
bool counterOutOfOrder(InstCounterType T) const;
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
- void determineWait(InstCounterType T, unsigned ScoreToWait,
- AMDGPU::Waitcnt &Wait) const;
+ void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, WaitEventType E,
MachineInstr &MI);
- bool hasPending() const { return PendingEvents != 0; }
- bool hasPendingEvent(WaitEventType E) const {
+ unsigned hasPendingEvent() const { return PendingEvents; }
+ unsigned hasPendingEvent(WaitEventType E) const {
return PendingEvents & (1 << E);
}
+ unsigned hasPendingEvent(InstCounterType T) const {
+ unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
+ assert((HasPending != 0) == (getScoreRange(T) != 0));
+ return HasPending;
+ }
bool hasMixedPendingEvents(InstCounterType T) const {
- unsigned Events = PendingEvents & WaitEventMaskForInst[T];
+ unsigned Events = hasPendingEvent(T);
// Return true if more than one bit is set in Events.
return Events & (Events - 1);
}
void setScoreUB(InstCounterType T, unsigned Val) {
assert(T < NUM_INST_CNTS);
ScoreUBs[T] = Val;
- if (T == EXP_CNT) {
- unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
- if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
- ScoreLBs[T] = UB;
- }
+
+ if (T != EXP_CNT)
+ return;
+
+ if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
+ ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
}
void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
void WaitcntBrackets::print(raw_ostream &OS) {
OS << '\n';
for (auto T : inst_counter_types()) {
- unsigned LB = getScoreLB(T);
- unsigned UB = getScoreUB(T);
+ unsigned SR = getScoreRange(T);
switch (T) {
case VM_CNT:
- OS << " VM_CNT(" << UB - LB << "): ";
+ OS << " VM_CNT(" << SR << "): ";
break;
case LGKM_CNT:
- OS << " LGKM_CNT(" << UB - LB << "): ";
+ OS << " LGKM_CNT(" << SR << "): ";
break;
case EXP_CNT:
- OS << " EXP_CNT(" << UB - LB << "): ";
+ OS << " EXP_CNT(" << SR << "): ";
break;
case VS_CNT:
- OS << " VS_CNT(" << UB - LB << "): ";
+ OS << " VS_CNT(" << SR << "): ";
break;
default:
- OS << " UNKNOWN(" << UB - LB << "): ";
+ OS << " UNKNOWN(" << SR << "): ";
break;
}
- if (LB < UB) {
+ if (SR != 0) {
// Print vgpr scores.
+ unsigned LB = getScoreLB(T);
+
for (int J = 0; J <= VgprUB; J++) {
unsigned RegScore = getRegScore(J, T);
if (RegScore <= LB)
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
unsigned &Count) const {
- const unsigned LB = getScoreLB(T);
- const unsigned UB = getScoreUB(T);
-
// The number of outstanding events for this type, T, can be calculated
// as (UB - LB). If the current Count is greater than or equal to the number
// of outstanding events, then the wait for this counter is redundant.
- if (Count >= UB - LB)
+ if (Count >= getScoreRange(T))
Count = ~0u;
}
-void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
+void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
AMDGPU::Waitcnt &Wait) const {
+ unsigned ScoreToWait = getRegScore(RegNo, T);
+
// If the score of src_operand falls within the bracket, we need an
// s_waitcnt instruction.
const unsigned LB = getScoreLB(T);
for (int RegNo = CallAddrOpInterval.first;
RegNo < CallAddrOpInterval.second; ++RegNo)
- ScoreBrackets.determineWait(
- LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+ ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
int RtnAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
for (int RegNo = RtnAddrOpInterval.first;
RegNo < RtnAddrOpInterval.second; ++RegNo)
- ScoreBrackets.determineWait(
- LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+ ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
}
}
} else {
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
// VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
if (Memop->isStore()) {
- ScoreBrackets.determineWait(
- EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
}
}
if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
getVmemType(MI))) {
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
ScoreBrackets.clearVgprVmemTypes(RegNo);
}
if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
- ScoreBrackets.determineWait(
- EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
}
}
- ScoreBrackets.determineWait(
- LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+ ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
}
}
}
// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.
if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
- if (ScoreBrackets.getScoreLB(LGKM_CNT) <
- ScoreBrackets.getScoreUB(LGKM_CNT) &&
- ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+ if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
Wait.LgkmCnt = 0;
}
}
Wait.VsCnt = 0;
if (FlushVmCnt) {
- unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
- unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
- if (UB - LB != 0)
+ if (ScoreBrackets.hasPendingEvent(VM_CNT))
Wait.VmCnt = 0;
}
MachineInstr *OldWaitcntInstr) {
AMDGPU::Waitcnt Wait;
- unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
- unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
- if (UB - LB == 0)
+ if (!ScoreBrackets.hasPendingEvent(VM_CNT))
return false;
Wait.VmCnt = 0;
// 2. Restore the correct value of vccz by writing the current value
// of vcc back to vcc.
if (ST->hasReadVCCZBug() &&
- ScoreBrackets.getScoreLB(LGKM_CNT) <
- ScoreBrackets.getScoreUB(LGKM_CNT) &&
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
// Writes to vcc while there's an outstanding smem read may get
// clobbered as soon as any read completes.
Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
BI.Dirty = false;
- if (Brackets->hasPending()) {
+ if (Brackets->hasPendingEvent()) {
BlockInfo *MoveBracketsToSucc = nullptr;
for (MachineBasicBlock *Succ : BI.MBB->successors()) {
auto SuccBII = BlockInfos.find(Succ);